Merge trunk version 204345 into gupc branch.
[official-gcc.git] / gcc / config / i386 / i386.c
blob: e9b5173305b9b64ba700d5d94588738460f721e9
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2013 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "tm_p.h"
27 #include "regs.h"
28 #include "hard-reg-set.h"
29 #include "insn-config.h"
30 #include "conditions.h"
31 #include "output.h"
32 #include "insn-codes.h"
33 #include "insn-attr.h"
34 #include "flags.h"
35 #include "except.h"
36 #include "function.h"
37 #include "recog.h"
38 #include "expr.h"
39 #include "optabs.h"
40 #include "diagnostic-core.h"
41 #include "toplev.h"
42 #include "basic-block.h"
43 #include "ggc.h"
44 #include "target.h"
45 #include "target-def.h"
46 #include "common/common-target.h"
47 #include "langhooks.h"
48 #include "reload.h"
49 #include "cgraph.h"
50 #include "gimple.h"
51 #include "dwarf2.h"
52 #include "df.h"
53 #include "tm-constrs.h"
54 #include "params.h"
55 #include "cselib.h"
56 #include "debug.h"
57 #include "sched-int.h"
58 #include "sbitmap.h"
59 #include "fibheap.h"
60 #include "opts.h"
61 #include "diagnostic.h"
62 #include "dumpfile.h"
63 #include "tree-pass.h"
64 #include "context.h"
65 #include "pass_manager.h"
67 static rtx legitimize_dllimport_symbol (rtx, bool);
68 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
69 static rtx legitimize_pe_coff_symbol (rtx, bool);
71 #ifndef CHECK_STACK_LIMIT
72 #define CHECK_STACK_LIMIT (-1)
73 #endif
75 /* Return index of given mode in mult and division cost tables. */
76 #define MODE_INDEX(mode) \
77 ((mode) == QImode ? 0 \
78 : (mode) == HImode ? 1 \
79 : (mode) == SImode ? 2 \
80 : (mode) == DImode ? 3 \
81 : 4)
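/* Editor's note: the sketch below is illustrative and is not part of the
   original file.  It shows how MODE_INDEX is meant to be used: it maps an
   integer mode onto the 5-element mult_init[] / divide[] arrays of struct
   processor_costs (field names as declared in i386.h), with index 4
   ("other") covering any remaining mode.  The helper name is hypothetical.  */
static inline int
example_mult_start_cost (const struct processor_costs *cost,
                         enum machine_mode mode)
{
  /* QImode -> 0, HImode -> 1, SImode -> 2, DImode -> 3, anything else -> 4.  */
  return cost->mult_init[MODE_INDEX (mode)];
}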
83 /* Processor costs (relative to an add) */
84 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
85 #define COSTS_N_BYTES(N) ((N) * 2)
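/* Editor's note (not in the original file): a quick worked example of the
   scale above.  With COSTS_N_INSNS (N) == (N) * 4, a one-insn operation
   costs 4 units, and COSTS_N_BYTES (2) == 4, so a two-byte addition lands
   on the same scale; that is what lets the size-tuning table below reuse
   the processor_costs layout.  */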
87 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
89 static stringop_algs ix86_size_memcpy[2] = {
90 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
91 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
92 static stringop_algs ix86_size_memset[2] = {
93 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
94 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
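/* Editor's note: the sketch below is illustrative and is not part of the
   original file.  A stringop_algs pair describes the block-operation
   strategy: element [0] is consulted for 32-bit code, element [1] for
   64-bit code (DUMMY_STRINGOP_ALGS fills the 64-bit slot for CPUs where it
   is never used).  Each {max, alg, noalign} triple means "use ALG for
   blocks of at most MAX bytes"; max == -1 terminates the list and covers
   all larger sizes.  The reader below is a hypothetical illustration of
   that layout (field names as declared in i386.h), not the logic GCC
   itself uses to pick an algorithm.  */
static inline enum stringop_alg
example_pick_stringop_alg (const stringop_algs *algs, HOST_WIDE_INT count)
{
  const stringop_algs *table = &algs[TARGET_64BIT ? 1 : 0];
  unsigned int i;

  for (i = 0; i < MAX_STRINGOP_ALGS; i++)
    if (table->size[i].max == -1 || count <= table->size[i].max)
      return table->size[i].alg;
  return libcall;
}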
96 const
97 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
98 COSTS_N_BYTES (2), /* cost of an add instruction */
99 COSTS_N_BYTES (3), /* cost of a lea instruction */
100 COSTS_N_BYTES (2), /* variable shift costs */
101 COSTS_N_BYTES (3), /* constant shift costs */
102 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
103 COSTS_N_BYTES (3), /* HI */
104 COSTS_N_BYTES (3), /* SI */
105 COSTS_N_BYTES (3), /* DI */
106 COSTS_N_BYTES (5)}, /* other */
107 0, /* cost of multiply per each bit set */
108 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
109 COSTS_N_BYTES (3), /* HI */
110 COSTS_N_BYTES (3), /* SI */
111 COSTS_N_BYTES (3), /* DI */
112 COSTS_N_BYTES (5)}, /* other */
113 COSTS_N_BYTES (3), /* cost of movsx */
114 COSTS_N_BYTES (3), /* cost of movzx */
115 0, /* "large" insn */
116 2, /* MOVE_RATIO */
117 2, /* cost for loading QImode using movzbl */
118 {2, 2, 2}, /* cost of loading integer registers
119 in QImode, HImode and SImode.
120 Relative to reg-reg move (2). */
121 {2, 2, 2}, /* cost of storing integer registers */
122 2, /* cost of reg,reg fld/fst */
123 {2, 2, 2}, /* cost of loading fp registers
124 in SFmode, DFmode and XFmode */
125 {2, 2, 2}, /* cost of storing fp registers
126 in SFmode, DFmode and XFmode */
127 3, /* cost of moving MMX register */
128 {3, 3}, /* cost of loading MMX registers
129 in SImode and DImode */
130 {3, 3}, /* cost of storing MMX registers
131 in SImode and DImode */
132 3, /* cost of moving SSE register */
133 {3, 3, 3}, /* cost of loading SSE registers
134 in SImode, DImode and TImode */
135 {3, 3, 3}, /* cost of storing SSE registers
136 in SImode, DImode and TImode */
137 3, /* MMX or SSE register to integer */
138 0, /* size of l1 cache */
139 0, /* size of l2 cache */
140 0, /* size of prefetch block */
141 0, /* number of parallel prefetches */
142 2, /* Branch cost */
143 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
144 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
145 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
146 COSTS_N_BYTES (2), /* cost of FABS instruction. */
147 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
148 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
149 ix86_size_memcpy,
150 ix86_size_memset,
151 1, /* scalar_stmt_cost. */
152 1, /* scalar load_cost. */
153 1, /* scalar_store_cost. */
154 1, /* vec_stmt_cost. */
155 1, /* vec_to_scalar_cost. */
156 1, /* scalar_to_vec_cost. */
157 1, /* vec_align_load_cost. */
158 1, /* vec_unalign_load_cost. */
159 1, /* vec_store_cost. */
160 1, /* cond_taken_branch_cost. */
161 1, /* cond_not_taken_branch_cost. */
162 };
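/* Editor's note: simplified sketch, not part of the original file.  The
   ix86_size_cost table above is the one the backend switches to when
   optimizing for size; roughly speaking, the option-override code does
   something along these lines (the real logic lives in
   ix86_option_override_internal and is more involved):

     if (optimize_size)
       ix86_cost = &ix86_size_cost;
     else
       ix86_cost = ix86_tune_cost;
*/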
164 /* Processor costs (relative to an add) */
165 static stringop_algs i386_memcpy[2] = {
166 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
167 DUMMY_STRINGOP_ALGS};
168 static stringop_algs i386_memset[2] = {
169 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
170 DUMMY_STRINGOP_ALGS};
172 static const
173 struct processor_costs i386_cost = { /* 386 specific costs */
174 COSTS_N_INSNS (1), /* cost of an add instruction */
175 COSTS_N_INSNS (1), /* cost of a lea instruction */
176 COSTS_N_INSNS (3), /* variable shift costs */
177 COSTS_N_INSNS (2), /* constant shift costs */
178 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
179 COSTS_N_INSNS (6), /* HI */
180 COSTS_N_INSNS (6), /* SI */
181 COSTS_N_INSNS (6), /* DI */
182 COSTS_N_INSNS (6)}, /* other */
183 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
184 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
185 COSTS_N_INSNS (23), /* HI */
186 COSTS_N_INSNS (23), /* SI */
187 COSTS_N_INSNS (23), /* DI */
188 COSTS_N_INSNS (23)}, /* other */
189 COSTS_N_INSNS (3), /* cost of movsx */
190 COSTS_N_INSNS (2), /* cost of movzx */
191 15, /* "large" insn */
192 3, /* MOVE_RATIO */
193 4, /* cost for loading QImode using movzbl */
194 {2, 4, 2}, /* cost of loading integer registers
195 in QImode, HImode and SImode.
196 Relative to reg-reg move (2). */
197 {2, 4, 2}, /* cost of storing integer registers */
198 2, /* cost of reg,reg fld/fst */
199 {8, 8, 8}, /* cost of loading fp registers
200 in SFmode, DFmode and XFmode */
201 {8, 8, 8}, /* cost of storing fp registers
202 in SFmode, DFmode and XFmode */
203 2, /* cost of moving MMX register */
204 {4, 8}, /* cost of loading MMX registers
205 in SImode and DImode */
206 {4, 8}, /* cost of storing MMX registers
207 in SImode and DImode */
208 2, /* cost of moving SSE register */
209 {4, 8, 16}, /* cost of loading SSE registers
210 in SImode, DImode and TImode */
211 {4, 8, 16}, /* cost of storing SSE registers
212 in SImode, DImode and TImode */
213 3, /* MMX or SSE register to integer */
214 0, /* size of l1 cache */
215 0, /* size of l2 cache */
216 0, /* size of prefetch block */
217 0, /* number of parallel prefetches */
218 1, /* Branch cost */
219 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
220 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
221 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
222 COSTS_N_INSNS (22), /* cost of FABS instruction. */
223 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
224 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
225 i386_memcpy,
226 i386_memset,
227 1, /* scalar_stmt_cost. */
228 1, /* scalar load_cost. */
229 1, /* scalar_store_cost. */
230 1, /* vec_stmt_cost. */
231 1, /* vec_to_scalar_cost. */
232 1, /* scalar_to_vec_cost. */
233 1, /* vec_align_load_cost. */
234 2, /* vec_unalign_load_cost. */
235 1, /* vec_store_cost. */
236 3, /* cond_taken_branch_cost. */
237 1, /* cond_not_taken_branch_cost. */
238 };
240 static stringop_algs i486_memcpy[2] = {
241 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
242 DUMMY_STRINGOP_ALGS};
243 static stringop_algs i486_memset[2] = {
244 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
245 DUMMY_STRINGOP_ALGS};
247 static const
248 struct processor_costs i486_cost = { /* 486 specific costs */
249 COSTS_N_INSNS (1), /* cost of an add instruction */
250 COSTS_N_INSNS (1), /* cost of a lea instruction */
251 COSTS_N_INSNS (3), /* variable shift costs */
252 COSTS_N_INSNS (2), /* constant shift costs */
253 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
254 COSTS_N_INSNS (12), /* HI */
255 COSTS_N_INSNS (12), /* SI */
256 COSTS_N_INSNS (12), /* DI */
257 COSTS_N_INSNS (12)}, /* other */
258 1, /* cost of multiply per each bit set */
259 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
260 COSTS_N_INSNS (40), /* HI */
261 COSTS_N_INSNS (40), /* SI */
262 COSTS_N_INSNS (40), /* DI */
263 COSTS_N_INSNS (40)}, /* other */
264 COSTS_N_INSNS (3), /* cost of movsx */
265 COSTS_N_INSNS (2), /* cost of movzx */
266 15, /* "large" insn */
267 3, /* MOVE_RATIO */
268 4, /* cost for loading QImode using movzbl */
269 {2, 4, 2}, /* cost of loading integer registers
270 in QImode, HImode and SImode.
271 Relative to reg-reg move (2). */
272 {2, 4, 2}, /* cost of storing integer registers */
273 2, /* cost of reg,reg fld/fst */
274 {8, 8, 8}, /* cost of loading fp registers
275 in SFmode, DFmode and XFmode */
276 {8, 8, 8}, /* cost of storing fp registers
277 in SFmode, DFmode and XFmode */
278 2, /* cost of moving MMX register */
279 {4, 8}, /* cost of loading MMX registers
280 in SImode and DImode */
281 {4, 8}, /* cost of storing MMX registers
282 in SImode and DImode */
283 2, /* cost of moving SSE register */
284 {4, 8, 16}, /* cost of loading SSE registers
285 in SImode, DImode and TImode */
286 {4, 8, 16}, /* cost of storing SSE registers
287 in SImode, DImode and TImode */
288 3, /* MMX or SSE register to integer */
289 4, /* size of l1 cache. 486 has 8kB cache
290 shared for code and data, so 4kB is
291 not really precise. */
292 4, /* size of l2 cache */
293 0, /* size of prefetch block */
294 0, /* number of parallel prefetches */
295 1, /* Branch cost */
296 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
297 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
298 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
299 COSTS_N_INSNS (3), /* cost of FABS instruction. */
300 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
301 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
302 i486_memcpy,
303 i486_memset,
304 1, /* scalar_stmt_cost. */
305 1, /* scalar load_cost. */
306 1, /* scalar_store_cost. */
307 1, /* vec_stmt_cost. */
308 1, /* vec_to_scalar_cost. */
309 1, /* scalar_to_vec_cost. */
310 1, /* vec_align_load_cost. */
311 2, /* vec_unalign_load_cost. */
312 1, /* vec_store_cost. */
313 3, /* cond_taken_branch_cost. */
314 1, /* cond_not_taken_branch_cost. */
315 };
317 static stringop_algs pentium_memcpy[2] = {
318 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
319 DUMMY_STRINGOP_ALGS};
320 static stringop_algs pentium_memset[2] = {
321 {libcall, {{-1, rep_prefix_4_byte, false}}},
322 DUMMY_STRINGOP_ALGS};
324 static const
325 struct processor_costs pentium_cost = {
326 COSTS_N_INSNS (1), /* cost of an add instruction */
327 COSTS_N_INSNS (1), /* cost of a lea instruction */
328 COSTS_N_INSNS (4), /* variable shift costs */
329 COSTS_N_INSNS (1), /* constant shift costs */
330 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
331 COSTS_N_INSNS (11), /* HI */
332 COSTS_N_INSNS (11), /* SI */
333 COSTS_N_INSNS (11), /* DI */
334 COSTS_N_INSNS (11)}, /* other */
335 0, /* cost of multiply per each bit set */
336 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
337 COSTS_N_INSNS (25), /* HI */
338 COSTS_N_INSNS (25), /* SI */
339 COSTS_N_INSNS (25), /* DI */
340 COSTS_N_INSNS (25)}, /* other */
341 COSTS_N_INSNS (3), /* cost of movsx */
342 COSTS_N_INSNS (2), /* cost of movzx */
343 8, /* "large" insn */
344 6, /* MOVE_RATIO */
345 6, /* cost for loading QImode using movzbl */
346 {2, 4, 2}, /* cost of loading integer registers
347 in QImode, HImode and SImode.
348 Relative to reg-reg move (2). */
349 {2, 4, 2}, /* cost of storing integer registers */
350 2, /* cost of reg,reg fld/fst */
351 {2, 2, 6}, /* cost of loading fp registers
352 in SFmode, DFmode and XFmode */
353 {4, 4, 6}, /* cost of storing fp registers
354 in SFmode, DFmode and XFmode */
355 8, /* cost of moving MMX register */
356 {8, 8}, /* cost of loading MMX registers
357 in SImode and DImode */
358 {8, 8}, /* cost of storing MMX registers
359 in SImode and DImode */
360 2, /* cost of moving SSE register */
361 {4, 8, 16}, /* cost of loading SSE registers
362 in SImode, DImode and TImode */
363 {4, 8, 16}, /* cost of storing SSE registers
364 in SImode, DImode and TImode */
365 3, /* MMX or SSE register to integer */
366 8, /* size of l1 cache. */
367 8, /* size of l2 cache */
368 0, /* size of prefetch block */
369 0, /* number of parallel prefetches */
370 2, /* Branch cost */
371 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
372 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
373 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
374 COSTS_N_INSNS (1), /* cost of FABS instruction. */
375 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
376 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
377 pentium_memcpy,
378 pentium_memset,
379 1, /* scalar_stmt_cost. */
380 1, /* scalar load_cost. */
381 1, /* scalar_store_cost. */
382 1, /* vec_stmt_cost. */
383 1, /* vec_to_scalar_cost. */
384 1, /* scalar_to_vec_cost. */
385 1, /* vec_align_load_cost. */
386 2, /* vec_unalign_load_cost. */
387 1, /* vec_store_cost. */
388 3, /* cond_taken_branch_cost. */
389 1, /* cond_not_taken_branch_cost. */
390 };
392 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
393 (we ensure the alignment). For small blocks an inline loop is still a
394 noticeable win; for bigger blocks either rep movsl or rep movsb is the
395 way to go. Rep movsb apparently has a more expensive startup time in the
396 CPU, but after 4K the difference is down in the noise. */
397 static stringop_algs pentiumpro_memcpy[2] = {
398 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
399 {8192, rep_prefix_4_byte, false},
400 {-1, rep_prefix_1_byte, false}}},
401 DUMMY_STRINGOP_ALGS};
402 static stringop_algs pentiumpro_memset[2] = {
403 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
404 {8192, rep_prefix_4_byte, false},
405 {-1, libcall, false}}},
406 DUMMY_STRINGOP_ALGS};
407 static const
408 struct processor_costs pentiumpro_cost = {
409 COSTS_N_INSNS (1), /* cost of an add instruction */
410 COSTS_N_INSNS (1), /* cost of a lea instruction */
411 COSTS_N_INSNS (1), /* variable shift costs */
412 COSTS_N_INSNS (1), /* constant shift costs */
413 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
414 COSTS_N_INSNS (4), /* HI */
415 COSTS_N_INSNS (4), /* SI */
416 COSTS_N_INSNS (4), /* DI */
417 COSTS_N_INSNS (4)}, /* other */
418 0, /* cost of multiply per each bit set */
419 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
420 COSTS_N_INSNS (17), /* HI */
421 COSTS_N_INSNS (17), /* SI */
422 COSTS_N_INSNS (17), /* DI */
423 COSTS_N_INSNS (17)}, /* other */
424 COSTS_N_INSNS (1), /* cost of movsx */
425 COSTS_N_INSNS (1), /* cost of movzx */
426 8, /* "large" insn */
427 6, /* MOVE_RATIO */
428 2, /* cost for loading QImode using movzbl */
429 {4, 4, 4}, /* cost of loading integer registers
430 in QImode, HImode and SImode.
431 Relative to reg-reg move (2). */
432 {2, 2, 2}, /* cost of storing integer registers */
433 2, /* cost of reg,reg fld/fst */
434 {2, 2, 6}, /* cost of loading fp registers
435 in SFmode, DFmode and XFmode */
436 {4, 4, 6}, /* cost of storing fp registers
437 in SFmode, DFmode and XFmode */
438 2, /* cost of moving MMX register */
439 {2, 2}, /* cost of loading MMX registers
440 in SImode and DImode */
441 {2, 2}, /* cost of storing MMX registers
442 in SImode and DImode */
443 2, /* cost of moving SSE register */
444 {2, 2, 8}, /* cost of loading SSE registers
445 in SImode, DImode and TImode */
446 {2, 2, 8}, /* cost of storing SSE registers
447 in SImode, DImode and TImode */
448 3, /* MMX or SSE register to integer */
449 8, /* size of l1 cache. */
450 256, /* size of l2 cache */
451 32, /* size of prefetch block */
452 6, /* number of parallel prefetches */
453 2, /* Branch cost */
454 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
455 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
456 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
457 COSTS_N_INSNS (2), /* cost of FABS instruction. */
458 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
459 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
460 pentiumpro_memcpy,
461 pentiumpro_memset,
462 1, /* scalar_stmt_cost. */
463 1, /* scalar load_cost. */
464 1, /* scalar_store_cost. */
465 1, /* vec_stmt_cost. */
466 1, /* vec_to_scalar_cost. */
467 1, /* scalar_to_vec_cost. */
468 1, /* vec_align_load_cost. */
469 2, /* vec_unalign_load_cost. */
470 1, /* vec_store_cost. */
471 3, /* cond_taken_branch_cost. */
472 1, /* cond_not_taken_branch_cost. */
473 };
475 static stringop_algs geode_memcpy[2] = {
476 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
477 DUMMY_STRINGOP_ALGS};
478 static stringop_algs geode_memset[2] = {
479 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
480 DUMMY_STRINGOP_ALGS};
481 static const
482 struct processor_costs geode_cost = {
483 COSTS_N_INSNS (1), /* cost of an add instruction */
484 COSTS_N_INSNS (1), /* cost of a lea instruction */
485 COSTS_N_INSNS (2), /* variable shift costs */
486 COSTS_N_INSNS (1), /* constant shift costs */
487 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
488 COSTS_N_INSNS (4), /* HI */
489 COSTS_N_INSNS (7), /* SI */
490 COSTS_N_INSNS (7), /* DI */
491 COSTS_N_INSNS (7)}, /* other */
492 0, /* cost of multiply per each bit set */
493 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
494 COSTS_N_INSNS (23), /* HI */
495 COSTS_N_INSNS (39), /* SI */
496 COSTS_N_INSNS (39), /* DI */
497 COSTS_N_INSNS (39)}, /* other */
498 COSTS_N_INSNS (1), /* cost of movsx */
499 COSTS_N_INSNS (1), /* cost of movzx */
500 8, /* "large" insn */
501 4, /* MOVE_RATIO */
502 1, /* cost for loading QImode using movzbl */
503 {1, 1, 1}, /* cost of loading integer registers
504 in QImode, HImode and SImode.
505 Relative to reg-reg move (2). */
506 {1, 1, 1}, /* cost of storing integer registers */
507 1, /* cost of reg,reg fld/fst */
508 {1, 1, 1}, /* cost of loading fp registers
509 in SFmode, DFmode and XFmode */
510 {4, 6, 6}, /* cost of storing fp registers
511 in SFmode, DFmode and XFmode */
513 1, /* cost of moving MMX register */
514 {1, 1}, /* cost of loading MMX registers
515 in SImode and DImode */
516 {1, 1}, /* cost of storing MMX registers
517 in SImode and DImode */
518 1, /* cost of moving SSE register */
519 {1, 1, 1}, /* cost of loading SSE registers
520 in SImode, DImode and TImode */
521 {1, 1, 1}, /* cost of storing SSE registers
522 in SImode, DImode and TImode */
523 1, /* MMX or SSE register to integer */
524 64, /* size of l1 cache. */
525 128, /* size of l2 cache. */
526 32, /* size of prefetch block */
527 1, /* number of parallel prefetches */
528 1, /* Branch cost */
529 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
530 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
531 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
532 COSTS_N_INSNS (1), /* cost of FABS instruction. */
533 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
534 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
535 geode_memcpy,
536 geode_memset,
537 1, /* scalar_stmt_cost. */
538 1, /* scalar load_cost. */
539 1, /* scalar_store_cost. */
540 1, /* vec_stmt_cost. */
541 1, /* vec_to_scalar_cost. */
542 1, /* scalar_to_vec_cost. */
543 1, /* vec_align_load_cost. */
544 2, /* vec_unalign_load_cost. */
545 1, /* vec_store_cost. */
546 3, /* cond_taken_branch_cost. */
547 1, /* cond_not_taken_branch_cost. */
548 };
550 static stringop_algs k6_memcpy[2] = {
551 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
552 DUMMY_STRINGOP_ALGS};
553 static stringop_algs k6_memset[2] = {
554 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
555 DUMMY_STRINGOP_ALGS};
556 static const
557 struct processor_costs k6_cost = {
558 COSTS_N_INSNS (1), /* cost of an add instruction */
559 COSTS_N_INSNS (2), /* cost of a lea instruction */
560 COSTS_N_INSNS (1), /* variable shift costs */
561 COSTS_N_INSNS (1), /* constant shift costs */
562 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
563 COSTS_N_INSNS (3), /* HI */
564 COSTS_N_INSNS (3), /* SI */
565 COSTS_N_INSNS (3), /* DI */
566 COSTS_N_INSNS (3)}, /* other */
567 0, /* cost of multiply per each bit set */
568 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
569 COSTS_N_INSNS (18), /* HI */
570 COSTS_N_INSNS (18), /* SI */
571 COSTS_N_INSNS (18), /* DI */
572 COSTS_N_INSNS (18)}, /* other */
573 COSTS_N_INSNS (2), /* cost of movsx */
574 COSTS_N_INSNS (2), /* cost of movzx */
575 8, /* "large" insn */
576 4, /* MOVE_RATIO */
577 3, /* cost for loading QImode using movzbl */
578 {4, 5, 4}, /* cost of loading integer registers
579 in QImode, HImode and SImode.
580 Relative to reg-reg move (2). */
581 {2, 3, 2}, /* cost of storing integer registers */
582 4, /* cost of reg,reg fld/fst */
583 {6, 6, 6}, /* cost of loading fp registers
584 in SFmode, DFmode and XFmode */
585 {4, 4, 4}, /* cost of storing fp registers
586 in SFmode, DFmode and XFmode */
587 2, /* cost of moving MMX register */
588 {2, 2}, /* cost of loading MMX registers
589 in SImode and DImode */
590 {2, 2}, /* cost of storing MMX registers
591 in SImode and DImode */
592 2, /* cost of moving SSE register */
593 {2, 2, 8}, /* cost of loading SSE registers
594 in SImode, DImode and TImode */
595 {2, 2, 8}, /* cost of storing SSE registers
596 in SImode, DImode and TImode */
597 6, /* MMX or SSE register to integer */
598 32, /* size of l1 cache. */
599 32, /* size of l2 cache. Some models
600 have integrated l2 cache, but
601 optimizing for k6 is not important
602 enough to worry about that. */
603 32, /* size of prefetch block */
604 1, /* number of parallel prefetches */
605 1, /* Branch cost */
606 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
607 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
608 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
609 COSTS_N_INSNS (2), /* cost of FABS instruction. */
610 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
611 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
612 k6_memcpy,
613 k6_memset,
614 1, /* scalar_stmt_cost. */
615 1, /* scalar load_cost. */
616 1, /* scalar_store_cost. */
617 1, /* vec_stmt_cost. */
618 1, /* vec_to_scalar_cost. */
619 1, /* scalar_to_vec_cost. */
620 1, /* vec_align_load_cost. */
621 2, /* vec_unalign_load_cost. */
622 1, /* vec_store_cost. */
623 3, /* cond_taken_branch_cost. */
624 1, /* cond_not_taken_branch_cost. */
625 };
627 /* For some reason, Athlon deals better with the REP prefix (relative to loops)
628 than K8 does. Alignment becomes important after 8 bytes for memcpy and
629 128 bytes for memset. */
630 static stringop_algs athlon_memcpy[2] = {
631 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
632 DUMMY_STRINGOP_ALGS};
633 static stringop_algs athlon_memset[2] = {
634 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
635 DUMMY_STRINGOP_ALGS};
636 static const
637 struct processor_costs athlon_cost = {
638 COSTS_N_INSNS (1), /* cost of an add instruction */
639 COSTS_N_INSNS (2), /* cost of a lea instruction */
640 COSTS_N_INSNS (1), /* variable shift costs */
641 COSTS_N_INSNS (1), /* constant shift costs */
642 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
643 COSTS_N_INSNS (5), /* HI */
644 COSTS_N_INSNS (5), /* SI */
645 COSTS_N_INSNS (5), /* DI */
646 COSTS_N_INSNS (5)}, /* other */
647 0, /* cost of multiply per each bit set */
648 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
649 COSTS_N_INSNS (26), /* HI */
650 COSTS_N_INSNS (42), /* SI */
651 COSTS_N_INSNS (74), /* DI */
652 COSTS_N_INSNS (74)}, /* other */
653 COSTS_N_INSNS (1), /* cost of movsx */
654 COSTS_N_INSNS (1), /* cost of movzx */
655 8, /* "large" insn */
656 9, /* MOVE_RATIO */
657 4, /* cost for loading QImode using movzbl */
658 {3, 4, 3}, /* cost of loading integer registers
659 in QImode, HImode and SImode.
660 Relative to reg-reg move (2). */
661 {3, 4, 3}, /* cost of storing integer registers */
662 4, /* cost of reg,reg fld/fst */
663 {4, 4, 12}, /* cost of loading fp registers
664 in SFmode, DFmode and XFmode */
665 {6, 6, 8}, /* cost of storing fp registers
666 in SFmode, DFmode and XFmode */
667 2, /* cost of moving MMX register */
668 {4, 4}, /* cost of loading MMX registers
669 in SImode and DImode */
670 {4, 4}, /* cost of storing MMX registers
671 in SImode and DImode */
672 2, /* cost of moving SSE register */
673 {4, 4, 6}, /* cost of loading SSE registers
674 in SImode, DImode and TImode */
675 {4, 4, 5}, /* cost of storing SSE registers
676 in SImode, DImode and TImode */
677 5, /* MMX or SSE register to integer */
678 64, /* size of l1 cache. */
679 256, /* size of l2 cache. */
680 64, /* size of prefetch block */
681 6, /* number of parallel prefetches */
682 5, /* Branch cost */
683 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
684 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
685 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
686 COSTS_N_INSNS (2), /* cost of FABS instruction. */
687 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
688 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
689 athlon_memcpy,
690 athlon_memset,
691 1, /* scalar_stmt_cost. */
692 1, /* scalar load_cost. */
693 1, /* scalar_store_cost. */
694 1, /* vec_stmt_cost. */
695 1, /* vec_to_scalar_cost. */
696 1, /* scalar_to_vec_cost. */
697 1, /* vec_align_load_cost. */
698 2, /* vec_unalign_load_cost. */
699 1, /* vec_store_cost. */
700 3, /* cond_taken_branch_cost. */
701 1, /* cond_not_taken_branch_cost. */
702 };
704 /* K8 has an optimized REP instruction for medium sized blocks, but for very
705 small blocks it is better to use a loop. For large blocks, libcall can
706 do nontemporal accesses and beat inline considerably. */
707 static stringop_algs k8_memcpy[2] = {
708 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
709 {-1, rep_prefix_4_byte, false}}},
710 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
711 {-1, libcall, false}}}};
712 static stringop_algs k8_memset[2] = {
713 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
714 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
715 {libcall, {{48, unrolled_loop, false},
716 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
717 static const
718 struct processor_costs k8_cost = {
719 COSTS_N_INSNS (1), /* cost of an add instruction */
720 COSTS_N_INSNS (2), /* cost of a lea instruction */
721 COSTS_N_INSNS (1), /* variable shift costs */
722 COSTS_N_INSNS (1), /* constant shift costs */
723 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
724 COSTS_N_INSNS (4), /* HI */
725 COSTS_N_INSNS (3), /* SI */
726 COSTS_N_INSNS (4), /* DI */
727 COSTS_N_INSNS (5)}, /* other */
728 0, /* cost of multiply per each bit set */
729 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
730 COSTS_N_INSNS (26), /* HI */
731 COSTS_N_INSNS (42), /* SI */
732 COSTS_N_INSNS (74), /* DI */
733 COSTS_N_INSNS (74)}, /* other */
734 COSTS_N_INSNS (1), /* cost of movsx */
735 COSTS_N_INSNS (1), /* cost of movzx */
736 8, /* "large" insn */
737 9, /* MOVE_RATIO */
738 4, /* cost for loading QImode using movzbl */
739 {3, 4, 3}, /* cost of loading integer registers
740 in QImode, HImode and SImode.
741 Relative to reg-reg move (2). */
742 {3, 4, 3}, /* cost of storing integer registers */
743 4, /* cost of reg,reg fld/fst */
744 {4, 4, 12}, /* cost of loading fp registers
745 in SFmode, DFmode and XFmode */
746 {6, 6, 8}, /* cost of storing fp registers
747 in SFmode, DFmode and XFmode */
748 2, /* cost of moving MMX register */
749 {3, 3}, /* cost of loading MMX registers
750 in SImode and DImode */
751 {4, 4}, /* cost of storing MMX registers
752 in SImode and DImode */
753 2, /* cost of moving SSE register */
754 {4, 3, 6}, /* cost of loading SSE registers
755 in SImode, DImode and TImode */
756 {4, 4, 5}, /* cost of storing SSE registers
757 in SImode, DImode and TImode */
758 5, /* MMX or SSE register to integer */
759 64, /* size of l1 cache. */
760 512, /* size of l2 cache. */
761 64, /* size of prefetch block */
762 /* New AMD processors never drop prefetches; if they cannot be performed
763 immediately, they are queued. We set the number of simultaneous prefetches
764 to a large constant to reflect this (it is probably not a good idea to leave
765 the number of prefetches entirely unlimited, as their execution also takes some
766 time). */
767 100, /* number of parallel prefetches */
768 3, /* Branch cost */
769 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
770 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
771 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
772 COSTS_N_INSNS (2), /* cost of FABS instruction. */
773 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
774 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
776 k8_memcpy,
777 k8_memset,
778 4, /* scalar_stmt_cost. */
779 2, /* scalar load_cost. */
780 2, /* scalar_store_cost. */
781 5, /* vec_stmt_cost. */
782 0, /* vec_to_scalar_cost. */
783 2, /* scalar_to_vec_cost. */
784 2, /* vec_align_load_cost. */
785 3, /* vec_unalign_load_cost. */
786 3, /* vec_store_cost. */
787 3, /* cond_taken_branch_cost. */
788 2, /* cond_not_taken_branch_cost. */
789 };
791 /* AMDFAM10 has an optimized REP instruction for medium sized blocks, but for
792 very small blocks it is better to use a loop. For large blocks, libcall can
793 do nontemporal accesses and beat inline considerably. */
794 static stringop_algs amdfam10_memcpy[2] = {
795 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
796 {-1, rep_prefix_4_byte, false}}},
797 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
798 {-1, libcall, false}}}};
799 static stringop_algs amdfam10_memset[2] = {
800 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
801 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
802 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
803 {-1, libcall, false}}}};
804 struct processor_costs amdfam10_cost = {
805 COSTS_N_INSNS (1), /* cost of an add instruction */
806 COSTS_N_INSNS (2), /* cost of a lea instruction */
807 COSTS_N_INSNS (1), /* variable shift costs */
808 COSTS_N_INSNS (1), /* constant shift costs */
809 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
810 COSTS_N_INSNS (4), /* HI */
811 COSTS_N_INSNS (3), /* SI */
812 COSTS_N_INSNS (4), /* DI */
813 COSTS_N_INSNS (5)}, /* other */
814 0, /* cost of multiply per each bit set */
815 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
816 COSTS_N_INSNS (35), /* HI */
817 COSTS_N_INSNS (51), /* SI */
818 COSTS_N_INSNS (83), /* DI */
819 COSTS_N_INSNS (83)}, /* other */
820 COSTS_N_INSNS (1), /* cost of movsx */
821 COSTS_N_INSNS (1), /* cost of movzx */
822 8, /* "large" insn */
823 9, /* MOVE_RATIO */
824 4, /* cost for loading QImode using movzbl */
825 {3, 4, 3}, /* cost of loading integer registers
826 in QImode, HImode and SImode.
827 Relative to reg-reg move (2). */
828 {3, 4, 3}, /* cost of storing integer registers */
829 4, /* cost of reg,reg fld/fst */
830 {4, 4, 12}, /* cost of loading fp registers
831 in SFmode, DFmode and XFmode */
832 {6, 6, 8}, /* cost of storing fp registers
833 in SFmode, DFmode and XFmode */
834 2, /* cost of moving MMX register */
835 {3, 3}, /* cost of loading MMX registers
836 in SImode and DImode */
837 {4, 4}, /* cost of storing MMX registers
838 in SImode and DImode */
839 2, /* cost of moving SSE register */
840 {4, 4, 3}, /* cost of loading SSE registers
841 in SImode, DImode and TImode */
842 {4, 4, 5}, /* cost of storing SSE registers
843 in SImode, DImode and TImode */
844 3, /* MMX or SSE register to integer */
845 /* On K8:
846 MOVD reg64, xmmreg Double FSTORE 4
847 MOVD reg32, xmmreg Double FSTORE 4
848 On AMDFAM10:
849 MOVD reg64, xmmreg Double FADD 3
850 1/1 1/1
851 MOVD reg32, xmmreg Double FADD 3
852 1/1 1/1 */
853 64, /* size of l1 cache. */
854 512, /* size of l2 cache. */
855 64, /* size of prefetch block */
856 /* New AMD processors never drop prefetches; if they cannot be performed
857 immediately, they are queued. We set the number of simultaneous prefetches
858 to a large constant to reflect this (it is probably not a good idea to leave
859 the number of prefetches entirely unlimited, as their execution also takes some
860 time). */
861 100, /* number of parallel prefetches */
862 2, /* Branch cost */
863 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
864 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
865 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
866 COSTS_N_INSNS (2), /* cost of FABS instruction. */
867 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
868 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
870 amdfam10_memcpy,
871 amdfam10_memset,
872 4, /* scalar_stmt_cost. */
873 2, /* scalar load_cost. */
874 2, /* scalar_store_cost. */
875 6, /* vec_stmt_cost. */
876 0, /* vec_to_scalar_cost. */
877 2, /* scalar_to_vec_cost. */
878 2, /* vec_align_load_cost. */
879 2, /* vec_unalign_load_cost. */
880 2, /* vec_store_cost. */
881 2, /* cond_taken_branch_cost. */
882 1, /* cond_not_taken_branch_cost. */
883 };
885 /* BDVER1 has an optimized REP instruction for medium sized blocks, but for
886 very small blocks it is better to use a loop. For large blocks, libcall
887 can do nontemporal accesses and beat inline considerably. */
888 static stringop_algs bdver1_memcpy[2] = {
889 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
890 {-1, rep_prefix_4_byte, false}}},
891 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
892 {-1, libcall, false}}}};
893 static stringop_algs bdver1_memset[2] = {
894 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
895 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
896 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
897 {-1, libcall, false}}}};
899 const struct processor_costs bdver1_cost = {
900 COSTS_N_INSNS (1), /* cost of an add instruction */
901 COSTS_N_INSNS (1), /* cost of a lea instruction */
902 COSTS_N_INSNS (1), /* variable shift costs */
903 COSTS_N_INSNS (1), /* constant shift costs */
904 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
905 COSTS_N_INSNS (4), /* HI */
906 COSTS_N_INSNS (4), /* SI */
907 COSTS_N_INSNS (6), /* DI */
908 COSTS_N_INSNS (6)}, /* other */
909 0, /* cost of multiply per each bit set */
910 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
911 COSTS_N_INSNS (35), /* HI */
912 COSTS_N_INSNS (51), /* SI */
913 COSTS_N_INSNS (83), /* DI */
914 COSTS_N_INSNS (83)}, /* other */
915 COSTS_N_INSNS (1), /* cost of movsx */
916 COSTS_N_INSNS (1), /* cost of movzx */
917 8, /* "large" insn */
918 9, /* MOVE_RATIO */
919 4, /* cost for loading QImode using movzbl */
920 {5, 5, 4}, /* cost of loading integer registers
921 in QImode, HImode and SImode.
922 Relative to reg-reg move (2). */
923 {4, 4, 4}, /* cost of storing integer registers */
924 2, /* cost of reg,reg fld/fst */
925 {5, 5, 12}, /* cost of loading fp registers
926 in SFmode, DFmode and XFmode */
927 {4, 4, 8}, /* cost of storing fp registers
928 in SFmode, DFmode and XFmode */
929 2, /* cost of moving MMX register */
930 {4, 4}, /* cost of loading MMX registers
931 in SImode and DImode */
932 {4, 4}, /* cost of storing MMX registers
933 in SImode and DImode */
934 2, /* cost of moving SSE register */
935 {4, 4, 4}, /* cost of loading SSE registers
936 in SImode, DImode and TImode */
937 {4, 4, 4}, /* cost of storing SSE registers
938 in SImode, DImode and TImode */
939 2, /* MMX or SSE register to integer */
940 /* On K8:
941 MOVD reg64, xmmreg Double FSTORE 4
942 MOVD reg32, xmmreg Double FSTORE 4
943 On AMDFAM10:
944 MOVD reg64, xmmreg Double FADD 3
945 1/1 1/1
946 MOVD reg32, xmmreg Double FADD 3
947 1/1 1/1 */
948 16, /* size of l1 cache. */
949 2048, /* size of l2 cache. */
950 64, /* size of prefetch block */
951 /* New AMD processors never drop prefetches; if they cannot be performed
952 immediately, they are queued. We set the number of simultaneous prefetches
953 to a large constant to reflect this (it is probably not a good idea to leave
954 the number of prefetches entirely unlimited, as their execution also takes some
955 time). */
956 100, /* number of parallel prefetches */
957 2, /* Branch cost */
958 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
959 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
960 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
961 COSTS_N_INSNS (2), /* cost of FABS instruction. */
962 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
963 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
965 bdver1_memcpy,
966 bdver1_memset,
967 6, /* scalar_stmt_cost. */
968 4, /* scalar load_cost. */
969 4, /* scalar_store_cost. */
970 6, /* vec_stmt_cost. */
971 0, /* vec_to_scalar_cost. */
972 2, /* scalar_to_vec_cost. */
973 4, /* vec_align_load_cost. */
974 4, /* vec_unalign_load_cost. */
975 4, /* vec_store_cost. */
976 2, /* cond_taken_branch_cost. */
977 1, /* cond_not_taken_branch_cost. */
978 };
980 /* BDVER2 has an optimized REP instruction for medium sized blocks, but for
981 very small blocks it is better to use a loop. For large blocks, libcall
982 can do nontemporal accesses and beat inline considerably. */
984 static stringop_algs bdver2_memcpy[2] = {
985 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
986 {-1, rep_prefix_4_byte, false}}},
987 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
988 {-1, libcall, false}}}};
989 static stringop_algs bdver2_memset[2] = {
990 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
991 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
992 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
993 {-1, libcall, false}}}};
995 const struct processor_costs bdver2_cost = {
996 COSTS_N_INSNS (1), /* cost of an add instruction */
997 COSTS_N_INSNS (1), /* cost of a lea instruction */
998 COSTS_N_INSNS (1), /* variable shift costs */
999 COSTS_N_INSNS (1), /* constant shift costs */
1000 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1001 COSTS_N_INSNS (4), /* HI */
1002 COSTS_N_INSNS (4), /* SI */
1003 COSTS_N_INSNS (6), /* DI */
1004 COSTS_N_INSNS (6)}, /* other */
1005 0, /* cost of multiply per each bit set */
1006 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1007 COSTS_N_INSNS (35), /* HI */
1008 COSTS_N_INSNS (51), /* SI */
1009 COSTS_N_INSNS (83), /* DI */
1010 COSTS_N_INSNS (83)}, /* other */
1011 COSTS_N_INSNS (1), /* cost of movsx */
1012 COSTS_N_INSNS (1), /* cost of movzx */
1013 8, /* "large" insn */
1014 9, /* MOVE_RATIO */
1015 4, /* cost for loading QImode using movzbl */
1016 {5, 5, 4}, /* cost of loading integer registers
1017 in QImode, HImode and SImode.
1018 Relative to reg-reg move (2). */
1019 {4, 4, 4}, /* cost of storing integer registers */
1020 2, /* cost of reg,reg fld/fst */
1021 {5, 5, 12}, /* cost of loading fp registers
1022 in SFmode, DFmode and XFmode */
1023 {4, 4, 8}, /* cost of storing fp registers
1024 in SFmode, DFmode and XFmode */
1025 2, /* cost of moving MMX register */
1026 {4, 4}, /* cost of loading MMX registers
1027 in SImode and DImode */
1028 {4, 4}, /* cost of storing MMX registers
1029 in SImode and DImode */
1030 2, /* cost of moving SSE register */
1031 {4, 4, 4}, /* cost of loading SSE registers
1032 in SImode, DImode and TImode */
1033 {4, 4, 4}, /* cost of storing SSE registers
1034 in SImode, DImode and TImode */
1035 2, /* MMX or SSE register to integer */
1036 /* On K8:
1037 MOVD reg64, xmmreg Double FSTORE 4
1038 MOVD reg32, xmmreg Double FSTORE 4
1039 On AMDFAM10:
1040 MOVD reg64, xmmreg Double FADD 3
1041 1/1 1/1
1042 MOVD reg32, xmmreg Double FADD 3
1043 1/1 1/1 */
1044 16, /* size of l1 cache. */
1045 2048, /* size of l2 cache. */
1046 64, /* size of prefetch block */
1047 /* New AMD processors never drop prefetches; if they cannot be performed
1048 immediately, they are queued. We set the number of simultaneous prefetches
1049 to a large constant to reflect this (it is probably not a good idea to leave
1050 the number of prefetches entirely unlimited, as their execution also takes some
1051 time). */
1052 100, /* number of parallel prefetches */
1053 2, /* Branch cost */
1054 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1055 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1056 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1057 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1058 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1059 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1061 bdver2_memcpy,
1062 bdver2_memset,
1063 6, /* scalar_stmt_cost. */
1064 4, /* scalar load_cost. */
1065 4, /* scalar_store_cost. */
1066 6, /* vec_stmt_cost. */
1067 0, /* vec_to_scalar_cost. */
1068 2, /* scalar_to_vec_cost. */
1069 4, /* vec_align_load_cost. */
1070 4, /* vec_unalign_load_cost. */
1071 4, /* vec_store_cost. */
1072 2, /* cond_taken_branch_cost. */
1073 1, /* cond_not_taken_branch_cost. */
1074 };
1077 /* BDVER3 has an optimized REP instruction for medium sized blocks, but for
1078 very small blocks it is better to use a loop. For large blocks, libcall
1079 can do nontemporal accesses and beat inline considerably. */
1080 static stringop_algs bdver3_memcpy[2] = {
1081 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1082 {-1, rep_prefix_4_byte, false}}},
1083 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1084 {-1, libcall, false}}}};
1085 static stringop_algs bdver3_memset[2] = {
1086 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1087 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1088 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1089 {-1, libcall, false}}}};
1090 struct processor_costs bdver3_cost = {
1091 COSTS_N_INSNS (1), /* cost of an add instruction */
1092 COSTS_N_INSNS (1), /* cost of a lea instruction */
1093 COSTS_N_INSNS (1), /* variable shift costs */
1094 COSTS_N_INSNS (1), /* constant shift costs */
1095 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1096 COSTS_N_INSNS (4), /* HI */
1097 COSTS_N_INSNS (4), /* SI */
1098 COSTS_N_INSNS (6), /* DI */
1099 COSTS_N_INSNS (6)}, /* other */
1100 0, /* cost of multiply per each bit set */
1101 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1102 COSTS_N_INSNS (35), /* HI */
1103 COSTS_N_INSNS (51), /* SI */
1104 COSTS_N_INSNS (83), /* DI */
1105 COSTS_N_INSNS (83)}, /* other */
1106 COSTS_N_INSNS (1), /* cost of movsx */
1107 COSTS_N_INSNS (1), /* cost of movzx */
1108 8, /* "large" insn */
1109 9, /* MOVE_RATIO */
1110 4, /* cost for loading QImode using movzbl */
1111 {5, 5, 4}, /* cost of loading integer registers
1112 in QImode, HImode and SImode.
1113 Relative to reg-reg move (2). */
1114 {4, 4, 4}, /* cost of storing integer registers */
1115 2, /* cost of reg,reg fld/fst */
1116 {5, 5, 12}, /* cost of loading fp registers
1117 in SFmode, DFmode and XFmode */
1118 {4, 4, 8}, /* cost of storing fp registers
1119 in SFmode, DFmode and XFmode */
1120 2, /* cost of moving MMX register */
1121 {4, 4}, /* cost of loading MMX registers
1122 in SImode and DImode */
1123 {4, 4}, /* cost of storing MMX registers
1124 in SImode and DImode */
1125 2, /* cost of moving SSE register */
1126 {4, 4, 4}, /* cost of loading SSE registers
1127 in SImode, DImode and TImode */
1128 {4, 4, 4}, /* cost of storing SSE registers
1129 in SImode, DImode and TImode */
1130 2, /* MMX or SSE register to integer */
1131 16, /* size of l1 cache. */
1132 2048, /* size of l2 cache. */
1133 64, /* size of prefetch block */
1134 /* New AMD processors never drop prefetches; if they cannot be performed
1135 immediately, they are queued. We set the number of simultaneous prefetches
1136 to a large constant to reflect this (it is probably not a good idea to leave
1137 the number of prefetches entirely unlimited, as their execution also takes some
1138 time). */
1139 100, /* number of parallel prefetches */
1140 2, /* Branch cost */
1141 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1142 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1143 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1144 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1145 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1146 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1148 bdver3_memcpy,
1149 bdver3_memset,
1150 6, /* scalar_stmt_cost. */
1151 4, /* scalar load_cost. */
1152 4, /* scalar_store_cost. */
1153 6, /* vec_stmt_cost. */
1154 0, /* vec_to_scalar_cost. */
1155 2, /* scalar_to_vec_cost. */
1156 4, /* vec_align_load_cost. */
1157 4, /* vec_unalign_load_cost. */
1158 4, /* vec_store_cost. */
1159 2, /* cond_taken_branch_cost. */
1160 1, /* cond_not_taken_branch_cost. */
1161 };
1163 /* BTVER1 has an optimized REP instruction for medium sized blocks, but for
1164 very small blocks it is better to use a loop. For large blocks, libcall can
1165 do nontemporal accesses and beat inline considerably. */
1166 static stringop_algs btver1_memcpy[2] = {
1167 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1168 {-1, rep_prefix_4_byte, false}}},
1169 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1170 {-1, libcall, false}}}};
1171 static stringop_algs btver1_memset[2] = {
1172 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1173 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1174 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1175 {-1, libcall, false}}}};
1176 const struct processor_costs btver1_cost = {
1177 COSTS_N_INSNS (1), /* cost of an add instruction */
1178 COSTS_N_INSNS (2), /* cost of a lea instruction */
1179 COSTS_N_INSNS (1), /* variable shift costs */
1180 COSTS_N_INSNS (1), /* constant shift costs */
1181 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1182 COSTS_N_INSNS (4), /* HI */
1183 COSTS_N_INSNS (3), /* SI */
1184 COSTS_N_INSNS (4), /* DI */
1185 COSTS_N_INSNS (5)}, /* other */
1186 0, /* cost of multiply per each bit set */
1187 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1188 COSTS_N_INSNS (35), /* HI */
1189 COSTS_N_INSNS (51), /* SI */
1190 COSTS_N_INSNS (83), /* DI */
1191 COSTS_N_INSNS (83)}, /* other */
1192 COSTS_N_INSNS (1), /* cost of movsx */
1193 COSTS_N_INSNS (1), /* cost of movzx */
1194 8, /* "large" insn */
1195 9, /* MOVE_RATIO */
1196 4, /* cost for loading QImode using movzbl */
1197 {3, 4, 3}, /* cost of loading integer registers
1198 in QImode, HImode and SImode.
1199 Relative to reg-reg move (2). */
1200 {3, 4, 3}, /* cost of storing integer registers */
1201 4, /* cost of reg,reg fld/fst */
1202 {4, 4, 12}, /* cost of loading fp registers
1203 in SFmode, DFmode and XFmode */
1204 {6, 6, 8}, /* cost of storing fp registers
1205 in SFmode, DFmode and XFmode */
1206 2, /* cost of moving MMX register */
1207 {3, 3}, /* cost of loading MMX registers
1208 in SImode and DImode */
1209 {4, 4}, /* cost of storing MMX registers
1210 in SImode and DImode */
1211 2, /* cost of moving SSE register */
1212 {4, 4, 3}, /* cost of loading SSE registers
1213 in SImode, DImode and TImode */
1214 {4, 4, 5}, /* cost of storing SSE registers
1215 in SImode, DImode and TImode */
1216 3, /* MMX or SSE register to integer */
1217 /* On K8:
1218 MOVD reg64, xmmreg Double FSTORE 4
1219 MOVD reg32, xmmreg Double FSTORE 4
1220 On AMDFAM10:
1221 MOVD reg64, xmmreg Double FADD 3
1222 1/1 1/1
1223 MOVD reg32, xmmreg Double FADD 3
1224 1/1 1/1 */
1225 32, /* size of l1 cache. */
1226 512, /* size of l2 cache. */
1227 64, /* size of prefetch block */
1228 100, /* number of parallel prefetches */
1229 2, /* Branch cost */
1230 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1231 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1232 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1233 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1234 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1235 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1237 btver1_memcpy,
1238 btver1_memset,
1239 4, /* scalar_stmt_cost. */
1240 2, /* scalar load_cost. */
1241 2, /* scalar_store_cost. */
1242 6, /* vec_stmt_cost. */
1243 0, /* vec_to_scalar_cost. */
1244 2, /* scalar_to_vec_cost. */
1245 2, /* vec_align_load_cost. */
1246 2, /* vec_unalign_load_cost. */
1247 2, /* vec_store_cost. */
1248 2, /* cond_taken_branch_cost. */
1249 1, /* cond_not_taken_branch_cost. */
1250 };
1252 static stringop_algs btver2_memcpy[2] = {
1253 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1254 {-1, rep_prefix_4_byte, false}}},
1255 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1256 {-1, libcall, false}}}};
1257 static stringop_algs btver2_memset[2] = {
1258 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1259 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1260 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1261 {-1, libcall, false}}}};
1262 const struct processor_costs btver2_cost = {
1263 COSTS_N_INSNS (1), /* cost of an add instruction */
1264 COSTS_N_INSNS (2), /* cost of a lea instruction */
1265 COSTS_N_INSNS (1), /* variable shift costs */
1266 COSTS_N_INSNS (1), /* constant shift costs */
1267 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1268 COSTS_N_INSNS (4), /* HI */
1269 COSTS_N_INSNS (3), /* SI */
1270 COSTS_N_INSNS (4), /* DI */
1271 COSTS_N_INSNS (5)}, /* other */
1272 0, /* cost of multiply per each bit set */
1273 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1274 COSTS_N_INSNS (35), /* HI */
1275 COSTS_N_INSNS (51), /* SI */
1276 COSTS_N_INSNS (83), /* DI */
1277 COSTS_N_INSNS (83)}, /* other */
1278 COSTS_N_INSNS (1), /* cost of movsx */
1279 COSTS_N_INSNS (1), /* cost of movzx */
1280 8, /* "large" insn */
1281 9, /* MOVE_RATIO */
1282 4, /* cost for loading QImode using movzbl */
1283 {3, 4, 3}, /* cost of loading integer registers
1284 in QImode, HImode and SImode.
1285 Relative to reg-reg move (2). */
1286 {3, 4, 3}, /* cost of storing integer registers */
1287 4, /* cost of reg,reg fld/fst */
1288 {4, 4, 12}, /* cost of loading fp registers
1289 in SFmode, DFmode and XFmode */
1290 {6, 6, 8}, /* cost of storing fp registers
1291 in SFmode, DFmode and XFmode */
1292 2, /* cost of moving MMX register */
1293 {3, 3}, /* cost of loading MMX registers
1294 in SImode and DImode */
1295 {4, 4}, /* cost of storing MMX registers
1296 in SImode and DImode */
1297 2, /* cost of moving SSE register */
1298 {4, 4, 3}, /* cost of loading SSE registers
1299 in SImode, DImode and TImode */
1300 {4, 4, 5}, /* cost of storing SSE registers
1301 in SImode, DImode and TImode */
1302 3, /* MMX or SSE register to integer */
1303 /* On K8:
1304 MOVD reg64, xmmreg Double FSTORE 4
1305 MOVD reg32, xmmreg Double FSTORE 4
1306 On AMDFAM10:
1307 MOVD reg64, xmmreg Double FADD 3
1308 1/1 1/1
1309 MOVD reg32, xmmreg Double FADD 3
1310 1/1 1/1 */
1311 32, /* size of l1 cache. */
1312 2048, /* size of l2 cache. */
1313 64, /* size of prefetch block */
1314 100, /* number of parallel prefetches */
1315 2, /* Branch cost */
1316 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1317 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1318 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1319 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1320 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1321 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1322 btver2_memcpy,
1323 btver2_memset,
1324 4, /* scalar_stmt_cost. */
1325 2, /* scalar load_cost. */
1326 2, /* scalar_store_cost. */
1327 6, /* vec_stmt_cost. */
1328 0, /* vec_to_scalar_cost. */
1329 2, /* scalar_to_vec_cost. */
1330 2, /* vec_align_load_cost. */
1331 2, /* vec_unalign_load_cost. */
1332 2, /* vec_store_cost. */
1333 2, /* cond_taken_branch_cost. */
1334 1, /* cond_not_taken_branch_cost. */
1335 };
1337 static stringop_algs pentium4_memcpy[2] = {
1338 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1339 DUMMY_STRINGOP_ALGS};
1340 static stringop_algs pentium4_memset[2] = {
1341 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1342 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1343 DUMMY_STRINGOP_ALGS};
1345 static const
1346 struct processor_costs pentium4_cost = {
1347 COSTS_N_INSNS (1), /* cost of an add instruction */
1348 COSTS_N_INSNS (3), /* cost of a lea instruction */
1349 COSTS_N_INSNS (4), /* variable shift costs */
1350 COSTS_N_INSNS (4), /* constant shift costs */
1351 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1352 COSTS_N_INSNS (15), /* HI */
1353 COSTS_N_INSNS (15), /* SI */
1354 COSTS_N_INSNS (15), /* DI */
1355 COSTS_N_INSNS (15)}, /* other */
1356 0, /* cost of multiply per each bit set */
1357 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1358 COSTS_N_INSNS (56), /* HI */
1359 COSTS_N_INSNS (56), /* SI */
1360 COSTS_N_INSNS (56), /* DI */
1361 COSTS_N_INSNS (56)}, /* other */
1362 COSTS_N_INSNS (1), /* cost of movsx */
1363 COSTS_N_INSNS (1), /* cost of movzx */
1364 16, /* "large" insn */
1365 6, /* MOVE_RATIO */
1366 2, /* cost for loading QImode using movzbl */
1367 {4, 5, 4}, /* cost of loading integer registers
1368 in QImode, HImode and SImode.
1369 Relative to reg-reg move (2). */
1370 {2, 3, 2}, /* cost of storing integer registers */
1371 2, /* cost of reg,reg fld/fst */
1372 {2, 2, 6}, /* cost of loading fp registers
1373 in SFmode, DFmode and XFmode */
1374 {4, 4, 6}, /* cost of storing fp registers
1375 in SFmode, DFmode and XFmode */
1376 2, /* cost of moving MMX register */
1377 {2, 2}, /* cost of loading MMX registers
1378 in SImode and DImode */
1379 {2, 2}, /* cost of storing MMX registers
1380 in SImode and DImode */
1381 12, /* cost of moving SSE register */
1382 {12, 12, 12}, /* cost of loading SSE registers
1383 in SImode, DImode and TImode */
1384 {2, 2, 8}, /* cost of storing SSE registers
1385 in SImode, DImode and TImode */
1386 10, /* MMX or SSE register to integer */
1387 8, /* size of l1 cache. */
1388 256, /* size of l2 cache. */
1389 64, /* size of prefetch block */
1390 6, /* number of parallel prefetches */
1391 2, /* Branch cost */
1392 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1393 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1394 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1395 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1396 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1397 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1398 pentium4_memcpy,
1399 pentium4_memset,
1400 1, /* scalar_stmt_cost. */
1401 1, /* scalar load_cost. */
1402 1, /* scalar_store_cost. */
1403 1, /* vec_stmt_cost. */
1404 1, /* vec_to_scalar_cost. */
1405 1, /* scalar_to_vec_cost. */
1406 1, /* vec_align_load_cost. */
1407 2, /* vec_unalign_load_cost. */
1408 1, /* vec_store_cost. */
1409 3, /* cond_taken_branch_cost. */
1410 1, /* cond_not_taken_branch_cost. */
1411 };
1413 static stringop_algs nocona_memcpy[2] = {
1414 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1415 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1416 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1418 static stringop_algs nocona_memset[2] = {
1419 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1420 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1421 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1422 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1424 static const
1425 struct processor_costs nocona_cost = {
1426 COSTS_N_INSNS (1), /* cost of an add instruction */
1427 COSTS_N_INSNS (1), /* cost of a lea instruction */
1428 COSTS_N_INSNS (1), /* variable shift costs */
1429 COSTS_N_INSNS (1), /* constant shift costs */
1430 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1431 COSTS_N_INSNS (10), /* HI */
1432 COSTS_N_INSNS (10), /* SI */
1433 COSTS_N_INSNS (10), /* DI */
1434 COSTS_N_INSNS (10)}, /* other */
1435 0, /* cost of multiply per each bit set */
1436 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1437 COSTS_N_INSNS (66), /* HI */
1438 COSTS_N_INSNS (66), /* SI */
1439 COSTS_N_INSNS (66), /* DI */
1440 COSTS_N_INSNS (66)}, /* other */
1441 COSTS_N_INSNS (1), /* cost of movsx */
1442 COSTS_N_INSNS (1), /* cost of movzx */
1443 16, /* "large" insn */
1444 17, /* MOVE_RATIO */
1445 4, /* cost for loading QImode using movzbl */
1446 {4, 4, 4}, /* cost of loading integer registers
1447 in QImode, HImode and SImode.
1448 Relative to reg-reg move (2). */
1449 {4, 4, 4}, /* cost of storing integer registers */
1450 3, /* cost of reg,reg fld/fst */
1451 {12, 12, 12}, /* cost of loading fp registers
1452 in SFmode, DFmode and XFmode */
1453 {4, 4, 4}, /* cost of storing fp registers
1454 in SFmode, DFmode and XFmode */
1455 6, /* cost of moving MMX register */
1456 {12, 12}, /* cost of loading MMX registers
1457 in SImode and DImode */
1458 {12, 12}, /* cost of storing MMX registers
1459 in SImode and DImode */
1460 6, /* cost of moving SSE register */
1461 {12, 12, 12}, /* cost of loading SSE registers
1462 in SImode, DImode and TImode */
1463 {12, 12, 12}, /* cost of storing SSE registers
1464 in SImode, DImode and TImode */
1465 8, /* MMX or SSE register to integer */
1466 8, /* size of l1 cache. */
1467 1024, /* size of l2 cache. */
1468 128, /* size of prefetch block */
1469 8, /* number of parallel prefetches */
1470 1, /* Branch cost */
1471 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1472 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1473 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1474 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1475 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1476 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1477 nocona_memcpy,
1478 nocona_memset,
1479 1, /* scalar_stmt_cost. */
1480 1,					/* scalar_load_cost.  */
1481 1, /* scalar_store_cost. */
1482 1, /* vec_stmt_cost. */
1483 1, /* vec_to_scalar_cost. */
1484 1, /* scalar_to_vec_cost. */
1485 1, /* vec_align_load_cost. */
1486 2, /* vec_unalign_load_cost. */
1487 1, /* vec_store_cost. */
1488 3, /* cond_taken_branch_cost. */
1489 1, /* cond_not_taken_branch_cost. */
1492 static stringop_algs atom_memcpy[2] = {
1493 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1494 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1495 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1496 static stringop_algs atom_memset[2] = {
1497 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1498 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1499 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1500 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1501 static const
1502 struct processor_costs atom_cost = {
1503 COSTS_N_INSNS (1), /* cost of an add instruction */
1504 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1505 COSTS_N_INSNS (1), /* variable shift costs */
1506 COSTS_N_INSNS (1), /* constant shift costs */
1507 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1508 COSTS_N_INSNS (4), /* HI */
1509 COSTS_N_INSNS (3), /* SI */
1510 COSTS_N_INSNS (4), /* DI */
1511 COSTS_N_INSNS (2)}, /* other */
1512 0, /* cost of multiply per each bit set */
1513 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1514 COSTS_N_INSNS (26), /* HI */
1515 COSTS_N_INSNS (42), /* SI */
1516 COSTS_N_INSNS (74), /* DI */
1517 COSTS_N_INSNS (74)}, /* other */
1518 COSTS_N_INSNS (1), /* cost of movsx */
1519 COSTS_N_INSNS (1), /* cost of movzx */
1520 8, /* "large" insn */
1521 17, /* MOVE_RATIO */
1522 4, /* cost for loading QImode using movzbl */
1523 {4, 4, 4}, /* cost of loading integer registers
1524 in QImode, HImode and SImode.
1525 Relative to reg-reg move (2). */
1526 {4, 4, 4}, /* cost of storing integer registers */
1527 4, /* cost of reg,reg fld/fst */
1528 {12, 12, 12}, /* cost of loading fp registers
1529 in SFmode, DFmode and XFmode */
1530 {6, 6, 8}, /* cost of storing fp registers
1531 in SFmode, DFmode and XFmode */
1532 2, /* cost of moving MMX register */
1533 {8, 8}, /* cost of loading MMX registers
1534 in SImode and DImode */
1535 {8, 8}, /* cost of storing MMX registers
1536 in SImode and DImode */
1537 2, /* cost of moving SSE register */
1538 {8, 8, 8}, /* cost of loading SSE registers
1539 in SImode, DImode and TImode */
1540 {8, 8, 8}, /* cost of storing SSE registers
1541 in SImode, DImode and TImode */
1542 5, /* MMX or SSE register to integer */
1543 32, /* size of l1 cache. */
1544 256, /* size of l2 cache. */
1545 64, /* size of prefetch block */
1546 6, /* number of parallel prefetches */
1547 3, /* Branch cost */
1548 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1549 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1550 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1551 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1552 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1553 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1554 atom_memcpy,
1555 atom_memset,
1556 1, /* scalar_stmt_cost. */
1557 1,					/* scalar_load_cost.  */
1558 1, /* scalar_store_cost. */
1559 1, /* vec_stmt_cost. */
1560 1, /* vec_to_scalar_cost. */
1561 1, /* scalar_to_vec_cost. */
1562 1, /* vec_align_load_cost. */
1563 2, /* vec_unalign_load_cost. */
1564 1, /* vec_store_cost. */
1565 3, /* cond_taken_branch_cost. */
1566 1, /* cond_not_taken_branch_cost. */
1569 static stringop_algs slm_memcpy[2] = {
1570 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1571 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1572 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1573 static stringop_algs slm_memset[2] = {
1574 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1575 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1576 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1577 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1578 static const
1579 struct processor_costs slm_cost = {
1580 COSTS_N_INSNS (1), /* cost of an add instruction */
1581 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1582 COSTS_N_INSNS (1), /* variable shift costs */
1583 COSTS_N_INSNS (1), /* constant shift costs */
1584 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1585 COSTS_N_INSNS (4), /* HI */
1586 COSTS_N_INSNS (3), /* SI */
1587 COSTS_N_INSNS (4), /* DI */
1588 COSTS_N_INSNS (2)}, /* other */
1589 0, /* cost of multiply per each bit set */
1590 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1591 COSTS_N_INSNS (26), /* HI */
1592 COSTS_N_INSNS (42), /* SI */
1593 COSTS_N_INSNS (74), /* DI */
1594 COSTS_N_INSNS (74)}, /* other */
1595 COSTS_N_INSNS (1), /* cost of movsx */
1596 COSTS_N_INSNS (1), /* cost of movzx */
1597 8, /* "large" insn */
1598 17, /* MOVE_RATIO */
1599 4, /* cost for loading QImode using movzbl */
1600 {4, 4, 4}, /* cost of loading integer registers
1601 in QImode, HImode and SImode.
1602 Relative to reg-reg move (2). */
1603 {4, 4, 4}, /* cost of storing integer registers */
1604 4, /* cost of reg,reg fld/fst */
1605 {12, 12, 12}, /* cost of loading fp registers
1606 in SFmode, DFmode and XFmode */
1607 {6, 6, 8}, /* cost of storing fp registers
1608 in SFmode, DFmode and XFmode */
1609 2, /* cost of moving MMX register */
1610 {8, 8}, /* cost of loading MMX registers
1611 in SImode and DImode */
1612 {8, 8}, /* cost of storing MMX registers
1613 in SImode and DImode */
1614 2, /* cost of moving SSE register */
1615 {8, 8, 8}, /* cost of loading SSE registers
1616 in SImode, DImode and TImode */
1617 {8, 8, 8}, /* cost of storing SSE registers
1618 in SImode, DImode and TImode */
1619 5, /* MMX or SSE register to integer */
1620 32, /* size of l1 cache. */
1621 256, /* size of l2 cache. */
1622 64, /* size of prefetch block */
1623 6, /* number of parallel prefetches */
1624 3, /* Branch cost */
1625 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1626 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1627 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1628 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1629 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1630 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1631 slm_memcpy,
1632 slm_memset,
1633 1, /* scalar_stmt_cost. */
1634 1,					/* scalar_load_cost.  */
1635 1, /* scalar_store_cost. */
1636 1, /* vec_stmt_cost. */
1637 1, /* vec_to_scalar_cost. */
1638 1, /* scalar_to_vec_cost. */
1639 1, /* vec_align_load_cost. */
1640 2, /* vec_unalign_load_cost. */
1641 1, /* vec_store_cost. */
1642 3, /* cond_taken_branch_cost. */
1643 1, /* cond_not_taken_branch_cost. */
1646 /* Generic should produce code tuned for Core i7 (and newer) and btver1
1647    (and newer) chips.  */
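/* Note on reading the stringop_algs tables: element [0] of each pair is used
   for 32-bit code and element [1] for 64-bit code, and the {max_size,
   algorithm, noalign} entries are tried in order, with the last range
   required to use a max_size of -1 (the same size-range rules described in
   the -mmemcpy-strategy= comment further below).  */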
1649 static stringop_algs generic_memcpy[2] = {
1650 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1651 {-1, libcall, false}}},
1652 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1653 {-1, libcall, false}}}};
1654 static stringop_algs generic_memset[2] = {
1655 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1656 {-1, libcall, false}}},
1657 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1658 {-1, libcall, false}}}};
1659 static const
1660 struct processor_costs generic_cost = {
1661 COSTS_N_INSNS (1), /* cost of an add instruction */
1662 /* On all chips taken into consideration, lea takes 2 cycles or more.  With
1663    this cost, however, our current implementation of synth_mult results in
1664    the use of unnecessary temporary registers, causing regressions on several
1665    SPECfp benchmarks.  */
1666 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1667 COSTS_N_INSNS (1), /* variable shift costs */
1668 COSTS_N_INSNS (1), /* constant shift costs */
1669 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1670 COSTS_N_INSNS (4), /* HI */
1671 COSTS_N_INSNS (3), /* SI */
1672 COSTS_N_INSNS (4), /* DI */
1673 COSTS_N_INSNS (2)}, /* other */
1674 0, /* cost of multiply per each bit set */
1675 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1676 COSTS_N_INSNS (26), /* HI */
1677 COSTS_N_INSNS (42), /* SI */
1678 COSTS_N_INSNS (74), /* DI */
1679 COSTS_N_INSNS (74)}, /* other */
1680 COSTS_N_INSNS (1), /* cost of movsx */
1681 COSTS_N_INSNS (1), /* cost of movzx */
1682 8, /* "large" insn */
1683 17, /* MOVE_RATIO */
1684 4, /* cost for loading QImode using movzbl */
1685 {4, 4, 4}, /* cost of loading integer registers
1686 in QImode, HImode and SImode.
1687 Relative to reg-reg move (2). */
1688 {4, 4, 4}, /* cost of storing integer registers */
1689 4, /* cost of reg,reg fld/fst */
1690 {12, 12, 12}, /* cost of loading fp registers
1691 in SFmode, DFmode and XFmode */
1692 {6, 6, 8}, /* cost of storing fp registers
1693 in SFmode, DFmode and XFmode */
1694 2, /* cost of moving MMX register */
1695 {8, 8}, /* cost of loading MMX registers
1696 in SImode and DImode */
1697 {8, 8}, /* cost of storing MMX registers
1698 in SImode and DImode */
1699 2, /* cost of moving SSE register */
1700 {8, 8, 8}, /* cost of loading SSE registers
1701 in SImode, DImode and TImode */
1702 {8, 8, 8}, /* cost of storing SSE registers
1703 in SImode, DImode and TImode */
1704 5, /* MMX or SSE register to integer */
1705 32, /* size of l1 cache. */
1706 512, /* size of l2 cache. */
1707 64, /* size of prefetch block */
1708 6, /* number of parallel prefetches */
1709 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1710    value is increased to the perhaps more appropriate value of 5.  */
1711 3, /* Branch cost */
1712 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1713 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1714 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1715 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1716 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1717 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1718 generic_memcpy,
1719 generic_memset,
1720 1, /* scalar_stmt_cost. */
1721 1,					/* scalar_load_cost.  */
1722 1, /* scalar_store_cost. */
1723 1, /* vec_stmt_cost. */
1724 1, /* vec_to_scalar_cost. */
1725 1, /* scalar_to_vec_cost. */
1726 1, /* vec_align_load_cost. */
1727 2, /* vec_unalign_load_cost. */
1728 1, /* vec_store_cost. */
1729 3, /* cond_taken_branch_cost. */
1730 1, /* cond_not_taken_branch_cost. */
1733 /* core_cost should produce code tuned for the Core family of CPUs.  */
1734 static stringop_algs core_memcpy[2] = {
1735 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1736 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
1737 {-1, libcall, false}}}};
1738 static stringop_algs core_memset[2] = {
1739 {libcall, {{6, loop_1_byte, true},
1740 {24, loop, true},
1741 {8192, rep_prefix_4_byte, true},
1742 {-1, libcall, false}}},
1743 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
1744 {-1, libcall, false}}}};
1746 static const
1747 struct processor_costs core_cost = {
1748 COSTS_N_INSNS (1), /* cost of an add instruction */
1749 /* On all chips taken into consideration, lea takes 2 cycles or more.  With
1750    this cost, however, our current implementation of synth_mult results in
1751    the use of unnecessary temporary registers, causing regressions on several
1752    SPECfp benchmarks.  */
1753 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1754 COSTS_N_INSNS (1), /* variable shift costs */
1755 COSTS_N_INSNS (1), /* constant shift costs */
1756 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1757 COSTS_N_INSNS (4), /* HI */
1758 COSTS_N_INSNS (3), /* SI */
1759 COSTS_N_INSNS (4), /* DI */
1760 COSTS_N_INSNS (2)}, /* other */
1761 0, /* cost of multiply per each bit set */
1762 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1763 COSTS_N_INSNS (26), /* HI */
1764 COSTS_N_INSNS (42), /* SI */
1765 COSTS_N_INSNS (74), /* DI */
1766 COSTS_N_INSNS (74)}, /* other */
1767 COSTS_N_INSNS (1), /* cost of movsx */
1768 COSTS_N_INSNS (1), /* cost of movzx */
1769 8, /* "large" insn */
1770 17, /* MOVE_RATIO */
1771 4, /* cost for loading QImode using movzbl */
1772 {4, 4, 4}, /* cost of loading integer registers
1773 in QImode, HImode and SImode.
1774 Relative to reg-reg move (2). */
1775 {4, 4, 4}, /* cost of storing integer registers */
1776 4, /* cost of reg,reg fld/fst */
1777 {12, 12, 12}, /* cost of loading fp registers
1778 in SFmode, DFmode and XFmode */
1779 {6, 6, 8}, /* cost of storing fp registers
1780 in SFmode, DFmode and XFmode */
1781 2, /* cost of moving MMX register */
1782 {8, 8}, /* cost of loading MMX registers
1783 in SImode and DImode */
1784 {8, 8}, /* cost of storing MMX registers
1785 in SImode and DImode */
1786 2, /* cost of moving SSE register */
1787 {8, 8, 8}, /* cost of loading SSE registers
1788 in SImode, DImode and TImode */
1789 {8, 8, 8}, /* cost of storing SSE registers
1790 in SImode, DImode and TImode */
1791 5, /* MMX or SSE register to integer */
1792 64, /* size of l1 cache. */
1793 512, /* size of l2 cache. */
1794 64, /* size of prefetch block */
1795 6, /* number of parallel prefetches */
1796 /* FIXME: perhaps a more appropriate value is 5.  */
1797 3, /* Branch cost */
1798 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1799 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1800 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1801 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1802 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1803 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1804 core_memcpy,
1805 core_memset,
1806 1, /* scalar_stmt_cost. */
1807 1,					/* scalar_load_cost.  */
1808 1, /* scalar_store_cost. */
1809 1, /* vec_stmt_cost. */
1810 1, /* vec_to_scalar_cost. */
1811 1, /* scalar_to_vec_cost. */
1812 1, /* vec_align_load_cost. */
1813 2, /* vec_unalign_load_cost. */
1814 1, /* vec_store_cost. */
1815 3, /* cond_taken_branch_cost. */
1816 1, /* cond_not_taken_branch_cost. */
1820 /* Set by -mtune. */
1821 const struct processor_costs *ix86_tune_cost = &pentium_cost;
1823 /* Set by -mtune or -Os. */
1824 const struct processor_costs *ix86_cost = &pentium_cost;
1826 /* Processor feature/optimization bitmasks. */
1827 #define m_386 (1<<PROCESSOR_I386)
1828 #define m_486 (1<<PROCESSOR_I486)
1829 #define m_PENT (1<<PROCESSOR_PENTIUM)
1830 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1831 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1832 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1833 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1834 #define m_CORE2 (1<<PROCESSOR_CORE2)
1835 #define m_COREI7 (1<<PROCESSOR_COREI7)
1836 #define m_HASWELL (1<<PROCESSOR_HASWELL)
1837 #define m_CORE_ALL (m_CORE2 | m_COREI7 | m_HASWELL)
1838 #define m_ATOM (1<<PROCESSOR_ATOM)
1839 #define m_SLM (1<<PROCESSOR_SLM)
1841 #define m_GEODE (1<<PROCESSOR_GEODE)
1842 #define m_K6 (1<<PROCESSOR_K6)
1843 #define m_K6_GEODE (m_K6 | m_GEODE)
1844 #define m_K8 (1<<PROCESSOR_K8)
1845 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1846 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1847 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1848 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1849 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1850 #define m_BDVER3 (1<<PROCESSOR_BDVER3)
1851 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1852 #define m_BTVER2 (1<<PROCESSOR_BTVER2)
1853 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3)
1854 #define m_BTVER (m_BTVER1 | m_BTVER2)
1855 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
1857 #define m_GENERIC (1<<PROCESSOR_GENERIC)
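/* For illustration only, a selector in x86-tune.def combines the masks above,
   e.g. an entry of the form

     DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave",
               m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)

   (see x86-tune.def for the authoritative entries).  Each selector is later
   tested against 1 << ix86_tune when ix86_tune_features is initialized.  */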
1859 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
1860 #undef DEF_TUNE
1861 #define DEF_TUNE(tune, name, selector) name,
1862 #include "x86-tune.def"
1863 #undef DEF_TUNE
1866 /* Feature tests against the various tunings. */
1867 unsigned char ix86_tune_features[X86_TUNE_LAST];
1869 /* Feature tests against the various tunings used to create ix86_tune_features
1870 based on the processor mask. */
1871 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1872 #undef DEF_TUNE
1873 #define DEF_TUNE(tune, name, selector) selector,
1874 #include "x86-tune.def"
1875 #undef DEF_TUNE
1878 /* Feature tests against the various architecture variations. */
1879 unsigned char ix86_arch_features[X86_ARCH_LAST];
1881 /* Feature tests against the various architecture variations, used to create
1882 ix86_arch_features based on the processor mask. */
1883 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
1884 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
1885 ~(m_386 | m_486 | m_PENT | m_K6),
1887 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
1888 ~m_386,
1890 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
1891 ~(m_386 | m_486),
1893 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
1894 ~m_386,
1896 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
1897 ~m_386,
1900 /* If the average insn count for a single function invocation is
1901    lower than this constant, emit fast (but longer) prologue and
1902    epilogue code.  */
1903 #define FAST_PROLOGUE_INSN_COUNT 20
1905 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively.  */
1906 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
1907 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
1908 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
1910 /* Array of the smallest class containing reg number REGNO, indexed by
1911 REGNO. Used by REGNO_REG_CLASS in i386.h. */
1913 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
1915 /* ax, dx, cx, bx */
1916 AREG, DREG, CREG, BREG,
1917 /* si, di, bp, sp */
1918 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
1919 /* FP registers */
1920 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
1921 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
1922 /* arg pointer */
1923 NON_Q_REGS,
1924 /* flags, fpsr, fpcr, frame */
1925 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
1926 /* SSE registers */
1927 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
1928 SSE_REGS, SSE_REGS,
1929 /* MMX registers */
1930 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
1931 MMX_REGS, MMX_REGS,
1932 /* REX registers */
1933 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
1934 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
1935 /* SSE REX registers */
1936 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
1937 SSE_REGS, SSE_REGS,
1938 /* AVX-512 SSE registers */
1939 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
1940 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
1941 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
1942 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
1943 /* Mask registers. */
1944 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
1945 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
1946 /* MPX bound registers */
1947 BND_REGS, BND_REGS, BND_REGS, BND_REGS,
1950 /* The "default" register map used in 32bit mode. */
1952 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
1954 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
1955 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
1956 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
1957 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
1958 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
1959 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
1960 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
1961 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23 */
1962 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31 */
1963 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
1964 101, 102, 103, 104, /* bound registers */
1967 /* The "default" register map used in 64bit mode. */
1969 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
1971 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
1972 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
1973 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
1974 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
1975 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
1976 8,9,10,11,12,13,14,15, /* extended integer registers */
1977 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
1978 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
1979 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
1980 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
1981 126, 127, 128, 129, /* bound registers */
1984 /* Define the register numbers to be used in Dwarf debugging information.
1985 The SVR4 reference port C compiler uses the following register numbers
1986 in its Dwarf output code:
1987 0 for %eax (gcc regno = 0)
1988 1 for %ecx (gcc regno = 2)
1989 2 for %edx (gcc regno = 1)
1990 3 for %ebx (gcc regno = 3)
1991 4 for %esp (gcc regno = 7)
1992 5 for %ebp (gcc regno = 6)
1993 6 for %esi (gcc regno = 4)
1994 7 for %edi (gcc regno = 5)
1995 The following three DWARF register numbers are never generated by
1996 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
1997 believes these numbers have these meanings.
1998 8 for %eip (no gcc equivalent)
1999 9 for %eflags (gcc regno = 17)
2000 10 for %trapno (no gcc equivalent)
2001 It is not at all clear how we should number the FP stack registers
2002 for the x86 architecture. If the version of SDB on x86/svr4 were
2003 a bit less brain dead with respect to floating-point then we would
2004 have a precedent to follow with respect to DWARF register numbers
2005 for x86 FP registers, but the SDB on x86/svr4 is so completely
2006 broken with respect to FP registers that it is hardly worth thinking
2007 of it as something to strive for compatibility with.
2008 The version of x86/svr4 SDB I have at the moment does (partially)
2009 seem to believe that DWARF register number 11 is associated with
2010 the x86 register %st(0), but that's about all. Higher DWARF
2011 register numbers don't seem to be associated with anything in
2012 particular, and even for DWARF regno 11, SDB only seems to under-
2013 stand that it should say that a variable lives in %st(0) (when
2014 asked via an `=' command) if we said it was in DWARF regno 11,
2015 but SDB still prints garbage when asked for the value of the
2016 variable in question (via a `/' command).
2017 (Also note that the labels SDB prints for various FP stack regs
2018 when doing an `x' command are all wrong.)
2019 Note that these problems generally don't affect the native SVR4
2020 C compiler because it doesn't allow the use of -O with -g and
2021 because when it is *not* optimizing, it allocates a memory
2022 location for each floating-point variable, and the memory
2023 location is what gets described in the DWARF AT_location
2024 attribute for the variable in question.
2025 Regardless of the severe mental illness of the x86/svr4 SDB, we
2026 do something sensible here and we use the following DWARF
2027 register numbers. Note that these are all stack-top-relative
2028 numbers.
2029 11 for %st(0) (gcc regno = 8)
2030 12 for %st(1) (gcc regno = 9)
2031 13 for %st(2) (gcc regno = 10)
2032 14 for %st(3) (gcc regno = 11)
2033 15 for %st(4) (gcc regno = 12)
2034 16 for %st(5) (gcc regno = 13)
2035 17 for %st(6) (gcc regno = 14)
2036 18 for %st(7) (gcc regno = 15)
2038 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2040 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2041 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2042 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2043 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2044 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2045 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2046 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2047 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23 */
2048 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31 */
2049 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2050 -1, -1, -1, -1, /* bound registers */
2053 /* Define parameter passing and return registers. */
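/* For reference: the SysV AMD64 ABI passes the first six integer arguments
   in %rdi, %rsi, %rdx, %rcx, %r8 and %r9, while the Microsoft x64 ABI passes
   the first four in %rcx, %rdx, %r8 and %r9; the tables below spell out that
   order using the register number macros from i386.h.  */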
2055 static int const x86_64_int_parameter_registers[6] =
2057 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2060 static int const x86_64_ms_abi_int_parameter_registers[4] =
2062 CX_REG, DX_REG, R8_REG, R9_REG
2065 static int const x86_64_int_return_registers[4] =
2067 AX_REG, DX_REG, DI_REG, SI_REG
2070 /* Additional registers that are clobbered by SYSV calls. */
2072 int const x86_64_ms_sysv_extra_clobbered_registers[12] =
2074 SI_REG, DI_REG,
2075 XMM6_REG, XMM7_REG,
2076 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
2077 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
2080 /* Define the structure for the machine field in struct function. */
2082 struct GTY(()) stack_local_entry {
2083 unsigned short mode;
2084 unsigned short n;
2085 rtx rtl;
2086 struct stack_local_entry *next;
2089 /* Structure describing stack frame layout.
2090 Stack grows downward:
2092 [arguments]
2093 <- ARG_POINTER
2094 saved pc
2096 saved static chain if ix86_static_chain_on_stack
2098 saved frame pointer if frame_pointer_needed
2099 <- HARD_FRAME_POINTER
2100 [saved regs]
2101 <- regs_save_offset
2102 [padding0]
2104 [saved SSE regs]
2105 <- sse_regs_save_offset
2106 [padding1] |
2107 | <- FRAME_POINTER
2108 [va_arg registers] |
2110 [frame] |
2112 [padding2] | = to_allocate
2113 <- STACK_POINTER
2115 struct ix86_frame
2117 int nsseregs;
2118 int nregs;
2119 int va_arg_size;
2120 int red_zone_size;
2121 int outgoing_arguments_size;
2123 /* The offsets relative to ARG_POINTER. */
2124 HOST_WIDE_INT frame_pointer_offset;
2125 HOST_WIDE_INT hard_frame_pointer_offset;
2126 HOST_WIDE_INT stack_pointer_offset;
2127 HOST_WIDE_INT hfp_save_offset;
2128 HOST_WIDE_INT reg_save_offset;
2129 HOST_WIDE_INT sse_reg_save_offset;
2131 /* When save_regs_using_mov is set, emit prologue using
2132 move instead of push instructions. */
2133 bool save_regs_using_mov;
2136 /* Which cpu are we scheduling for. */
2137 enum attr_cpu ix86_schedule;
2139 /* Which cpu are we optimizing for. */
2140 enum processor_type ix86_tune;
2142 /* Which instruction set architecture to use. */
2143 enum processor_type ix86_arch;
2145 /* True if processor has SSE prefetch instruction. */
2146 unsigned char x86_prefetch_sse;
2148 /* -mstackrealign option */
2149 static const char ix86_force_align_arg_pointer_string[]
2150 = "force_align_arg_pointer";
2152 static rtx (*ix86_gen_leave) (void);
2153 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2154 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2155 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2156 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2157 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2158 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2159 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2160 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2161 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2162 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2163 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2165 /* Preferred alignment for stack boundary in bits. */
2166 unsigned int ix86_preferred_stack_boundary;
2168 /* Alignment for incoming stack boundary in bits specified at
2169 command line. */
2170 static unsigned int ix86_user_incoming_stack_boundary;
2172 /* Default alignment for incoming stack boundary in bits. */
2173 static unsigned int ix86_default_incoming_stack_boundary;
2175 /* Alignment for incoming stack boundary in bits. */
2176 unsigned int ix86_incoming_stack_boundary;
2178 /* Calling abi specific va_list type nodes. */
2179 static GTY(()) tree sysv_va_list_type_node;
2180 static GTY(()) tree ms_va_list_type_node;
2182 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2183 char internal_label_prefix[16];
2184 int internal_label_prefix_len;
2186 /* Fence to use after loop using movnt. */
2187 tree x86_mfence;
2189 /* Register class used for passing a given 64-bit part of the argument.
2190    These represent classes as documented by the psABI, with the exception of
2191    the SSESF and SSEDF classes, which are basically the SSE class; gcc will just
2192    use an SFmode or DFmode move instead of DImode to avoid reformatting penalties.
2194    Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2195    whenever possible (the upper half does contain padding).  */
2196 enum x86_64_reg_class
2198 X86_64_NO_CLASS,
2199 X86_64_INTEGER_CLASS,
2200 X86_64_INTEGERSI_CLASS,
2201 X86_64_SSE_CLASS,
2202 X86_64_SSESF_CLASS,
2203 X86_64_SSEDF_CLASS,
2204 X86_64_SSEUP_CLASS,
2205 X86_64_X87_CLASS,
2206 X86_64_X87UP_CLASS,
2207 X86_64_COMPLEX_X87_CLASS,
2208 X86_64_MEMORY_CLASS
2211 #define MAX_CLASSES 4
2213 /* Table of constants used by fldpi, fldln2, etc.... */
2214 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2215 static bool ext_80387_constants_init = 0;
2218 static struct machine_function * ix86_init_machine_status (void);
2219 static rtx ix86_function_value (const_tree, const_tree, bool);
2220 static bool ix86_function_value_regno_p (const unsigned int);
2221 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2222 const_tree);
2223 static rtx ix86_static_chain (const_tree, bool);
2224 static int ix86_function_regparm (const_tree, const_tree);
2225 static void ix86_compute_frame_layout (struct ix86_frame *);
2226 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2227 rtx, rtx, int);
2228 static void ix86_add_new_builtins (HOST_WIDE_INT);
2229 static tree ix86_canonical_va_list_type (tree);
2230 static void predict_jump (int);
2231 static unsigned int split_stack_prologue_scratch_regno (void);
2232 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2234 enum ix86_function_specific_strings
2236 IX86_FUNCTION_SPECIFIC_ARCH,
2237 IX86_FUNCTION_SPECIFIC_TUNE,
2238 IX86_FUNCTION_SPECIFIC_MAX
2241 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2242 const char *, enum fpmath_unit, bool);
2243 static void ix86_function_specific_save (struct cl_target_option *,
2244 struct gcc_options *opts);
2245 static void ix86_function_specific_restore (struct gcc_options *opts,
2246 struct cl_target_option *);
2247 static void ix86_function_specific_print (FILE *, int,
2248 struct cl_target_option *);
2249 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2250 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2251 struct gcc_options *,
2252 struct gcc_options *,
2253 struct gcc_options *);
2254 static bool ix86_can_inline_p (tree, tree);
2255 static void ix86_set_current_function (tree);
2256 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2258 static enum calling_abi ix86_function_abi (const_tree);
2261 #ifndef SUBTARGET32_DEFAULT_CPU
2262 #define SUBTARGET32_DEFAULT_CPU "i386"
2263 #endif
2265 /* Whether -mtune= or -march= were specified */
2266 static int ix86_tune_defaulted;
2267 static int ix86_arch_specified;
2269 /* Vectorization library interface and handlers. */
2270 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2272 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2273 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2275 /* Processor target table, indexed by processor number */
2276 struct ptt
2278 const struct processor_costs *cost; /* Processor costs */
2279 const int align_loop; /* Default alignments. */
2280 const int align_loop_max_skip;
2281 const int align_jump;
2282 const int align_jump_max_skip;
2283 const int align_func;
2286 static const struct ptt processor_target_table[PROCESSOR_max] =
2288 {&i386_cost, 4, 3, 4, 3, 4},
2289 {&i486_cost, 16, 15, 16, 15, 16},
2290 {&pentium_cost, 16, 7, 16, 7, 16},
2291 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2292 {&geode_cost, 0, 0, 0, 0, 0},
2293 {&k6_cost, 32, 7, 32, 7, 32},
2294 {&athlon_cost, 16, 7, 16, 7, 16},
2295 {&pentium4_cost, 0, 0, 0, 0, 0},
2296 {&k8_cost, 16, 7, 16, 7, 16},
2297 {&nocona_cost, 0, 0, 0, 0, 0},
2298 /* Core 2 */
2299 {&core_cost, 16, 10, 16, 10, 16},
2300 /* Core i7 */
2301 {&core_cost, 16, 10, 16, 10, 16},
2302 /* Core avx2 */
2303 {&core_cost, 16, 10, 16, 10, 16},
2304 {&generic_cost, 16, 10, 16, 10, 16},
2305 {&amdfam10_cost, 32, 24, 32, 7, 32},
2306 {&bdver1_cost, 16, 10, 16, 7, 11},
2307 {&bdver2_cost, 16, 10, 16, 7, 11},
2308 {&bdver3_cost, 16, 10, 16, 7, 11},
2309 {&btver1_cost, 16, 10, 16, 7, 11},
2310 {&btver2_cost, 16, 10, 16, 7, 11},
2311 {&atom_cost, 16, 15, 16, 7, 16},
2312 {&slm_cost, 16, 15, 16, 7, 16}
2315 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2317 "generic",
2318 "i386",
2319 "i486",
2320 "pentium",
2321 "pentium-mmx",
2322 "pentiumpro",
2323 "pentium2",
2324 "pentium3",
2325 "pentium4",
2326 "pentium-m",
2327 "prescott",
2328 "nocona",
2329 "core2",
2330 "corei7",
2331 "core-avx2",
2332 "atom",
2333 "slm",
2334 "geode",
2335 "k6",
2336 "k6-2",
2337 "k6-3",
2338 "athlon",
2339 "athlon-4",
2340 "k8",
2341 "amdfam10",
2342 "bdver1",
2343 "bdver2",
2344 "bdver3",
2345 "btver1",
2346 "btver2"
2349 static bool
2350 gate_insert_vzeroupper (void)
2352 return TARGET_AVX && !TARGET_AVX512F && TARGET_VZEROUPPER;
2355 static unsigned int
2356 rest_of_handle_insert_vzeroupper (void)
2358 int i;
2360 /* vzeroupper instructions are inserted immediately after reload to
2361    account for possible spills from 256-bit registers.  The pass
2362    reuses the mode switching infrastructure by re-running the mode
2363    insertion pass, so disable entities that have already been processed.  */
2364 for (i = 0; i < MAX_386_ENTITIES; i++)
2365 ix86_optimize_mode_switching[i] = 0;
2367 ix86_optimize_mode_switching[AVX_U128] = 1;
2369 /* Call optimize_mode_switching. */
2370 g->get_passes ()->execute_pass_mode_switching ();
2371 return 0;
2374 namespace {
2376 const pass_data pass_data_insert_vzeroupper =
2378 RTL_PASS, /* type */
2379 "vzeroupper", /* name */
2380 OPTGROUP_NONE, /* optinfo_flags */
2381 true, /* has_gate */
2382 true, /* has_execute */
2383 TV_NONE, /* tv_id */
2384 0, /* properties_required */
2385 0, /* properties_provided */
2386 0, /* properties_destroyed */
2387 0, /* todo_flags_start */
2388 ( TODO_df_finish | TODO_verify_rtl_sharing | 0 ), /* todo_flags_finish */
2391 class pass_insert_vzeroupper : public rtl_opt_pass
2393 public:
2394 pass_insert_vzeroupper(gcc::context *ctxt)
2395 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
2398 /* opt_pass methods: */
2399 bool gate () { return gate_insert_vzeroupper (); }
2400 unsigned int execute () { return rest_of_handle_insert_vzeroupper (); }
2402 }; // class pass_insert_vzeroupper
2404 } // anon namespace
2406 rtl_opt_pass *
2407 make_pass_insert_vzeroupper (gcc::context *ctxt)
2409 return new pass_insert_vzeroupper (ctxt);
2412 /* Return true if a red-zone is in use. */
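/* (The red zone is the 128-byte area below the stack pointer that the SysV
   x86-64 ABI lets leaf code use without adjusting %rsp; the Microsoft x64
   ABI defines no such area, hence the TARGET_64BIT_MS_ABI check below.)  */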
2414 static inline bool
2415 ix86_using_red_zone (void)
2417 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2420 /* Return a string that documents the current -m options. The caller is
2421 responsible for freeing the string. */
2423 static char *
2424 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2425 const char *tune, enum fpmath_unit fpmath,
2426 bool add_nl_p)
2428 struct ix86_target_opts
2430 const char *option; /* option string */
2431 HOST_WIDE_INT mask; /* isa mask options */
2434 /* This table is ordered so that options like -msse4.2 that imply
2435    preceding options will match those first.  */
2436 static struct ix86_target_opts isa_opts[] =
2438 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2439 { "-mfma", OPTION_MASK_ISA_FMA },
2440 { "-mxop", OPTION_MASK_ISA_XOP },
2441 { "-mlwp", OPTION_MASK_ISA_LWP },
2442 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
2443 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
2444 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
2445 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
2446 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2447 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2448 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2449 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2450 { "-msse3", OPTION_MASK_ISA_SSE3 },
2451 { "-msse2", OPTION_MASK_ISA_SSE2 },
2452 { "-msse", OPTION_MASK_ISA_SSE },
2453 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2454 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2455 { "-mmmx", OPTION_MASK_ISA_MMX },
2456 { "-mabm", OPTION_MASK_ISA_ABM },
2457 { "-mbmi", OPTION_MASK_ISA_BMI },
2458 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2459 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2460 { "-mhle", OPTION_MASK_ISA_HLE },
2461 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2462 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2463 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2464 { "-madx", OPTION_MASK_ISA_ADX },
2465 { "-mtbm", OPTION_MASK_ISA_TBM },
2466 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2467 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2468 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2469 { "-maes", OPTION_MASK_ISA_AES },
2470 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2471 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2472 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2473 { "-mf16c", OPTION_MASK_ISA_F16C },
2474 { "-mrtm", OPTION_MASK_ISA_RTM },
2475 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2476 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2477 { "-mmpx", OPTION_MASK_ISA_MPX },
2480 /* Flag options. */
2481 static struct ix86_target_opts flag_opts[] =
2483 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2484 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2485 { "-m80387", MASK_80387 },
2486 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2487 { "-malign-double", MASK_ALIGN_DOUBLE },
2488 { "-mcld", MASK_CLD },
2489 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2490 { "-mieee-fp", MASK_IEEE_FP },
2491 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2492 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2493 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2494 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2495 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2496 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2497 { "-mno-red-zone", MASK_NO_RED_ZONE },
2498 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2499 { "-mrecip", MASK_RECIP },
2500 { "-mrtd", MASK_RTD },
2501 { "-msseregparm", MASK_SSEREGPARM },
2502 { "-mstack-arg-probe", MASK_STACK_PROBE },
2503 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2504 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2505 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2506 { "-mvzeroupper", MASK_VZEROUPPER },
2507 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2508 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2509 { "-mprefer-avx128", MASK_PREFER_AVX128},
2512 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2514 char isa_other[40];
2515 char target_other[40];
2516 unsigned num = 0;
2517 unsigned i, j;
2518 char *ret;
2519 char *ptr;
2520 size_t len;
2521 size_t line_len;
2522 size_t sep_len;
2523 const char *abi;
2525 memset (opts, '\0', sizeof (opts));
2527 /* Add -march= option. */
2528 if (arch)
2530 opts[num][0] = "-march=";
2531 opts[num++][1] = arch;
2534 /* Add -mtune= option. */
2535 if (tune)
2537 opts[num][0] = "-mtune=";
2538 opts[num++][1] = tune;
2541 /* Add -m32/-m64/-mx32. */
2542 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2544 if ((isa & OPTION_MASK_ABI_64) != 0)
2545 abi = "-m64";
2546 else
2547 abi = "-mx32";
2548 isa &= ~ (OPTION_MASK_ISA_64BIT
2549 | OPTION_MASK_ABI_64
2550 | OPTION_MASK_ABI_X32);
2552 else
2553 abi = "-m32";
2554 opts[num++][0] = abi;
2556 /* Pick out the options in isa options. */
2557 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2559 if ((isa & isa_opts[i].mask) != 0)
2561 opts[num++][0] = isa_opts[i].option;
2562 isa &= ~ isa_opts[i].mask;
2566 if (isa && add_nl_p)
2568 opts[num++][0] = isa_other;
2569 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2570 isa);
2573 /* Add flag options. */
2574 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2576 if ((flags & flag_opts[i].mask) != 0)
2578 opts[num++][0] = flag_opts[i].option;
2579 flags &= ~ flag_opts[i].mask;
2583 if (flags && add_nl_p)
2585 opts[num++][0] = target_other;
2586 sprintf (target_other, "(other flags: %#x)", flags);
2589 /* Add -fpmath= option. */
2590 if (fpmath)
2592 opts[num][0] = "-mfpmath=";
2593 switch ((int) fpmath)
2595 case FPMATH_387:
2596 opts[num++][1] = "387";
2597 break;
2599 case FPMATH_SSE:
2600 opts[num++][1] = "sse";
2601 break;
2603 case FPMATH_387 | FPMATH_SSE:
2604 opts[num++][1] = "sse+387";
2605 break;
2607 default:
2608 gcc_unreachable ();
2612 /* Any options? */
2613 if (num == 0)
2614 return NULL;
2616 gcc_assert (num < ARRAY_SIZE (opts));
2618 /* Size the string. */
2619 len = 0;
2620 sep_len = (add_nl_p) ? 3 : 1;
2621 for (i = 0; i < num; i++)
2623 len += sep_len;
2624 for (j = 0; j < 2; j++)
2625 if (opts[i][j])
2626 len += strlen (opts[i][j]);
2629 /* Build the string. */
2630 ret = ptr = (char *) xmalloc (len);
2631 line_len = 0;
2633 for (i = 0; i < num; i++)
2635 size_t len2[2];
2637 for (j = 0; j < 2; j++)
2638 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2640 if (i != 0)
2642 *ptr++ = ' ';
2643 line_len++;
2645 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2647 *ptr++ = '\\';
2648 *ptr++ = '\n';
2649 line_len = 0;
2653 for (j = 0; j < 2; j++)
2654 if (opts[i][j])
2656 memcpy (ptr, opts[i][j], len2[j]);
2657 ptr += len2[j];
2658 line_len += len2[j];
2662 *ptr = '\0';
2663 gcc_assert (ret + len >= ptr);
2665 return ret;
2668 /* Return true if profiling code should be emitted before the
2669    prologue, false otherwise.  This is the case when -mfentry is in
2670    effect, which is what "hotfix"-style patching on x86 relies on.  */
2671 static bool
2672 ix86_profile_before_prologue (void)
2674 return flag_fentry != 0;
2677 /* Function that is callable from the debugger to print the current
2678 options. */
2679 void ATTRIBUTE_UNUSED
2680 ix86_debug_options (void)
2682 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2683 ix86_arch_string, ix86_tune_string,
2684 ix86_fpmath, true);
2686 if (opts)
2688 fprintf (stderr, "%s\n\n", opts);
2689 free (opts);
2691 else
2692 fputs ("<no options>\n\n", stderr);
2694 return;
2697 static const char *stringop_alg_names[] = {
2698 #define DEF_ENUM
2699 #define DEF_ALG(alg, name) #name,
2700 #include "stringop.def"
2701 #undef DEF_ENUM
2702 #undef DEF_ALG
2705 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
2706 The string has the following form (or a comma-separated list of such forms):
2708 strategy_alg:max_size:[align|noalign]
2710 where the full size range for the strategy is either [0, max_size] or
2711 [min_size, max_size], where min_size is one more than the max_size of the
2712 preceding range.  The last size range must have max_size == -1.
2714 Examples:
2717 -mmemcpy-strategy=libcall:-1:noalign
2719 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
2723 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
2725 This is to tell the compiler to use the following strategy for memset
2726 1) when the expected size is between [1, 16], use rep_8byte strategy;
2727 2) when the size is between [17, 2048], use vector_loop;
2728 3) when the size is > 2048, use libcall. */
2730 struct stringop_size_range
2732 int max;
2733 stringop_alg alg;
2734 bool noalign;
2737 static void
2738 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
2740 const struct stringop_algs *default_algs;
2741 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
2742 char *curr_range_str, *next_range_str;
2743 int i = 0, n = 0;
2745 if (is_memset)
2746 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
2747 else
2748 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
2750 curr_range_str = strategy_str;
2754 int maxs;
2755 stringop_alg alg;
2756 char alg_name[128];
2757 char align[16];
2758 next_range_str = strchr (curr_range_str, ',');
2759 if (next_range_str)
2760 *next_range_str++ = '\0';
2762 if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
2763 alg_name, &maxs, align))
2765 error ("wrong arg %s to option %s", curr_range_str,
2766 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2767 return;
2770 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
2772 error ("size ranges of option %s should be increasing",
2773 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2774 return;
2777 for (i = 0; i < last_alg; i++)
2779 if (!strcmp (alg_name, stringop_alg_names[i]))
2781 alg = (stringop_alg) i;
2782 break;
2786 if (i == last_alg)
2788 error ("wrong stringop strategy name %s specified for option %s",
2789 alg_name,
2790 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2791 return;
2794 input_ranges[n].max = maxs;
2795 input_ranges[n].alg = alg;
2796 if (!strcmp (align, "align"))
2797 input_ranges[n].noalign = false;
2798 else if (!strcmp (align, "noalign"))
2799 input_ranges[n].noalign = true;
2800 else
2802 error ("unknown alignment %s specified for option %s",
2803 align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2804 return;
2806 n++;
2807 curr_range_str = next_range_str;
2809 while (curr_range_str);
2811 if (input_ranges[n - 1].max != -1)
2813 error ("the max value for the last size range should be -1"
2814 " for option %s",
2815 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2816 return;
2819 if (n > MAX_STRINGOP_ALGS)
2821 error ("too many size ranges specified in option %s",
2822 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2823 return;
2826 /* Now override the default algs array. */
2827 for (i = 0; i < n; i++)
2829 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
2830 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
2831 = input_ranges[i].alg;
2832 *const_cast<int *>(&default_algs->size[i].noalign)
2833 = input_ranges[i].noalign;
2838 /* Parse the -mtune-ctrl= option.  When DUMP is true,
2839    print the features that are explicitly set.  */
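/* For example, -mtune-ctrl=use_leave,^use_bt would explicitly turn the
   "use_leave" feature on and the "use_bt" feature off (feature names here
   are illustrative; the valid names are those defined in x86-tune.def).  */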
2841 static void
2842 parse_mtune_ctrl_str (bool dump)
2844 if (!ix86_tune_ctrl_string)
2845 return;
2847 char *next_feature_string = NULL;
2848 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
2849 char *orig = curr_feature_string;
2850 int i;
2853 bool clear = false;
2855 next_feature_string = strchr (curr_feature_string, ',');
2856 if (next_feature_string)
2857 *next_feature_string++ = '\0';
2858 if (*curr_feature_string == '^')
2860 curr_feature_string++;
2861 clear = true;
2863 for (i = 0; i < X86_TUNE_LAST; i++)
2865 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
2867 ix86_tune_features[i] = !clear;
2868 if (dump)
2869 fprintf (stderr, "Explicitly %s feature %s\n",
2870 clear ? "clear" : "set", ix86_tune_feature_names[i]);
2871 break;
2874 if (i == X86_TUNE_LAST)
2875 error ("Unknown parameter to option -mtune-ctrl: %s",
2876 clear ? curr_feature_string - 1 : curr_feature_string);
2877 curr_feature_string = next_feature_string;
2879 while (curr_feature_string);
2880 free (orig);
2883 /* Helper function to set ix86_tune_features. IX86_TUNE is the
2884 processor type. */
2886 static void
2887 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
2889 unsigned int ix86_tune_mask = 1u << ix86_tune;
2890 int i;
2892 for (i = 0; i < X86_TUNE_LAST; ++i)
2894 if (ix86_tune_no_default)
2895 ix86_tune_features[i] = 0;
2896 else
2897 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
2900 if (dump)
2902 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
2903 for (i = 0; i < X86_TUNE_LAST; i++)
2904 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
2905 ix86_tune_features[i] ? "on" : "off");
2908 parse_mtune_ctrl_str (dump);
2912 /* Override various settings based on options. If MAIN_ARGS_P, the
2913 options are from the command line, otherwise they are from
2914 attributes. */
2916 static void
2917 ix86_option_override_internal (bool main_args_p,
2918 struct gcc_options *opts,
2919 struct gcc_options *opts_set)
2921 int i;
2922 unsigned int ix86_arch_mask;
2923 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
2924 const char *prefix;
2925 const char *suffix;
2926 const char *sw;
2928 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2929 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2930 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2931 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2932 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2933 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2934 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2935 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2936 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2937 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2938 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2939 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2940 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2941 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2942 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2943 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2944 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2945 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2946 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2947 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2948 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2949 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2950 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2951 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2952 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2953 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2954 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2955 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2956 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2957 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2958 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2959 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2960 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
2961 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
2962 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
2963 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
2964 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
2965 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
2966 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
2967 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
2968 #define PTA_AVX512F (HOST_WIDE_INT_1 << 40)
2969 #define PTA_AVX512ER (HOST_WIDE_INT_1 << 41)
2970 #define PTA_AVX512PF (HOST_WIDE_INT_1 << 42)
2971 #define PTA_AVX512CD (HOST_WIDE_INT_1 << 43)
2972 #define PTA_MPX (HOST_WIDE_INT_1 << 44)
2974 /* If this reaches 64, we need to widen the struct pta flags field below.  */
2976 static struct pta
2978 const char *const name; /* processor name or nickname. */
2979 const enum processor_type processor;
2980 const enum attr_cpu schedule;
2981 const unsigned HOST_WIDE_INT flags;
2983 const processor_alias_table[] =
2985 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2986 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2987 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2988 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2989 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2990 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2991 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
2992 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
2993 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2994 PTA_MMX | PTA_SSE | PTA_FXSR},
2995 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2996 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2997 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
2998 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2999 PTA_MMX | PTA_SSE | PTA_FXSR},
3000 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3001 PTA_MMX | PTA_SSE | PTA_FXSR},
3002 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3003 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3004 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
3005 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
3006 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
3007 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3008 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
3009 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3010 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
3011 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3012 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
3013 {"core2", PROCESSOR_CORE2, CPU_CORE2,
3014 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3015 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR},
3016 {"corei7", PROCESSOR_COREI7, CPU_COREI7,
3017 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3
3018 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_POPCNT | PTA_FXSR},
3019 {"corei7-avx", PROCESSOR_COREI7, CPU_COREI7,
3020 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3021 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3022 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL
3023 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3024 {"core-avx-i", PROCESSOR_COREI7, CPU_COREI7,
3025 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3026 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3027 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3028 | PTA_RDRND | PTA_F16C | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3029 {"core-avx2", PROCESSOR_HASWELL, CPU_COREI7,
3030 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3031 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
3032 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3033 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
3034 | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE | PTA_FXSR | PTA_XSAVE
3035 | PTA_XSAVEOPT},
3036 {"atom", PROCESSOR_ATOM, CPU_ATOM,
3037 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3038 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE | PTA_FXSR},
3039 {"slm", PROCESSOR_SLM, CPU_SLM,
3040 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3041 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_MOVBE
3042 | PTA_FXSR},
3043 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3044 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3045 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3046 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3047 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3048 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3049 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3050 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3051 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3052 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3053 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3054 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3055 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3056 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3057 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3058 {"x86-64", PROCESSOR_K8, CPU_K8,
3059 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3060 {"k8", PROCESSOR_K8, CPU_K8,
3061 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3062 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3063 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3064 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3065 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3066 {"opteron", PROCESSOR_K8, CPU_K8,
3067 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3068 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3069 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3070 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3071 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3072 {"athlon64", PROCESSOR_K8, CPU_K8,
3073 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3074 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3075 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3076 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3077 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3078 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3079 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3080 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3081 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3082 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3083 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3084 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3085 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3086 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3087 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3088 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3089 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3090 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3091 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3092 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3093 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3094 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3095 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3096 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3097 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3098 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
3099 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3100 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3101 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3102 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3103 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
3104 | PTA_XSAVEOPT | PTA_FSGSBASE},
3105 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
3106 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3107 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
3108 | PTA_FXSR | PTA_XSAVE},
3109 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
3110 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3111 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
3112 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3113 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
3114 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3116 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
3117 PTA_64BIT
3118 | PTA_HLE /* flags are only used for -march switch. */ },
3121 /* -mrecip options. */
3122 static struct
3124 const char *string; /* option name */
3125 unsigned int mask; /* mask bits to set */
3127 const recip_options[] =
3129 { "all", RECIP_MASK_ALL },
3130 { "none", RECIP_MASK_NONE },
3131 { "div", RECIP_MASK_DIV },
3132 { "sqrt", RECIP_MASK_SQRT },
3133 { "vec-div", RECIP_MASK_VEC_DIV },
3134 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3137 int const pta_size = ARRAY_SIZE (processor_alias_table);
3139 /* Set up prefix/suffix so the error messages refer to either the command
3140 line argument, or the attribute(target). */
3141 if (main_args_p)
3143 prefix = "-m";
3144 suffix = "";
3145 sw = "switch";
3147 else
3149 prefix = "option(\"";
3150 suffix = "\")";
3151 sw = "attribute";
3154 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3155 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3156 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3157 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3158 #ifdef TARGET_BI_ARCH
3159 else
3161 #if TARGET_BI_ARCH == 1
3162 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3163 is on and OPTION_MASK_ABI_X32 is off. We turn off
3164 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3165 -mx32. */
3166 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3167 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3168 #else
3169 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3170 on and OPTION_MASK_ABI_64 is off. We turn off
3171 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3172 -m64. */
3173 if (TARGET_LP64_P (opts->x_ix86_isa_flags))
3174 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3175 #endif
3177 #endif
3179 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3181 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3182 OPTION_MASK_ABI_64 for TARGET_X32. */
3183 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3184 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3186 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
3188 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3189 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3190 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3191 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
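/* Illustrative annotation (not in the original source): the fix-ups above
   keep OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 mutually exclusive.
   -mx32 clears ABI_64 and forces OPTION_MASK_ISA_64BIT on, -m64 (LP64)
   clears ABI_X32 and forces OPTION_MASK_ISA_64BIT on, and a 32-bit
   compile on a 64-bit-default configuration clears both ABI bits.  */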
3194 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3195 SUBTARGET_OVERRIDE_OPTIONS;
3196 #endif
3198 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3199 SUBSUBTARGET_OVERRIDE_OPTIONS;
3200 #endif
3202 /* -fPIC is the default for x86_64. */
3203 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
3204 opts->x_flag_pic = 2;
3206 /* Need to check -mtune=generic first. */
3207 if (opts->x_ix86_tune_string)
3209 if (!strcmp (opts->x_ix86_tune_string, "generic")
3210 || !strcmp (opts->x_ix86_tune_string, "i686")
3211 /* As special support for cross compilers we read -mtune=native
3212 as -mtune=generic. With native compilers we won't see the
3213 -mtune=native, as it was changed by the driver. */
3214 || !strcmp (opts->x_ix86_tune_string, "native"))
3216 opts->x_ix86_tune_string = "generic";
3218 /* If this call is for setting the option attribute, allow the
3219 generic that was previously set. */
3220 else if (!main_args_p
3221 && !strcmp (opts->x_ix86_tune_string, "generic"))
3223 else if (!strncmp (opts->x_ix86_tune_string, "generic", 7))
3224 error ("bad value (%s) for %stune=%s %s",
3225 opts->x_ix86_tune_string, prefix, suffix, sw);
3226 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3227 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3228 "%stune=k8%s or %stune=generic%s instead as appropriate",
3229 prefix, suffix, prefix, suffix, prefix, suffix);
3231 else
3233 if (opts->x_ix86_arch_string)
3234 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
3235 if (!opts->x_ix86_tune_string)
3237 opts->x_ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3238 ix86_tune_defaulted = 1;
3241 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
3242 or defaulted. We need to use a sensible tune option. */
3243 if (!strcmp (opts->x_ix86_tune_string, "generic")
3244 || !strcmp (opts->x_ix86_tune_string, "x86-64")
3245 || !strcmp (opts->x_ix86_tune_string, "i686"))
3247 opts->x_ix86_tune_string = "generic";
3251 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
3252 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3254 /* rep; movq isn't available in 32-bit code. */
3255 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3256 opts->x_ix86_stringop_alg = no_stringop;
3259 if (!opts->x_ix86_arch_string)
3260 opts->x_ix86_arch_string
3261 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
3262 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3263 else
3264 ix86_arch_specified = 1;
3266 if (opts_set->x_ix86_pmode)
3268 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
3269 && opts->x_ix86_pmode == PMODE_SI)
3270 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
3271 && opts->x_ix86_pmode == PMODE_DI))
3272 error ("address mode %qs not supported in the %s bit mode",
3273 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
3274 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
3276 else
3277 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
3278 ? PMODE_DI : PMODE_SI;
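/* Illustrative annotation (not in the original source): without an
   explicit -maddress-mode= option, LP64 defaults to Pmode == DImode
   (64-bit addresses) while x32 and 32-bit code default to SImode; the
   checks above reject -maddress-mode=short for LP64 and
   -maddress-mode=long for 32-bit code.  */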
3280 if (!opts_set->x_ix86_abi)
3281 opts->x_ix86_abi = DEFAULT_ABI;
3283 /* For targets using the MS ABI, enable MS extensions if they were
3284 not explicitly turned off. For non-MS ABI targets, turn this
3285 option off. */
3286 if (!opts_set->x_flag_ms_extensions)
3287 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
3289 if (opts_set->x_ix86_cmodel)
3291 switch (opts->x_ix86_cmodel)
3293 case CM_SMALL:
3294 case CM_SMALL_PIC:
3295 if (opts->x_flag_pic)
3296 opts->x_ix86_cmodel = CM_SMALL_PIC;
3297 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3298 error ("code model %qs not supported in the %s bit mode",
3299 "small", "32");
3300 break;
3302 case CM_MEDIUM:
3303 case CM_MEDIUM_PIC:
3304 if (opts->x_flag_pic)
3305 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
3306 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3307 error ("code model %qs not supported in the %s bit mode",
3308 "medium", "32");
3309 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3310 error ("code model %qs not supported in x32 mode",
3311 "medium");
3312 break;
3314 case CM_LARGE:
3315 case CM_LARGE_PIC:
3316 if (opts->x_flag_pic)
3317 opts->x_ix86_cmodel = CM_LARGE_PIC;
3318 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3319 error ("code model %qs not supported in the %s bit mode",
3320 "large", "32");
3321 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3322 error ("code model %qs not supported in x32 mode",
3323 "large");
3324 break;
3326 case CM_32:
3327 if (opts->x_flag_pic)
3328 error ("code model %s does not support PIC mode", "32");
3329 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3330 error ("code model %qs not supported in the %s bit mode",
3331 "32", "64");
3332 break;
3334 case CM_KERNEL:
3335 if (opts->x_flag_pic)
3337 error ("code model %s does not support PIC mode", "kernel");
3338 opts->x_ix86_cmodel = CM_32;
3340 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3341 error ("code model %qs not supported in the %s bit mode",
3342 "kernel", "32");
3343 break;
3345 default:
3346 gcc_unreachable ();
3349 else
3351 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3352 use of rip-relative addressing. This eliminates fixups that
3353 would otherwise be needed if this object is to be placed in a
3354 DLL, and is essentially just as efficient as direct addressing. */
3355 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3356 && (TARGET_RDOS || TARGET_PECOFF))
3357 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
3358 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3359 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
3360 else
3361 opts->x_ix86_cmodel = CM_32;
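/* Illustrative summary of the defaulting above (annotation, not in the
   original source):
     64-bit RDOS/PE-COFF targets -> CM_MEDIUM_PIC with -fpic forced on
     other 64-bit, with -fpic    -> CM_SMALL_PIC
     other 64-bit, no -fpic      -> CM_SMALL
     32-bit                      -> CM_32  */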
3363 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
3365 error ("-masm=intel not supported in this configuration");
3366 opts->x_ix86_asm_dialect = ASM_ATT;
3368 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
3369 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3370 sorry ("%i-bit mode not compiled in",
3371 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3373 for (i = 0; i < pta_size; i++)
3374 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
3376 ix86_schedule = processor_alias_table[i].schedule;
3377 ix86_arch = processor_alias_table[i].processor;
3378 /* Default cpu tuning to the architecture. */
3379 ix86_tune = ix86_arch;
3381 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3382 && !(processor_alias_table[i].flags & PTA_64BIT))
3383 error ("CPU you selected does not support x86-64 "
3384 "instruction set");
3386 if (processor_alias_table[i].flags & PTA_MMX
3387 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3388 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3389 if (processor_alias_table[i].flags & PTA_3DNOW
3390 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3391 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3392 if (processor_alias_table[i].flags & PTA_3DNOW_A
3393 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3394 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3395 if (processor_alias_table[i].flags & PTA_SSE
3396 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3397 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3398 if (processor_alias_table[i].flags & PTA_SSE2
3399 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3400 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3401 if (processor_alias_table[i].flags & PTA_SSE3
3402 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3403 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3404 if (processor_alias_table[i].flags & PTA_SSSE3
3405 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3406 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3407 if (processor_alias_table[i].flags & PTA_SSE4_1
3408 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3409 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3410 if (processor_alias_table[i].flags & PTA_SSE4_2
3411 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3412 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3413 if (processor_alias_table[i].flags & PTA_AVX
3414 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3415 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3416 if (processor_alias_table[i].flags & PTA_AVX2
3417 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3418 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3419 if (processor_alias_table[i].flags & PTA_FMA
3420 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3421 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3422 if (processor_alias_table[i].flags & PTA_SSE4A
3423 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3424 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3425 if (processor_alias_table[i].flags & PTA_FMA4
3426 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3427 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3428 if (processor_alias_table[i].flags & PTA_XOP
3429 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3430 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3431 if (processor_alias_table[i].flags & PTA_LWP
3432 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3433 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3434 if (processor_alias_table[i].flags & PTA_ABM
3435 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3436 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3437 if (processor_alias_table[i].flags & PTA_BMI
3438 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3439 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3440 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3441 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3442 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3443 if (processor_alias_table[i].flags & PTA_TBM
3444 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3445 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3446 if (processor_alias_table[i].flags & PTA_BMI2
3447 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3448 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3449 if (processor_alias_table[i].flags & PTA_CX16
3450 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3451 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3452 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3453 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3454 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3455 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
3456 && (processor_alias_table[i].flags & PTA_NO_SAHF))
3457 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3458 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3459 if (processor_alias_table[i].flags & PTA_MOVBE
3460 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3461 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3462 if (processor_alias_table[i].flags & PTA_AES
3463 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3464 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AES;
3465 if (processor_alias_table[i].flags & PTA_PCLMUL
3466 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3467 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3468 if (processor_alias_table[i].flags & PTA_FSGSBASE
3469 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3470 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3471 if (processor_alias_table[i].flags & PTA_RDRND
3472 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3473 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3474 if (processor_alias_table[i].flags & PTA_F16C
3475 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3476 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3477 if (processor_alias_table[i].flags & PTA_RTM
3478 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3479 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3480 if (processor_alias_table[i].flags & PTA_HLE
3481 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
3482 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_HLE;
3483 if (processor_alias_table[i].flags & PTA_PRFCHW
3484 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
3485 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
3486 if (processor_alias_table[i].flags & PTA_RDSEED
3487 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
3488 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
3489 if (processor_alias_table[i].flags & PTA_ADX
3490 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
3491 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
3492 if (processor_alias_table[i].flags & PTA_FXSR
3493 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
3494 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
3495 if (processor_alias_table[i].flags & PTA_XSAVE
3496 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
3497 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
3498 if (processor_alias_table[i].flags & PTA_XSAVEOPT
3499 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
3500 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
3501 if (processor_alias_table[i].flags & PTA_AVX512F
3502 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
3503 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
3504 if (processor_alias_table[i].flags & PTA_AVX512ER
3505 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
3506 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
3507 if (processor_alias_table[i].flags & PTA_AVX512PF
3508 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
3509 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
3510 if (processor_alias_table[i].flags & PTA_AVX512CD
3511 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
3512 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
3513 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3514 x86_prefetch_sse = true;
3516 break;
3519 if (!strcmp (opts->x_ix86_arch_string, "generic"))
3520 error ("generic CPU can be used only for %stune=%s %s",
3521 prefix, suffix, sw);
3522 else if (!strncmp (opts->x_ix86_arch_string, "generic", 7) || i == pta_size)
3523 error ("bad value (%s) for %sarch=%s %s",
3524 opts->x_ix86_arch_string, prefix, suffix, sw);
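/* Worked example (annotation, not in the original source): -march=core2
   matches the table entry carrying PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
   | PTA_SSSE3 | PTA_CX16 | PTA_FXSR, so the loop above turns on each
   corresponding OPTION_MASK_ISA_* bit unless the user explicitly requested
   otherwise (e.g. with -mno-ssse3).  PTA_64BIT has no ISA bit of its own;
   it only gates the "does not support x86-64" error.  */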
3526 ix86_arch_mask = 1u << ix86_arch;
3527 for (i = 0; i < X86_ARCH_LAST; ++i)
3528 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3530 for (i = 0; i < pta_size; i++)
3531 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
3533 ix86_schedule = processor_alias_table[i].schedule;
3534 ix86_tune = processor_alias_table[i].processor;
3535 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3537 if (!(processor_alias_table[i].flags & PTA_64BIT))
3539 if (ix86_tune_defaulted)
3541 opts->x_ix86_tune_string = "x86-64";
3542 for (i = 0; i < pta_size; i++)
3543 if (! strcmp (opts->x_ix86_tune_string,
3544 processor_alias_table[i].name))
3545 break;
3546 ix86_schedule = processor_alias_table[i].schedule;
3547 ix86_tune = processor_alias_table[i].processor;
3549 else
3550 error ("CPU you selected does not support x86-64 "
3551 "instruction set");
3554 /* Intel CPUs have always interpreted SSE prefetch instructions as
3555 NOPs; so, we can enable SSE prefetch instructions even when
3556 -mtune (rather than -march) points us to a processor that has them.
3557 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3558 higher processors. */
3559 if (TARGET_CMOV
3560 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3561 x86_prefetch_sse = true;
3562 break;
3565 if (ix86_tune_specified && i == pta_size)
3566 error ("bad value (%s) for %stune=%s %s",
3567 opts->x_ix86_tune_string, prefix, suffix, sw);
3569 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
3571 #ifndef USE_IX86_FRAME_POINTER
3572 #define USE_IX86_FRAME_POINTER 0
3573 #endif
3575 #ifndef USE_X86_64_FRAME_POINTER
3576 #define USE_X86_64_FRAME_POINTER 0
3577 #endif
3579 /* Set the default values for switches whose default depends on TARGET_64BIT
3580 in case they weren't overwritten by command line options. */
3581 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3583 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
3584 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3585 if (opts->x_flag_asynchronous_unwind_tables == 2)
3586 opts->x_flag_unwind_tables
3587 = opts->x_flag_asynchronous_unwind_tables = 1;
3588 if (opts->x_flag_pcc_struct_return == 2)
3589 opts->x_flag_pcc_struct_return = 0;
3591 else
3593 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
3594 opts->x_flag_omit_frame_pointer
3595 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
3596 if (opts->x_flag_asynchronous_unwind_tables == 2)
3597 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3598 if (opts->x_flag_pcc_struct_return == 2)
3599 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3602 ix86_tune_cost = processor_target_table[ix86_tune].cost;
3603 if (opts->x_optimize_size)
3604 ix86_cost = &ix86_size_cost;
3605 else
3606 ix86_cost = ix86_tune_cost;
3608 /* Arrange to set up i386_stack_locals for all functions. */
3609 init_machine_status = ix86_init_machine_status;
3611 /* Validate -mregparm= value. */
3612 if (opts_set->x_ix86_regparm)
3614 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3615 warning (0, "-mregparm is ignored in 64-bit mode");
3616 if (opts->x_ix86_regparm > REGPARM_MAX)
3618 error ("-mregparm=%d is not between 0 and %d",
3619 opts->x_ix86_regparm, REGPARM_MAX);
3620 opts->x_ix86_regparm = 0;
3623 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3624 opts->x_ix86_regparm = REGPARM_MAX;
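/* Illustrative example (annotation, not in the original source): for
   32-bit code,
     __attribute__((regparm(3))) int f (int a, int b, int c);
   passes a, b and c in EAX, EDX and ECX, and -mregparm=3 applies the same
   convention globally.  In 64-bit mode the option is ignored (with the
   warning above) because the ABI already passes arguments in registers.  */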
3626 /* Default align_* from the processor table. */
3627 if (opts->x_align_loops == 0)
3629 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
3630 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3632 if (opts->x_align_jumps == 0)
3634 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
3635 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3637 if (opts->x_align_functions == 0)
3639 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
3642 /* Provide default for -mbranch-cost= value. */
3643 if (!opts_set->x_ix86_branch_cost)
3644 opts->x_ix86_branch_cost = ix86_cost->branch_cost;
3646 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3648 opts->x_target_flags
3649 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
3651 /* Enable by default the SSE and MMX builtins. Do allow the user to
3652 explicitly disable any of these. In particular, disabling SSE and
3653 MMX for kernel code is extremely useful. */
3654 if (!ix86_arch_specified)
3655 opts->x_ix86_isa_flags
3656 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3657 | TARGET_SUBTARGET64_ISA_DEFAULT)
3658 & ~opts->x_ix86_isa_flags_explicit);
3660 if (TARGET_RTD_P (opts->x_target_flags))
3661 warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3663 else
3665 opts->x_target_flags
3666 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
3668 if (!ix86_arch_specified)
3669 opts->x_ix86_isa_flags
3670 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
3672 /* The i386 ABI does not specify a red zone. It still makes sense to use
3673 one when the programmer takes care to keep the stack from being destroyed. */
3674 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
3675 opts->x_target_flags |= MASK_NO_RED_ZONE;
3678 /* Keep nonleaf frame pointers. */
3679 if (opts->x_flag_omit_frame_pointer)
3680 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3681 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
3682 opts->x_flag_omit_frame_pointer = 1;
3684 /* If we're doing fast math, we don't care about comparison order
3685 wrt NaNs. This lets us use a shorter comparison sequence. */
3686 if (opts->x_flag_finite_math_only)
3687 opts->x_target_flags &= ~MASK_IEEE_FP;
3689 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3690 since the insns won't need emulation. */
3691 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
3692 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
3694 /* Likewise, if the target doesn't have a 387, or we've specified
3695 software floating point, don't use 387 inline intrinsics. */
3696 if (!TARGET_80387_P (opts->x_target_flags))
3697 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
3699 /* Turn on MMX builtins for -msse. */
3700 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
3701 opts->x_ix86_isa_flags
3702 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
3704 /* Enable SSE prefetch. */
3705 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
3706 || (TARGET_PRFCHW && !TARGET_3DNOW_P (opts->x_ix86_isa_flags)))
3707 x86_prefetch_sse = true;
3709 /* Enable prefetch{,w} instructions for -m3dnow. */
3710 if (TARGET_3DNOW_P (opts->x_ix86_isa_flags))
3711 opts->x_ix86_isa_flags
3712 |= OPTION_MASK_ISA_PRFCHW & ~opts->x_ix86_isa_flags_explicit;
3714 /* Enable popcnt instruction for -msse4.2 or -mabm. */
3715 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
3716 || TARGET_ABM_P (opts->x_ix86_isa_flags))
3717 opts->x_ix86_isa_flags
3718 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
3720 /* Enable lzcnt instruction for -mabm. */
3721 if (TARGET_ABM_P(opts->x_ix86_isa_flags))
3722 opts->x_ix86_isa_flags
3723 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
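/* Illustrative annotation (not in the original source): as a consequence
   of the two blocks above, -mabm alone enables both popcnt and lzcnt, and
   -msse4.2 enables popcnt, unless the user passed an explicit
   -mno-popcnt / -mno-lzcnt.  */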
3725 /* Validate -mpreferred-stack-boundary= value or default it to
3726 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3727 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3728 if (opts_set->x_ix86_preferred_stack_boundary_arg)
3730 int min = (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3731 ? (TARGET_SSE_P (opts->x_ix86_isa_flags) ? 4 : 3) : 2);
3732 int max = (TARGET_SEH ? 4 : 12);
3734 if (opts->x_ix86_preferred_stack_boundary_arg < min
3735 || opts->x_ix86_preferred_stack_boundary_arg > max)
3737 if (min == max)
3738 error ("-mpreferred-stack-boundary is not supported "
3739 "for this target");
3740 else
3741 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3742 opts->x_ix86_preferred_stack_boundary_arg, min, max);
3744 else
3745 ix86_preferred_stack_boundary
3746 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
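/* Worked example (annotation, not in the original source):
   -mpreferred-stack-boundary=4 yields (1 << 4) * BITS_PER_UNIT
   = 16 * 8 = 128 bits, i.e. a 16-byte-aligned stack; on 64-bit targets
   with SSE the minimum accepted value is 4 and the maximum is 12
   (4 when TARGET_SEH).  */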
3749 /* Set the default value for -mstackrealign. */
3750 if (opts->x_ix86_force_align_arg_pointer == -1)
3751 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3753 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3755 /* Validate -mincoming-stack-boundary= value or default it to
3756 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3757 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3758 if (opts_set->x_ix86_incoming_stack_boundary_arg)
3760 if (ix86_incoming_stack_boundary_arg
3761 < (TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 4 : 2)
3762 || ix86_incoming_stack_boundary_arg > 12)
3763 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3764 ix86_incoming_stack_boundary_arg,
3765 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 4 : 2);
3766 else
3768 ix86_user_incoming_stack_boundary
3769 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3770 ix86_incoming_stack_boundary
3771 = ix86_user_incoming_stack_boundary;
3775 /* Accept -msseregparm only if at least SSE support is enabled. */
3776 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
3777 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
3778 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3780 if (opts_set->x_ix86_fpmath)
3782 if (opts->x_ix86_fpmath & FPMATH_SSE)
3784 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
3786 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3787 opts->x_ix86_fpmath = FPMATH_387;
3789 else if ((opts->x_ix86_fpmath & FPMATH_387)
3790 && !TARGET_80387_P (opts->x_target_flags))
3792 warning (0, "387 instruction set disabled, using SSE arithmetics");
3793 opts->x_ix86_fpmath = FPMATH_SSE;
3797 /* For all chips supporting SSE2, -mfpmath=sse performs better than
3798 -mfpmath=387. The latter is nevertheless the default on many targets,
3799 since the extra 80-bit precision of temporaries is considered part of
3800 the ABI. Override the default at least for -ffast-math.
3801 TODO: -mfpmath=both seems to produce equally performing code with
3802 slightly smaller binaries. It is, however, not clear whether register
3803 allocation is ready for this setting.
3804 Also, -mfpmath=387 codegen is overall a lot more compact (about 4-5%)
3805 than SSE codegen. We may switch to 387 with -ffast-math for size-optimized
3806 functions. */
3807 else if (fast_math_flags_set_p (&global_options)
3808 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
3809 opts->x_ix86_fpmath = FPMATH_SSE;
3810 else
3811 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
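/* Illustrative annotation (not in the original source): with -ffast-math
   and SSE2 available, SSE scalar arithmetic is chosen even where 387 would
   otherwise be the default; TARGET_FPMATH_DEFAULT_P is assumed to pick
   FPMATH_SSE on 64-bit SSE targets and FPMATH_387 elsewhere.  */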
3813 /* If the i387 is disabled, then do not return values in it. */
3814 if (!TARGET_80387_P (opts->x_target_flags))
3815 opts->x_target_flags &= ~MASK_FLOAT_RETURNS;
3817 /* Use external vectorized library in vectorizing intrinsics. */
3818 if (opts_set->x_ix86_veclibabi_type)
3819 switch (opts->x_ix86_veclibabi_type)
3821 case ix86_veclibabi_type_svml:
3822 ix86_veclib_handler = ix86_veclibabi_svml;
3823 break;
3825 case ix86_veclibabi_type_acml:
3826 ix86_veclib_handler = ix86_veclibabi_acml;
3827 break;
3829 default:
3830 gcc_unreachable ();
3833 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
3834 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
3835 && !opts->x_optimize_size)
3836 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3838 /* If stack probes are required, the space used for large function
3839 arguments on the stack must also be probed, so enable
3840 -maccumulate-outgoing-args so this happens in the prologue. */
3841 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
3842 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3844 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
3845 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3846 "for correctness", prefix, suffix);
3847 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3850 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3852 char *p;
3853 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3854 p = strchr (internal_label_prefix, 'X');
3855 internal_label_prefix_len = p - internal_label_prefix;
3856 *p = '\0';
3859 /* When scheduling description is not available, disable scheduler pass
3860 so it won't slow down the compilation and make x87 code slower. */
3861 if (!TARGET_SCHEDULE)
3862 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
3864 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3865 ix86_tune_cost->simultaneous_prefetches,
3866 opts->x_param_values,
3867 opts_set->x_param_values);
3868 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
3869 ix86_tune_cost->prefetch_block,
3870 opts->x_param_values,
3871 opts_set->x_param_values);
3872 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
3873 ix86_tune_cost->l1_cache_size,
3874 opts->x_param_values,
3875 opts_set->x_param_values);
3876 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
3877 ix86_tune_cost->l2_cache_size,
3878 opts->x_param_values,
3879 opts_set->x_param_values);
3881 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
3882 if (opts->x_flag_prefetch_loop_arrays < 0
3883 && HAVE_prefetch
3884 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
3885 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3886 opts->x_flag_prefetch_loop_arrays = 1;
3888 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3889 can be optimized to ap = __builtin_next_arg (0). */
3890 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
3891 targetm.expand_builtin_va_start = NULL;
3893 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3895 ix86_gen_leave = gen_leave_rex64;
3896 if (Pmode == DImode)
3898 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
3899 ix86_gen_tls_local_dynamic_base_64
3900 = gen_tls_local_dynamic_base_64_di;
3902 else
3904 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
3905 ix86_gen_tls_local_dynamic_base_64
3906 = gen_tls_local_dynamic_base_64_si;
3909 else
3910 ix86_gen_leave = gen_leave;
3912 if (Pmode == DImode)
3914 ix86_gen_add3 = gen_adddi3;
3915 ix86_gen_sub3 = gen_subdi3;
3916 ix86_gen_sub3_carry = gen_subdi3_carry;
3917 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3918 ix86_gen_andsp = gen_anddi3;
3919 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3920 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3921 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3922 ix86_gen_monitor = gen_sse3_monitor_di;
3924 else
3926 ix86_gen_add3 = gen_addsi3;
3927 ix86_gen_sub3 = gen_subsi3;
3928 ix86_gen_sub3_carry = gen_subsi3_carry;
3929 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3930 ix86_gen_andsp = gen_andsi3;
3931 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3932 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3933 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3934 ix86_gen_monitor = gen_sse3_monitor_si;
3937 #ifdef USE_IX86_CLD
3938 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3939 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3940 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
3941 #endif
3943 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic)
3945 if (opts->x_flag_fentry > 0)
3946 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3947 "with -fpic");
3948 opts->x_flag_fentry = 0;
3950 else if (TARGET_SEH)
3952 if (opts->x_flag_fentry == 0)
3953 sorry ("-mno-fentry isn%'t compatible with SEH");
3954 opts->x_flag_fentry = 1;
3956 else if (opts->x_flag_fentry < 0)
3958 #if defined(PROFILE_BEFORE_PROLOGUE)
3959 opts->x_flag_fentry = 1;
3960 #else
3961 opts->x_flag_fentry = 0;
3962 #endif
3965 /* When not optimizing for size, enable the vzeroupper optimization for
3966 TARGET_AVX with -fexpensive-optimizations and split 32-byte
3967 AVX unaligned loads/stores. */
3968 if (!opts->x_optimize_size)
3970 if (flag_expensive_optimizations
3971 && !(opts_set->x_target_flags & MASK_VZEROUPPER))
3972 opts->x_target_flags |= MASK_VZEROUPPER;
3973 if (!ix86_tune_features[X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL]
3974 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3975 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3976 if (!ix86_tune_features[X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL]
3977 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3978 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3979 /* Enable 128-bit AVX instruction generation
3980 for the auto-vectorizer. */
3981 if (TARGET_AVX128_OPTIMAL
3982 && !(opts_set->x_target_flags & MASK_PREFER_AVX128))
3983 opts->x_target_flags |= MASK_PREFER_AVX128;
3986 if (opts->x_ix86_recip_name)
3988 char *p = ASTRDUP (opts->x_ix86_recip_name);
3989 char *q;
3990 unsigned int mask, i;
3991 bool invert;
3993 while ((q = strtok (p, ",")) != NULL)
3995 p = NULL;
3996 if (*q == '!')
3998 invert = true;
3999 q++;
4001 else
4002 invert = false;
4004 if (!strcmp (q, "default"))
4005 mask = RECIP_MASK_ALL;
4006 else
4008 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
4009 if (!strcmp (q, recip_options[i].string))
4011 mask = recip_options[i].mask;
4012 break;
4015 if (i == ARRAY_SIZE (recip_options))
4017 error ("unknown option for -mrecip=%s", q);
4018 invert = false;
4019 mask = RECIP_MASK_NONE;
4023 opts->x_recip_mask_explicit |= mask;
4024 if (invert)
4025 opts->x_recip_mask &= ~mask;
4026 else
4027 opts->x_recip_mask |= mask;
4031 if (TARGET_RECIP_P (opts->x_target_flags))
4032 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
4033 else if (opts_set->x_target_flags & MASK_RECIP)
4034 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
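/* Worked example (annotation, not in the original source): -mrecip=all,!sqrt
   is tokenized by the strtok loop above: "all" ORs RECIP_MASK_ALL into
   x_recip_mask, then "!sqrt" clears RECIP_MASK_SQRT again, leaving every
   reciprocal approximation except the scalar square root enabled; both
   tokens are also recorded in x_recip_mask_explicit.  */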
4036 /* Default long double to 64-bit for Bionic. */
4037 if (TARGET_HAS_BIONIC
4038 && !(opts_set->x_target_flags & MASK_LONG_DOUBLE_64))
4039 opts->x_target_flags |= MASK_LONG_DOUBLE_64;
4041 /* Save the initial options in case the user uses function-specific
4042 options. */
4043 if (main_args_p)
4044 target_option_default_node = target_option_current_node
4045 = build_target_option_node (opts);
4047 /* Handle stack protector */
4048 if (!opts_set->x_ix86_stack_protector_guard)
4049 opts->x_ix86_stack_protector_guard
4050 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
4052 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
4053 if (opts->x_ix86_tune_memcpy_strategy)
4055 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
4056 ix86_parse_stringop_strategy_string (str, false);
4057 free (str);
4060 if (opts->x_ix86_tune_memset_strategy)
4062 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
4063 ix86_parse_stringop_strategy_string (str, true);
4064 free (str);
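/* Illustrative annotation (not in the original source): the strategy
   strings are assumed to follow the alg:max_size:dest_align triplet syntax
   of -mmemcpy-strategy=/-mmemset-strategy=, e.g. something like
   -mmemcpy-strategy=unrolled_loop:256:noalign,libcall:-1:noalign;
   ix86_parse_stringop_strategy_string does the actual validation.  */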
4068 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4070 static void
4071 ix86_option_override (void)
4073 opt_pass *pass_insert_vzeroupper = make_pass_insert_vzeroupper (g);
4074 static struct register_pass_info insert_vzeroupper_info
4075 = { pass_insert_vzeroupper, "reload",
4076 1, PASS_POS_INSERT_AFTER
4079 ix86_option_override_internal (true, &global_options, &global_options_set);
4082 /* This needs to be done at start up. It's convenient to do it here. */
4083 register_pass (&insert_vzeroupper_info);
4086 /* Update register usage after having seen the compiler flags. */
4088 static void
4089 ix86_conditional_register_usage (void)
4091 int i, c_mask;
4092 unsigned int j;
4094 /* The PIC register, if it exists, is fixed. */
4095 j = PIC_OFFSET_TABLE_REGNUM;
4096 if (j != INVALID_REGNUM)
4097 fixed_regs[j] = call_used_regs[j] = 1;
4099 /* For 32-bit targets, squash the REX registers. */
4100 if (! TARGET_64BIT)
4102 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4103 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4104 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4105 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4106 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4107 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4110 /* See the definition of CALL_USED_REGISTERS in i386.h. */
4111 c_mask = (TARGET_64BIT_MS_ABI ? (1 << 3)
4112 : TARGET_64BIT ? (1 << 2)
4113 : (1 << 1));
4115 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4117 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4119 /* Set/reset conditionally defined registers from
4120 CALL_USED_REGISTERS initializer. */
4121 if (call_used_regs[i] > 1)
4122 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
4124 /* Add to the CLOBBERED_REGS register set the call-used
4125 registers from the GENERAL_REGS register set. */
4126 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4127 && call_used_regs[i])
4128 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4131 /* If MMX is disabled, squash the registers. */
4132 if (! TARGET_MMX)
4133 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4134 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4135 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4137 /* If SSE is disabled, squash the registers. */
4138 if (! TARGET_SSE)
4139 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4140 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4141 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4143 /* If the FPU is disabled, squash the registers. */
4144 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4145 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4146 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4147 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4149 /* If AVX512F is disabled, squash the registers. */
4150 if (! TARGET_AVX512F)
4152 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4153 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4155 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
4156 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4159 /* If MPX is disabled, squash the registers. */
4160 if (! TARGET_MPX)
4161 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
4162 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4166 /* Save the current options */
4168 static void
4169 ix86_function_specific_save (struct cl_target_option *ptr,
4170 struct gcc_options *opts)
4172 ptr->arch = ix86_arch;
4173 ptr->schedule = ix86_schedule;
4174 ptr->tune = ix86_tune;
4175 ptr->branch_cost = ix86_branch_cost;
4176 ptr->tune_defaulted = ix86_tune_defaulted;
4177 ptr->arch_specified = ix86_arch_specified;
4178 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
4179 ptr->x_ix86_target_flags_explicit = opts->x_ix86_target_flags_explicit;
4180 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
4182 /* The fields are char but the variables are not; make sure the
4183 values fit in the fields. */
4184 gcc_assert (ptr->arch == ix86_arch);
4185 gcc_assert (ptr->schedule == ix86_schedule);
4186 gcc_assert (ptr->tune == ix86_tune);
4187 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4190 /* Restore the current options */
4192 static void
4193 ix86_function_specific_restore (struct gcc_options *opts,
4194 struct cl_target_option *ptr)
4196 enum processor_type old_tune = ix86_tune;
4197 enum processor_type old_arch = ix86_arch;
4198 unsigned int ix86_arch_mask;
4199 int i;
4201 ix86_arch = (enum processor_type) ptr->arch;
4202 ix86_schedule = (enum attr_cpu) ptr->schedule;
4203 ix86_tune = (enum processor_type) ptr->tune;
4204 opts->x_ix86_branch_cost = ptr->branch_cost;
4205 ix86_tune_defaulted = ptr->tune_defaulted;
4206 ix86_arch_specified = ptr->arch_specified;
4207 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4208 opts->x_ix86_target_flags_explicit = ptr->x_ix86_target_flags_explicit;
4209 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
4211 /* Recreate the arch feature tests if the arch changed */
4212 if (old_arch != ix86_arch)
4214 ix86_arch_mask = 1u << ix86_arch;
4215 for (i = 0; i < X86_ARCH_LAST; ++i)
4216 ix86_arch_features[i]
4217 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4220 /* Recreate the tune optimization tests */
4221 if (old_tune != ix86_tune)
4222 set_ix86_tune_features (ix86_tune, false);
4225 /* Print the current options */
4227 static void
4228 ix86_function_specific_print (FILE *file, int indent,
4229 struct cl_target_option *ptr)
4231 char *target_string
4232 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4233 NULL, NULL, ptr->x_ix86_fpmath, false);
4235 fprintf (file, "%*sarch = %d (%s)\n",
4236 indent, "",
4237 ptr->arch,
4238 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4239 ? cpu_names[ptr->arch]
4240 : "<unknown>"));
4242 fprintf (file, "%*stune = %d (%s)\n",
4243 indent, "",
4244 ptr->tune,
4245 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4246 ? cpu_names[ptr->tune]
4247 : "<unknown>"));
4249 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4251 if (target_string)
4253 fprintf (file, "%*s%s\n", indent, "", target_string);
4254 free (target_string);
4259 /* Inner function to process the attribute((target(...))), take an argument and
4260 set the current options from the argument. If we have a list, recursively go
4261 over the list. */
4263 static bool
4264 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4265 struct gcc_options *opts,
4266 struct gcc_options *opts_set,
4267 struct gcc_options *enum_opts_set)
4269 char *next_optstr;
4270 bool ret = true;
4272 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4273 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4274 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4275 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4276 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4278 enum ix86_opt_type
4280 ix86_opt_unknown,
4281 ix86_opt_yes,
4282 ix86_opt_no,
4283 ix86_opt_str,
4284 ix86_opt_enum,
4285 ix86_opt_isa
4288 static const struct
4290 const char *string;
4291 size_t len;
4292 enum ix86_opt_type type;
4293 int opt;
4294 int mask;
4295 } attrs[] = {
4296 /* isa options */
4297 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4298 IX86_ATTR_ISA ("abm", OPT_mabm),
4299 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4300 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4301 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4302 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4303 IX86_ATTR_ISA ("aes", OPT_maes),
4304 IX86_ATTR_ISA ("avx", OPT_mavx),
4305 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4306 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
4307 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
4308 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
4309 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
4310 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4311 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4312 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4313 IX86_ATTR_ISA ("sse", OPT_msse),
4314 IX86_ATTR_ISA ("sse2", OPT_msse2),
4315 IX86_ATTR_ISA ("sse3", OPT_msse3),
4316 IX86_ATTR_ISA ("sse4", OPT_msse4),
4317 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4318 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4319 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4320 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4321 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4322 IX86_ATTR_ISA ("fma", OPT_mfma),
4323 IX86_ATTR_ISA ("xop", OPT_mxop),
4324 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4325 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4326 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4327 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4328 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4329 IX86_ATTR_ISA ("hle", OPT_mhle),
4330 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
4331 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
4332 IX86_ATTR_ISA ("adx", OPT_madx),
4333 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
4334 IX86_ATTR_ISA ("xsave", OPT_mxsave),
4335 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
4337 /* enum options */
4338 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4340 /* string options */
4341 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4342 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4344 /* flag options */
4345 IX86_ATTR_YES ("cld",
4346 OPT_mcld,
4347 MASK_CLD),
4349 IX86_ATTR_NO ("fancy-math-387",
4350 OPT_mfancy_math_387,
4351 MASK_NO_FANCY_MATH_387),
4353 IX86_ATTR_YES ("ieee-fp",
4354 OPT_mieee_fp,
4355 MASK_IEEE_FP),
4357 IX86_ATTR_YES ("inline-all-stringops",
4358 OPT_minline_all_stringops,
4359 MASK_INLINE_ALL_STRINGOPS),
4361 IX86_ATTR_YES ("inline-stringops-dynamically",
4362 OPT_minline_stringops_dynamically,
4363 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4365 IX86_ATTR_NO ("align-stringops",
4366 OPT_mno_align_stringops,
4367 MASK_NO_ALIGN_STRINGOPS),
4369 IX86_ATTR_YES ("recip",
4370 OPT_mrecip,
4371 MASK_RECIP),
4375 /* If this is a list, recurse to get the options. */
4376 if (TREE_CODE (args) == TREE_LIST)
4378 bool ret = true;
4380 for (; args; args = TREE_CHAIN (args))
4381 if (TREE_VALUE (args)
4382 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4383 p_strings, opts, opts_set,
4384 enum_opts_set))
4385 ret = false;
4387 return ret;
4390 else if (TREE_CODE (args) != STRING_CST)
4392 error ("attribute %<target%> argument not a string");
4393 return false;
4396 /* Handle multiple arguments separated by commas. */
4397 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4399 while (next_optstr && *next_optstr != '\0')
4401 char *p = next_optstr;
4402 char *orig_p = p;
4403 char *comma = strchr (next_optstr, ',');
4404 const char *opt_string;
4405 size_t len, opt_len;
4406 int opt;
4407 bool opt_set_p;
4408 char ch;
4409 unsigned i;
4410 enum ix86_opt_type type = ix86_opt_unknown;
4411 int mask = 0;
4413 if (comma)
4415 *comma = '\0';
4416 len = comma - next_optstr;
4417 next_optstr = comma + 1;
4419 else
4421 len = strlen (p);
4422 next_optstr = NULL;
4425 /* Recognize no-xxx. */
4426 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4428 opt_set_p = false;
4429 p += 3;
4430 len -= 3;
4432 else
4433 opt_set_p = true;
4435 /* Find the option. */
4436 ch = *p;
4437 opt = N_OPTS;
4438 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4440 type = attrs[i].type;
4441 opt_len = attrs[i].len;
4442 if (ch == attrs[i].string[0]
4443 && ((type != ix86_opt_str && type != ix86_opt_enum)
4444 ? len == opt_len
4445 : len > opt_len)
4446 && memcmp (p, attrs[i].string, opt_len) == 0)
4448 opt = attrs[i].opt;
4449 mask = attrs[i].mask;
4450 opt_string = attrs[i].string;
4451 break;
4455 /* Process the option. */
4456 if (opt == N_OPTS)
4458 error ("attribute(target(\"%s\")) is unknown", orig_p);
4459 ret = false;
4462 else if (type == ix86_opt_isa)
4464 struct cl_decoded_option decoded;
4466 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4467 ix86_handle_option (opts, opts_set,
4468 &decoded, input_location);
4471 else if (type == ix86_opt_yes || type == ix86_opt_no)
4473 if (type == ix86_opt_no)
4474 opt_set_p = !opt_set_p;
4476 if (opt_set_p)
4477 opts->x_target_flags |= mask;
4478 else
4479 opts->x_target_flags &= ~mask;
4482 else if (type == ix86_opt_str)
4484 if (p_strings[opt])
4486 error ("option(\"%s\") was already specified", opt_string);
4487 ret = false;
4489 else
4490 p_strings[opt] = xstrdup (p + opt_len);
4493 else if (type == ix86_opt_enum)
4495 bool arg_ok;
4496 int value;
4498 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4499 if (arg_ok)
4500 set_option (opts, enum_opts_set, opt, value,
4501 p + opt_len, DK_UNSPECIFIED, input_location,
4502 global_dc);
4503 else
4505 error ("attribute(target(\"%s\")) is unknown", orig_p);
4506 ret = false;
4510 else
4511 gcc_unreachable ();
4514 return ret;
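/* Illustrative example (annotation, not in the original source):
     __attribute__((target("sse4.2,no-avx,arch=core2"))) int f (void);
   is processed here token by token: "sse4.2" is an ISA option, the "no-"
   prefix on "no-avx" flips opt_set_p, and "arch=core2" is stored as a
   string option to be applied by ix86_option_override_internal later.  */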
4517 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4519 tree
4520 ix86_valid_target_attribute_tree (tree args,
4521 struct gcc_options *opts,
4522 struct gcc_options *opts_set)
4524 const char *orig_arch_string = ix86_arch_string;
4525 const char *orig_tune_string = ix86_tune_string;
4526 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
4527 int orig_tune_defaulted = ix86_tune_defaulted;
4528 int orig_arch_specified = ix86_arch_specified;
4529 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4530 tree t = NULL_TREE;
4531 int i;
4532 struct cl_target_option *def
4533 = TREE_TARGET_OPTION (target_option_default_node);
4534 struct gcc_options enum_opts_set;
4536 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4538 /* Process each of the options on the chain. */
4539 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
4540 opts_set, &enum_opts_set))
4541 return error_mark_node;
4543 /* If the changed options are different from the default, rerun
4544 ix86_option_override_internal, and then save the options away.
4545 The string options are attribute options, and will be undone
4546 when we copy the save structure. */
4547 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
4548 || opts->x_target_flags != def->x_target_flags
4549 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4550 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4551 || enum_opts_set.x_ix86_fpmath)
4553 /* If we are using the default tune= or arch=, undo the string assigned,
4554 and use the default. */
4555 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4556 opts->x_ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4557 else if (!orig_arch_specified)
4558 opts->x_ix86_arch_string = NULL;
4560 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4561 opts->x_ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4562 else if (orig_tune_defaulted)
4563 opts->x_ix86_tune_string = NULL;
4565 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4566 if (enum_opts_set.x_ix86_fpmath)
4567 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
4568 else if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4569 && TARGET_SSE_P (opts->x_ix86_isa_flags))
4571 opts->x_ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4572 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
4575 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4576 ix86_option_override_internal (false, opts, opts_set);
4578 /* Add any builtin functions with the new isa if any. */
4579 ix86_add_new_builtins (opts->x_ix86_isa_flags);
4581 /* Save the current options unless we are validating options for
4582 #pragma. */
4583 t = build_target_option_node (opts);
4585 opts->x_ix86_arch_string = orig_arch_string;
4586 opts->x_ix86_tune_string = orig_tune_string;
4587 opts_set->x_ix86_fpmath = orig_fpmath_set;
4589 /* Free up memory allocated to hold the strings */
4590 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4591 free (option_strings[i]);
4594 return t;
4597 /* Hook to validate attribute((target("string"))). */
4599 static bool
4600 ix86_valid_target_attribute_p (tree fndecl,
4601 tree ARG_UNUSED (name),
4602 tree args,
4603 int ARG_UNUSED (flags))
4605 struct gcc_options func_options;
4606 tree new_target, new_optimize;
4607 bool ret = true;
4609 /* attribute((target("default"))) does nothing, beyond
4610 affecting multi-versioning. */
4611 if (TREE_VALUE (args)
4612 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
4613 && TREE_CHAIN (args) == NULL_TREE
4614 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
4615 return true;
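/* Illustrative example (annotation, not in the original source): in
   function multi-versioning such as
     __attribute__((target("default"))) int foo (void);
     __attribute__((target("avx2")))    int foo (void);
   only the non-default versions need option processing here, hence the
   early return above.  */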
4617 tree old_optimize = build_optimization_node (&global_options);
4619 /* Get the optimization options of the current function. */
4620 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4622 if (!func_optimize)
4623 func_optimize = old_optimize;
4625 /* Init func_options. */
4626 memset (&func_options, 0, sizeof (func_options));
4627 init_options_struct (&func_options, NULL);
4628 lang_hooks.init_options_struct (&func_options);
4630 cl_optimization_restore (&func_options,
4631 TREE_OPTIMIZATION (func_optimize));
4633 /* Initialize func_options to the default before its target options can
4634 be set. */
4635 cl_target_option_restore (&func_options,
4636 TREE_TARGET_OPTION (target_option_default_node));
4638 new_target = ix86_valid_target_attribute_tree (args, &func_options,
4639 &global_options_set);
4641 new_optimize = build_optimization_node (&func_options);
4643 if (new_target == error_mark_node)
4644 ret = false;
4646 else if (fndecl && new_target)
4648 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4650 if (old_optimize != new_optimize)
4651 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4654 return ret;
4658 /* Hook to determine if one function can safely inline another. */
4660 static bool
4661 ix86_can_inline_p (tree caller, tree callee)
4663 bool ret = false;
4664 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4665 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4667 /* If callee has no option attributes, then it is ok to inline. */
4668 if (!callee_tree)
4669 ret = true;
4671 /* If caller has no option attributes, but callee does then it is not ok to
4672 inline. */
4673 else if (!caller_tree)
4674 ret = false;
4676 else
4678 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4679 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4681 /* Callee's ISA options should be a subset of the caller's, i.e. an SSE4
4682 function can inline an SSE2 function, but an SSE2 function can't inline
4683 an SSE4 function. */
4684 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4685 != callee_opts->x_ix86_isa_flags)
4686 ret = false;
4688 /* See if we have the same non-isa options. */
4689 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4690 ret = false;
4692 /* See if arch, tune, etc. are the same. */
4693 else if (caller_opts->arch != callee_opts->arch)
4694 ret = false;
4696 else if (caller_opts->tune != callee_opts->tune)
4697 ret = false;
4699 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4700 ret = false;
4702 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4703 ret = false;
4705 else
4706 ret = true;
4709 return ret;
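/* For illustration (hypothetical user-level code): the subset rule above
   means that, roughly,

     __attribute__((target("sse2"))) static int callee (int x);
     __attribute__((target("sse4.2"))) int caller (int x);

   allows caller to inline callee, because callee's ISA flags are contained
   in caller's, while swapping the two attributes makes this hook return
   false.  */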
4713 /* Remember the last target of ix86_set_current_function. */
4714 static GTY(()) tree ix86_previous_fndecl;
4716 /* Invalidate ix86_previous_fndecl cache. */
4717 void
4718 ix86_reset_previous_fndecl (void)
4720 ix86_previous_fndecl = NULL_TREE;
4723 /* Establish appropriate back-end context for processing the function
4724 FNDECL. The argument might be NULL to indicate processing at top
4725 level, outside of any function scope. */
4726 static void
4727 ix86_set_current_function (tree fndecl)
4729 /* Only change the context if the function changes. This hook is called
4730 several times in the course of compiling a function, and we don't want to
4731 slow things down too much or call target_reinit when it isn't safe. */
4732 if (fndecl && fndecl != ix86_previous_fndecl)
4734 tree old_tree = (ix86_previous_fndecl
4735 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4736 : NULL_TREE);
4738 tree new_tree = (fndecl
4739 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4740 : NULL_TREE);
4742 ix86_previous_fndecl = fndecl;
4743 if (old_tree == new_tree)
4746 else if (new_tree)
4748 cl_target_option_restore (&global_options,
4749 TREE_TARGET_OPTION (new_tree));
4750 target_reinit ();
4753 else if (old_tree)
4755 struct cl_target_option *def
4756 = TREE_TARGET_OPTION (target_option_current_node);
4758 cl_target_option_restore (&global_options, def);
4759 target_reinit ();
4765 /* Return true if this goes in large data/bss. */
4767 static bool
4768 ix86_in_large_data_p (tree exp)
4770 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4771 return false;
4773 /* Functions are never large data. */
4774 if (TREE_CODE (exp) == FUNCTION_DECL)
4775 return false;
4777 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4779 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4780 if (strcmp (section, ".ldata") == 0
4781 || strcmp (section, ".lbss") == 0)
4782 return true;
4783 return false;
4785 else
4787 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4789 /* If this is an incomplete type with size 0, then we can't put it
4790 in data because it might be too big when completed. */
4791 if (!size || size > ix86_section_threshold)
4792 return true;
4795 return false;
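/* For illustration (hypothetical usage): when compiling with -mcmodel=medium,
   an object such as

     static char big_table[1 << 20];

   exceeds the default section threshold and is treated as large data, while
   a small static buffer stays in the normal sections; the cut-off can be
   adjusted with -mlarge-data-threshold=N.  */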
4798 /* Switch to the appropriate section for output of DECL.
4799 DECL is either a `VAR_DECL' node or a constant of some sort.
4800 RELOC indicates whether forming the initial value of DECL requires
4801 link-time relocations. */
4803 ATTRIBUTE_UNUSED static section *
4804 x86_64_elf_select_section (tree decl, int reloc,
4805 unsigned HOST_WIDE_INT align)
4807 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4808 && ix86_in_large_data_p (decl))
4810 const char *sname = NULL;
4811 unsigned int flags = SECTION_WRITE;
4812 switch (categorize_decl_for_section (decl, reloc))
4814 case SECCAT_DATA:
4815 sname = ".ldata";
4816 break;
4817 case SECCAT_DATA_REL:
4818 sname = ".ldata.rel";
4819 break;
4820 case SECCAT_DATA_REL_LOCAL:
4821 sname = ".ldata.rel.local";
4822 break;
4823 case SECCAT_DATA_REL_RO:
4824 sname = ".ldata.rel.ro";
4825 break;
4826 case SECCAT_DATA_REL_RO_LOCAL:
4827 sname = ".ldata.rel.ro.local";
4828 break;
4829 case SECCAT_BSS:
4830 sname = ".lbss";
4831 flags |= SECTION_BSS;
4832 break;
4833 case SECCAT_RODATA:
4834 case SECCAT_RODATA_MERGE_STR:
4835 case SECCAT_RODATA_MERGE_STR_INIT:
4836 case SECCAT_RODATA_MERGE_CONST:
4837 sname = ".lrodata";
4838 flags = 0;
4839 break;
4840 case SECCAT_SRODATA:
4841 case SECCAT_SDATA:
4842 case SECCAT_SBSS:
4843 gcc_unreachable ();
4844 case SECCAT_TEXT:
4845 case SECCAT_TDATA:
4846 case SECCAT_TBSS:
4847 /* We don't split these for the medium model. Place them into
4848 default sections and hope for the best. */
4849 break;
4851 if (sname)
4853 /* We might get called with string constants, but get_named_section
4854 doesn't like them as they are not DECLs. Also, we need to set
4855 flags in that case. */
4856 if (!DECL_P (decl))
4857 return get_section (sname, flags, NULL);
4858 return get_named_section (decl, sname, reloc);
4861 return default_elf_select_section (decl, reloc, align);
4864 /* Select a set of attributes for section NAME based on the properties
4865 of DECL and whether or not RELOC indicates that DECL's initializer
4866 might contain runtime relocations. */
4868 static unsigned int ATTRIBUTE_UNUSED
4869 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
4871 unsigned int flags = default_section_type_flags (decl, name, reloc);
4873 if (decl == NULL_TREE
4874 && (strcmp (name, ".ldata.rel.ro") == 0
4875 || strcmp (name, ".ldata.rel.ro.local") == 0))
4876 flags |= SECTION_RELRO;
4878 if (strcmp (name, ".lbss") == 0
4879 || strncmp (name, ".lbss.", 5) == 0
4880 || strncmp (name, ".gnu.linkonce.lb.", 16) == 0)
4881 flags |= SECTION_BSS;
4883 return flags;
4886 /* Build up a unique section name, expressed as a
4887 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4888 RELOC indicates whether the initial value of EXP requires
4889 link-time relocations. */
4891 static void ATTRIBUTE_UNUSED
4892 x86_64_elf_unique_section (tree decl, int reloc)
4894 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4895 && ix86_in_large_data_p (decl))
4897 const char *prefix = NULL;
4898 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4899 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4901 switch (categorize_decl_for_section (decl, reloc))
4903 case SECCAT_DATA:
4904 case SECCAT_DATA_REL:
4905 case SECCAT_DATA_REL_LOCAL:
4906 case SECCAT_DATA_REL_RO:
4907 case SECCAT_DATA_REL_RO_LOCAL:
4908 prefix = one_only ? ".ld" : ".ldata";
4909 break;
4910 case SECCAT_BSS:
4911 prefix = one_only ? ".lb" : ".lbss";
4912 break;
4913 case SECCAT_RODATA:
4914 case SECCAT_RODATA_MERGE_STR:
4915 case SECCAT_RODATA_MERGE_STR_INIT:
4916 case SECCAT_RODATA_MERGE_CONST:
4917 prefix = one_only ? ".lr" : ".lrodata";
4918 break;
4919 case SECCAT_SRODATA:
4920 case SECCAT_SDATA:
4921 case SECCAT_SBSS:
4922 gcc_unreachable ();
4923 case SECCAT_TEXT:
4924 case SECCAT_TDATA:
4925 case SECCAT_TBSS:
4926 /* We don't split these for the medium model. Place them into
4927 default sections and hope for the best. */
4928 break;
4930 if (prefix)
4932 const char *name, *linkonce;
4933 char *string;
4935 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4936 name = targetm.strip_name_encoding (name);
4938 /* If we're using one_only, then there needs to be a .gnu.linkonce
4939 prefix to the section name. */
4940 linkonce = one_only ? ".gnu.linkonce" : "";
4942 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4944 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4945 return;
4948 default_unique_section (decl, reloc);
4951 #ifdef COMMON_ASM_OP
4952 /* This says how to output assembler code to declare an
4953 uninitialized external linkage data object.
4955 For the medium model on x86-64 we need to use the .largecomm directive for
4956 large objects. */
4957 void
4958 x86_elf_aligned_common (FILE *file,
4959 const char *name, unsigned HOST_WIDE_INT size,
4960 int align)
4962 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4963 && size > (unsigned int)ix86_section_threshold)
4964 fputs (".largecomm\t", file);
4965 else
4966 fputs (COMMON_ASM_OP, file);
4967 assemble_name (file, name);
4968 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4969 size, align / BITS_PER_UNIT);
4971 #endif
4973 /* Utility function for targets to use in implementing
4974 ASM_OUTPUT_ALIGNED_BSS. */
4976 void
4977 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4978 const char *name, unsigned HOST_WIDE_INT size,
4979 int align)
4981 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4982 && size > (unsigned int)ix86_section_threshold)
4983 switch_to_section (get_named_section (decl, ".lbss", 0));
4984 else
4985 switch_to_section (bss_section);
4986 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4987 #ifdef ASM_DECLARE_OBJECT_NAME
4988 last_assemble_variable_decl = decl;
4989 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4990 #else
4991 /* The standard thing is to just output a label for the object. */
4992 ASM_OUTPUT_LABEL (file, name);
4993 #endif /* ASM_DECLARE_OBJECT_NAME */
4994 ASM_OUTPUT_SKIP (file, size ? size : 1);
4997 /* Decide whether we must probe the stack before any space allocation
4998 on this target. It's essentially TARGET_STACK_PROBE except when
4999 -fstack-check causes the stack to be already probed differently. */
5001 bool
5002 ix86_target_stack_probe (void)
5004 /* Do not probe the stack twice if static stack checking is enabled. */
5005 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
5006 return false;
5008 return TARGET_STACK_PROBE;
5011 /* Decide whether we can make a sibling call to a function. DECL is the
5012 declaration of the function being targeted by the call and EXP is the
5013 CALL_EXPR representing the call. */
5015 static bool
5016 ix86_function_ok_for_sibcall (tree decl, tree exp)
5018 tree type, decl_or_type;
5019 rtx a, b;
5021 /* If we are generating position-independent code, we cannot sibcall
5022 optimize any indirect call, or a direct call to a global function,
5023 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
5024 if (!TARGET_MACHO
5025 && !TARGET_64BIT
5026 && flag_pic
5027 && (!decl || !targetm.binds_local_p (decl)))
5028 return false;
5030 /* If we need to align the outgoing stack, then sibcalling would
5031 unalign the stack, which may break the called function. */
5032 if (ix86_minimum_incoming_stack_boundary (true)
5033 < PREFERRED_STACK_BOUNDARY)
5034 return false;
5036 if (decl)
5038 decl_or_type = decl;
5039 type = TREE_TYPE (decl);
5041 else
5043 /* We're looking at the CALL_EXPR, we need the type of the function. */
5044 type = CALL_EXPR_FN (exp); /* pointer expression */
5045 type = TREE_TYPE (type); /* pointer type */
5046 type = TREE_TYPE (type); /* function type */
5047 decl_or_type = type;
5050 /* Check that the return value locations are the same. Like
5051 if we are returning floats on the 80387 register stack, we cannot
5052 make a sibcall from a function that doesn't return a float to a
5053 function that does or, conversely, from a function that does return
5054 a float to a function that doesn't; the necessary stack adjustment
5055 would not be executed. This is also the place we notice
5056 differences in the return value ABI. Note that it is ok for one
5057 of the functions to have void return type as long as the return
5058 value of the other is passed in a register. */
5059 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
5060 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
5061 cfun->decl, false);
5062 if (STACK_REG_P (a) || STACK_REG_P (b))
5064 if (!rtx_equal_p (a, b))
5065 return false;
5067 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
5069 else if (!rtx_equal_p (a, b))
5070 return false;
5072 if (TARGET_64BIT)
5074 /* The SYSV ABI has more call-clobbered registers;
5075 disallow sibcalls from MS to SYSV. */
5076 if (cfun->machine->call_abi == MS_ABI
5077 && ix86_function_type_abi (type) == SYSV_ABI)
5078 return false;
5080 else
5082 /* If this call is indirect, we'll need to be able to use a
5083 call-clobbered register for the address of the target function.
5084 Make sure that all such registers are not used for passing
5085 parameters. Note that DLLIMPORT functions are indirect. */
5086 if (!decl
5087 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
5089 if (ix86_function_regparm (type, NULL) >= 3)
5091 /* ??? Need to count the actual number of registers to be used,
5092 not the possible number of registers. Fix later. */
5093 return false;
5098 /* Otherwise okay. That also includes certain types of indirect calls. */
5099 return true;
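/* For illustration (hypothetical user-level code): a tail call such as

     double wrap (double x) { return work (x); }

   normally qualifies as a sibcall under the rules above, since both return
   values use the same location; a 32-bit PIC call to a non-local function
   would instead be rejected because the PLT needs %ebx.  */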
5102 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
5103 and "sseregparm" calling convention attributes;
5104 arguments as in struct attribute_spec.handler. */
5106 static tree
5107 ix86_handle_cconv_attribute (tree *node, tree name,
5108 tree args,
5109 int flags ATTRIBUTE_UNUSED,
5110 bool *no_add_attrs)
5112 if (TREE_CODE (*node) != FUNCTION_TYPE
5113 && TREE_CODE (*node) != METHOD_TYPE
5114 && TREE_CODE (*node) != FIELD_DECL
5115 && TREE_CODE (*node) != TYPE_DECL)
5117 warning (OPT_Wattributes, "%qE attribute only applies to functions",
5118 name);
5119 *no_add_attrs = true;
5120 return NULL_TREE;
5123 /* Can combine regparm with all attributes but fastcall and thiscall. */
5124 if (is_attribute_p ("regparm", name))
5126 tree cst;
5128 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5130 error ("fastcall and regparm attributes are not compatible");
5133 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5135 error ("regparam and thiscall attributes are not compatible");
5138 cst = TREE_VALUE (args);
5139 if (TREE_CODE (cst) != INTEGER_CST)
5141 warning (OPT_Wattributes,
5142 "%qE attribute requires an integer constant argument",
5143 name);
5144 *no_add_attrs = true;
5146 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
5148 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
5149 name, REGPARM_MAX);
5150 *no_add_attrs = true;
5153 return NULL_TREE;
5156 if (TARGET_64BIT)
5158 /* Do not warn when emulating the MS ABI. */
5159 if ((TREE_CODE (*node) != FUNCTION_TYPE
5160 && TREE_CODE (*node) != METHOD_TYPE)
5161 || ix86_function_type_abi (*node) != MS_ABI)
5162 warning (OPT_Wattributes, "%qE attribute ignored",
5163 name);
5164 *no_add_attrs = true;
5165 return NULL_TREE;
5168 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
5169 if (is_attribute_p ("fastcall", name))
5171 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5173 error ("fastcall and cdecl attributes are not compatible");
5175 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5177 error ("fastcall and stdcall attributes are not compatible");
5179 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
5181 error ("fastcall and regparm attributes are not compatible");
5183 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5185 error ("fastcall and thiscall attributes are not compatible");
5189 /* Can combine stdcall with fastcall (redundant), regparm and
5190 sseregparm. */
5191 else if (is_attribute_p ("stdcall", name))
5193 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5195 error ("stdcall and cdecl attributes are not compatible");
5197 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5199 error ("stdcall and fastcall attributes are not compatible");
5201 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5203 error ("stdcall and thiscall attributes are not compatible");
5207 /* Can combine cdecl with regparm and sseregparm. */
5208 else if (is_attribute_p ("cdecl", name))
5210 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5212 error ("stdcall and cdecl attributes are not compatible");
5214 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5216 error ("fastcall and cdecl attributes are not compatible");
5218 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5220 error ("cdecl and thiscall attributes are not compatible");
5223 else if (is_attribute_p ("thiscall", name))
5225 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5226 warning (OPT_Wattributes, "%qE attribute is used for none class-method",
5227 name);
5228 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5230 error ("stdcall and thiscall attributes are not compatible");
5232 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5234 error ("fastcall and thiscall attributes are not compatible");
5236 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5238 error ("cdecl and thiscall attributes are not compatible");
5242 /* Can combine sseregparm with all attributes. */
5244 return NULL_TREE;
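/* For illustration (hypothetical user-level declarations):

     int __attribute__((regparm(3))) f (int a, int b, int c);  args in EAX, EDX, ECX
     int __attribute__((fastcall))   g (int a, int b);         args in ECX, EDX; callee pops
     int __attribute__((stdcall))    h (int a);                 args on the stack; callee pops

   Incompatible combinations, e.g. fastcall together with regparm, are
   diagnosed by the handler above.  */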
5247 /* The transactional memory builtins are implicitly regparm or fastcall
5248 depending on the ABI. Override the generic do-nothing attribute that
5249 these builtins were declared with, and replace it with one of the two
5250 attributes that we expect elsewhere. */
5252 static tree
5253 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5254 tree args ATTRIBUTE_UNUSED,
5255 int flags, bool *no_add_attrs)
5257 tree alt;
5259 /* In no case do we want to add the placeholder attribute. */
5260 *no_add_attrs = true;
5262 /* The 64-bit ABI is unchanged for transactional memory. */
5263 if (TARGET_64BIT)
5264 return NULL_TREE;
5266 /* ??? Is there a better way to validate 32-bit windows? We have
5267 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5268 if (CHECK_STACK_LIMIT > 0)
5269 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5270 else
5272 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5273 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5275 decl_attributes (node, alt, flags);
5277 return NULL_TREE;
5280 /* This function determines from TYPE the calling-convention. */
5282 unsigned int
5283 ix86_get_callcvt (const_tree type)
5285 unsigned int ret = 0;
5286 bool is_stdarg;
5287 tree attrs;
5289 if (TARGET_64BIT)
5290 return IX86_CALLCVT_CDECL;
5292 attrs = TYPE_ATTRIBUTES (type);
5293 if (attrs != NULL_TREE)
5295 if (lookup_attribute ("cdecl", attrs))
5296 ret |= IX86_CALLCVT_CDECL;
5297 else if (lookup_attribute ("stdcall", attrs))
5298 ret |= IX86_CALLCVT_STDCALL;
5299 else if (lookup_attribute ("fastcall", attrs))
5300 ret |= IX86_CALLCVT_FASTCALL;
5301 else if (lookup_attribute ("thiscall", attrs))
5302 ret |= IX86_CALLCVT_THISCALL;
5304 /* Regparm isn't allowed for thiscall and fastcall. */
5305 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5307 if (lookup_attribute ("regparm", attrs))
5308 ret |= IX86_CALLCVT_REGPARM;
5309 if (lookup_attribute ("sseregparm", attrs))
5310 ret |= IX86_CALLCVT_SSEREGPARM;
5313 if (IX86_BASE_CALLCVT(ret) != 0)
5314 return ret;
5317 is_stdarg = stdarg_p (type);
5318 if (TARGET_RTD && !is_stdarg)
5319 return IX86_CALLCVT_STDCALL | ret;
5321 if (ret != 0
5322 || is_stdarg
5323 || TREE_CODE (type) != METHOD_TYPE
5324 || ix86_function_type_abi (type) != MS_ABI)
5325 return IX86_CALLCVT_CDECL | ret;
5327 return IX86_CALLCVT_THISCALL;
5330 /* Return 0 if the attributes for two types are incompatible, 1 if they
5331 are compatible, and 2 if they are nearly compatible (which causes a
5332 warning to be generated). */
5334 static int
5335 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5337 unsigned int ccvt1, ccvt2;
5339 if (TREE_CODE (type1) != FUNCTION_TYPE
5340 && TREE_CODE (type1) != METHOD_TYPE)
5341 return 1;
5343 ccvt1 = ix86_get_callcvt (type1);
5344 ccvt2 = ix86_get_callcvt (type2);
5345 if (ccvt1 != ccvt2)
5346 return 0;
5347 if (ix86_function_regparm (type1, NULL)
5348 != ix86_function_regparm (type2, NULL))
5349 return 0;
5351 return 1;
5354 /* Return the regparm value for a function with the indicated TYPE and DECL.
5355 DECL may be NULL when calling function indirectly
5356 or considering a libcall. */
5358 static int
5359 ix86_function_regparm (const_tree type, const_tree decl)
5361 tree attr;
5362 int regparm;
5363 unsigned int ccvt;
5365 if (TARGET_64BIT)
5366 return (ix86_function_type_abi (type) == SYSV_ABI
5367 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5368 ccvt = ix86_get_callcvt (type);
5369 regparm = ix86_regparm;
5371 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5373 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5374 if (attr)
5376 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5377 return regparm;
5380 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5381 return 2;
5382 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5383 return 1;
5385 /* Use register calling convention for local functions when possible. */
5386 if (decl
5387 && TREE_CODE (decl) == FUNCTION_DECL
5388 && optimize
5389 && !(profile_flag && !flag_fentry))
5391 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5392 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5393 if (i && i->local && i->can_change_signature)
5395 int local_regparm, globals = 0, regno;
5397 /* Make sure no regparm register is taken by a
5398 fixed register variable. */
5399 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5400 if (fixed_regs[local_regparm])
5401 break;
5403 /* We don't want to use regparm(3) for nested functions as
5404 these use a static chain pointer in the third argument. */
5405 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5406 local_regparm = 2;
5408 /* In 32-bit mode save a register for the split stack. */
5409 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5410 local_regparm = 2;
5412 /* Each fixed register usage increases register pressure,
5413 so fewer registers should be used for argument passing.
5414 This functionality can be overridden by an explicit
5415 regparm value. */
5416 for (regno = AX_REG; regno <= DI_REG; regno++)
5417 if (fixed_regs[regno])
5418 globals++;
5420 local_regparm
5421 = globals < local_regparm ? local_regparm - globals : 0;
5423 if (local_regparm > regparm)
5424 regparm = local_regparm;
5428 return regparm;
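/* For illustration (behavior sketched from the code above): a local function
   whose signature the compiler is free to change, e.g.

     static int add3 (int a, int b, int c) { return a + b + c; }

   compiled with optimization may receive its arguments in EAX, EDX and ECX
   even without an explicit regparm attribute, unless a fixed register, a
   static chain or -fsplit-stack claims one of them.  */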
5431 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5432 DFmode (2) arguments in SSE registers for a function with the
5433 indicated TYPE and DECL. DECL may be NULL when calling function
5434 indirectly or considering a libcall. Otherwise return 0. */
5436 static int
5437 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5439 gcc_assert (!TARGET_64BIT);
5441 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5442 by the sseregparm attribute. */
5443 if (TARGET_SSEREGPARM
5444 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5446 if (!TARGET_SSE)
5448 if (warn)
5450 if (decl)
5451 error ("calling %qD with attribute sseregparm without "
5452 "SSE/SSE2 enabled", decl);
5453 else
5454 error ("calling %qT with attribute sseregparm without "
5455 "SSE/SSE2 enabled", type);
5457 return 0;
5460 return 2;
5463 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5464 (and DFmode for SSE2) arguments in SSE registers. */
5465 if (decl && TARGET_SSE_MATH && optimize
5466 && !(profile_flag && !flag_fentry))
5468 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5469 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5470 if (i && i->local && i->can_change_signature)
5471 return TARGET_SSE2 ? 2 : 1;
5474 return 0;
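/* For illustration (hypothetical 32-bit user-level code built with -msse2):

     double __attribute__((sseregparm)) scale (double x, double y);

   passes X and Y in SSE registers instead of on the stack; the same
   attribute without SSE enabled produces the error issued above.  */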
5477 /* Return true if EAX is live at the start of the function. Used by
5478 ix86_expand_prologue to determine if we need special help before
5479 calling allocate_stack_worker. */
5481 static bool
5482 ix86_eax_live_at_start_p (void)
5484 /* Cheat. Don't bother working forward from ix86_function_regparm
5485 to the function type to whether an actual argument is located in
5486 eax. Instead just look at cfg info, which is still close enough
5487 to correct at this point. This gives false positives for broken
5488 functions that might use uninitialized data that happens to be
5489 allocated in eax, but who cares? */
5490 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5493 static bool
5494 ix86_keep_aggregate_return_pointer (tree fntype)
5496 tree attr;
5498 if (!TARGET_64BIT)
5500 attr = lookup_attribute ("callee_pop_aggregate_return",
5501 TYPE_ATTRIBUTES (fntype));
5502 if (attr)
5503 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5505 /* For 32-bit MS-ABI the default is to keep aggregate
5506 return pointer. */
5507 if (ix86_function_type_abi (fntype) == MS_ABI)
5508 return true;
5510 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5513 /* Value is the number of bytes of arguments automatically
5514 popped when returning from a subroutine call.
5515 FUNDECL is the declaration node of the function (as a tree),
5516 FUNTYPE is the data type of the function (as a tree),
5517 or for a library call it is an identifier node for the subroutine name.
5518 SIZE is the number of bytes of arguments passed on the stack.
5520 On the 80386, the RTD insn may be used to pop them if the number
5521 of args is fixed, but if the number is variable then the caller
5522 must pop them all. RTD can't be used for library calls now
5523 because the library is compiled with the Unix compiler.
5524 Use of RTD is a selectable option, since it is incompatible with
5525 standard Unix calling sequences. If the option is not selected,
5526 the caller must always pop the args.
5528 The attribute stdcall is equivalent to RTD on a per module basis. */
5530 static int
5531 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5533 unsigned int ccvt;
5535 /* None of the 64-bit ABIs pop arguments. */
5536 if (TARGET_64BIT)
5537 return 0;
5539 ccvt = ix86_get_callcvt (funtype);
5541 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5542 | IX86_CALLCVT_THISCALL)) != 0
5543 && ! stdarg_p (funtype))
5544 return size;
5546 /* Lose any fake structure return argument if it is passed on the stack. */
5547 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5548 && !ix86_keep_aggregate_return_pointer (funtype))
5550 int nregs = ix86_function_regparm (funtype, fundecl);
5551 if (nregs == 0)
5552 return GET_MODE_SIZE (Pmode);
5555 return 0;
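/* For illustration (hypothetical user-level declaration):

     void __attribute__((stdcall)) f (int a, int b);

   has 8 bytes of stack arguments, so this hook reports 8 and the function
   returns with `ret $8'; a variadic prototype would make it report 0, since
   the caller must pop the arguments instead.  */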
5558 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
5560 static bool
5561 ix86_legitimate_combined_insn (rtx insn)
5563 /* Check operand constraints in case hard registers were propagated
5564 into insn pattern. This check prevents combine pass from
5565 generating insn patterns with invalid hard register operands.
5566 These invalid insns can eventually confuse reload to error out
5567 with a spill failure. See also PRs 46829 and 46843. */
5568 if ((INSN_CODE (insn) = recog (PATTERN (insn), insn, 0)) >= 0)
5570 int i;
5572 extract_insn (insn);
5573 preprocess_constraints ();
5575 for (i = 0; i < recog_data.n_operands; i++)
5577 rtx op = recog_data.operand[i];
5578 enum machine_mode mode = GET_MODE (op);
5579 struct operand_alternative *op_alt;
5580 int offset = 0;
5581 bool win;
5582 int j;
5584 /* A unary operator may be accepted by the predicate, but it
5585 is irrelevant for matching constraints. */
5586 if (UNARY_P (op))
5587 op = XEXP (op, 0);
5589 if (GET_CODE (op) == SUBREG)
5591 if (REG_P (SUBREG_REG (op))
5592 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
5593 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
5594 GET_MODE (SUBREG_REG (op)),
5595 SUBREG_BYTE (op),
5596 GET_MODE (op));
5597 op = SUBREG_REG (op);
5600 if (!(REG_P (op) && HARD_REGISTER_P (op)))
5601 continue;
5603 op_alt = recog_op_alt[i];
5605 /* Operand has no constraints, anything is OK. */
5606 win = !recog_data.n_alternatives;
5608 for (j = 0; j < recog_data.n_alternatives; j++)
5610 if (op_alt[j].anything_ok
5611 || (op_alt[j].matches != -1
5612 && operands_match_p
5613 (recog_data.operand[i],
5614 recog_data.operand[op_alt[j].matches]))
5615 || reg_fits_class_p (op, op_alt[j].cl, offset, mode))
5617 win = true;
5618 break;
5622 if (!win)
5623 return false;
5627 return true;
5630 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
5632 static unsigned HOST_WIDE_INT
5633 ix86_asan_shadow_offset (void)
5635 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
5636 : HOST_WIDE_INT_C (0x7fff8000))
5637 : (HOST_WIDE_INT_1 << 29);
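/* For reference (the standard AddressSanitizer shadow mapping, not code from
   this file): the instrumentation computes

     shadow = (addr >> 3) + ix86_asan_shadow_offset ()

   so LP64 targets other than Mach-O use the 0x7fff8000 offset above, Mach-O
   uses 1 << 44, and non-LP64 targets use 1 << 29.  */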
5640 /* Argument support functions. */
5642 /* Return true when register may be used to pass function parameters. */
5643 bool
5644 ix86_function_arg_regno_p (int regno)
5646 int i;
5647 const int *parm_regs;
5649 if (!TARGET_64BIT)
5651 if (TARGET_MACHO)
5652 return (regno < REGPARM_MAX
5653 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5654 else
5655 return (regno < REGPARM_MAX
5656 || (TARGET_MMX && MMX_REGNO_P (regno)
5657 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5658 || (TARGET_SSE && SSE_REGNO_P (regno)
5659 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5662 if (TARGET_SSE && SSE_REGNO_P (regno)
5663 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5664 return true;
5666 /* TODO: The function should depend on current function ABI but
5667 builtins.c would need updating then. Therefore we use the
5668 default ABI. */
5670 /* RAX is used as hidden argument to va_arg functions. */
5671 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5672 return true;
5674 if (ix86_abi == MS_ABI)
5675 parm_regs = x86_64_ms_abi_int_parameter_registers;
5676 else
5677 parm_regs = x86_64_int_parameter_registers;
5678 for (i = 0; i < (ix86_abi == MS_ABI
5679 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5680 if (regno == parm_regs[i])
5681 return true;
5682 return false;
5685 /* Return true if we do not know how to pass TYPE solely in registers. */
5687 static bool
5688 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5690 if (must_pass_in_stack_var_size_or_pad (mode, type))
5691 return true;
5693 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5694 The layout_type routine is crafty and tries to trick us into passing
5695 currently unsupported vector types on the stack by using TImode. */
5696 return (!TARGET_64BIT && mode == TImode
5697 && type && TREE_CODE (type) != VECTOR_TYPE);
5700 /* Return the size, in bytes, of the area reserved for arguments passed
5701 in registers for the function represented by FNDECL, depending on the
5702 ABI format used. */
5703 int
5704 ix86_reg_parm_stack_space (const_tree fndecl)
5706 enum calling_abi call_abi = SYSV_ABI;
5707 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5708 call_abi = ix86_function_abi (fndecl);
5709 else
5710 call_abi = ix86_function_type_abi (fndecl);
5711 if (TARGET_64BIT && call_abi == MS_ABI)
5712 return 32;
5713 return 0;
5716 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
5717 call ABI used. */
5718 enum calling_abi
5719 ix86_function_type_abi (const_tree fntype)
5721 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5723 enum calling_abi abi = ix86_abi;
5724 if (abi == SYSV_ABI)
5726 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5727 abi = MS_ABI;
5729 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5730 abi = SYSV_ABI;
5731 return abi;
5733 return ix86_abi;
5736 /* We add this as a workaround in order to use libc_has_function
5737 hook in i386.md. */
5738 bool
5739 ix86_libc_has_function (enum function_class fn_class)
5741 return targetm.libc_has_function (fn_class);
5744 static bool
5745 ix86_function_ms_hook_prologue (const_tree fn)
5747 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5749 if (decl_function_context (fn) != NULL_TREE)
5750 error_at (DECL_SOURCE_LOCATION (fn),
5751 "ms_hook_prologue is not compatible with nested function");
5752 else
5753 return true;
5755 return false;
5758 static enum calling_abi
5759 ix86_function_abi (const_tree fndecl)
5761 if (! fndecl)
5762 return ix86_abi;
5763 return ix86_function_type_abi (TREE_TYPE (fndecl));
5766 /* Return SYSV_ABI or MS_ABI, depending on cfun, specifying the
5767 call ABI used. */
5768 enum calling_abi
5769 ix86_cfun_abi (void)
5771 if (! cfun)
5772 return ix86_abi;
5773 return cfun->machine->call_abi;
5776 /* Write the extra assembler code needed to declare a function properly. */
5778 void
5779 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5780 tree decl)
5782 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5784 if (is_ms_hook)
5786 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5787 unsigned int filler_cc = 0xcccccccc;
5789 for (i = 0; i < filler_count; i += 4)
5790 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5793 #ifdef SUBTARGET_ASM_UNWIND_INIT
5794 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5795 #endif
5797 ASM_OUTPUT_LABEL (asm_out_file, fname);
5799 /* Output magic byte marker, if hot-patch attribute is set. */
5800 if (is_ms_hook)
5802 if (TARGET_64BIT)
5804 /* leaq [%rsp + 0], %rsp */
5805 asm_fprintf (asm_out_file, ASM_BYTE
5806 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5808 else
5810 /* movl.s %edi, %edi
5811 push %ebp
5812 movl.s %esp, %ebp */
5813 asm_fprintf (asm_out_file, ASM_BYTE
5814 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5819 /* regclass.c */
5820 extern void init_regs (void);
5822 /* Implementation of the call ABI switching target hook. For FNDECL,
5823 the specific call register sets are selected. See also
5824 ix86_conditional_register_usage for more details. */
5825 void
5826 ix86_call_abi_override (const_tree fndecl)
5828 if (fndecl == NULL_TREE)
5829 cfun->machine->call_abi = ix86_abi;
5830 else
5831 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5834 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers. Avoid
5835 expensive re-initialization of init_regs each time we switch function context
5836 since this is needed only during RTL expansion. */
5837 static void
5838 ix86_maybe_switch_abi (void)
5840 if (TARGET_64BIT &&
5841 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5842 reinit_regs ();
5845 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5846 for a call to a function whose data type is FNTYPE.
5847 For a library call, FNTYPE is 0. */
5849 void
5850 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5851 tree fntype, /* tree ptr for function decl */
5852 rtx libname, /* SYMBOL_REF of library name or 0 */
5853 tree fndecl,
5854 int caller)
5856 struct cgraph_local_info *i;
5858 memset (cum, 0, sizeof (*cum));
5860 if (fndecl)
5862 i = cgraph_local_info (fndecl);
5863 cum->call_abi = ix86_function_abi (fndecl);
5865 else
5867 i = NULL;
5868 cum->call_abi = ix86_function_type_abi (fntype);
5871 cum->caller = caller;
5873 /* Set up the number of registers to use for passing arguments. */
5875 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5876 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5877 "or subtarget optimization implying it");
5878 cum->nregs = ix86_regparm;
5879 if (TARGET_64BIT)
5881 cum->nregs = (cum->call_abi == SYSV_ABI
5882 ? X86_64_REGPARM_MAX
5883 : X86_64_MS_REGPARM_MAX);
5885 if (TARGET_SSE)
5887 cum->sse_nregs = SSE_REGPARM_MAX;
5888 if (TARGET_64BIT)
5890 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5891 ? X86_64_SSE_REGPARM_MAX
5892 : X86_64_MS_SSE_REGPARM_MAX);
5895 if (TARGET_MMX)
5896 cum->mmx_nregs = MMX_REGPARM_MAX;
5897 cum->warn_avx = true;
5898 cum->warn_sse = true;
5899 cum->warn_mmx = true;
5901 /* Because the type might mismatch between the caller and callee, we need to
5902 use the actual type of the function for local calls.
5903 FIXME: cgraph_analyze can be told to actually record if function uses
5904 va_start so for local functions maybe_vaarg can be made aggressive
5905 helping K&R code.
5906 FIXME: once the type system is fixed, we won't need this code anymore. */
5907 if (i && i->local && i->can_change_signature)
5908 fntype = TREE_TYPE (fndecl);
5909 cum->maybe_vaarg = (fntype
5910 ? (!prototype_p (fntype) || stdarg_p (fntype))
5911 : !libname);
5913 if (!TARGET_64BIT)
5915 /* If there are variable arguments, then we won't pass anything
5916 in registers in 32-bit mode. */
5917 if (stdarg_p (fntype))
5919 cum->nregs = 0;
5920 cum->sse_nregs = 0;
5921 cum->mmx_nregs = 0;
5922 cum->warn_avx = 0;
5923 cum->warn_sse = 0;
5924 cum->warn_mmx = 0;
5925 return;
5928 /* Use ecx and edx registers if function has fastcall attribute,
5929 else look for regparm information. */
5930 if (fntype)
5932 unsigned int ccvt = ix86_get_callcvt (fntype);
5933 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5935 cum->nregs = 1;
5936 cum->fastcall = 1; /* Same first register as in fastcall. */
5938 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5940 cum->nregs = 2;
5941 cum->fastcall = 1;
5943 else
5944 cum->nregs = ix86_function_regparm (fntype, fndecl);
5947 /* Set up the number of SSE registers used for passing SFmode
5948 and DFmode arguments. Warn for mismatching ABI. */
5949 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5953 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5954 But in the case of vector types, it is some vector mode.
5956 When we have only some of our vector isa extensions enabled, then there
5957 are some modes for which vector_mode_supported_p is false. For these
5958 modes, the generic vector support in gcc will choose some non-vector mode
5959 in order to implement the type. By computing the natural mode, we'll
5960 select the proper ABI location for the operand and not depend on whatever
5961 the middle-end decides to do with these vector types.
5963 The middle-end can't deal with vector types larger than 16 bytes. In this
5964 case, we return the original mode and warn about the ABI change if CUM isn't
5965 NULL. */
5967 static enum machine_mode
5968 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5970 enum machine_mode mode = TYPE_MODE (type);
5972 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5974 HOST_WIDE_INT size = int_size_in_bytes (type);
5975 if ((size == 8 || size == 16 || size == 32)
5976 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5977 && TYPE_VECTOR_SUBPARTS (type) > 1)
5979 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5981 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5982 mode = MIN_MODE_VECTOR_FLOAT;
5983 else
5984 mode = MIN_MODE_VECTOR_INT;
5986 /* Get the mode which has this inner mode and number of units. */
5987 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5988 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5989 && GET_MODE_INNER (mode) == innermode)
5991 if (size == 32 && !TARGET_AVX)
5993 static bool warnedavx;
5995 if (cum
5996 && !warnedavx
5997 && cum->warn_avx)
5999 warnedavx = true;
6000 warning (0, "AVX vector argument without AVX "
6001 "enabled changes the ABI");
6003 return TYPE_MODE (type);
6005 else if ((size == 8 || size == 16) && !TARGET_SSE)
6007 static bool warnedsse;
6009 if (cum
6010 && !warnedsse
6011 && cum->warn_sse)
6013 warnedsse = true;
6014 warning (0, "SSE vector argument without SSE "
6015 "enabled changes the ABI");
6017 return mode;
6019 else
6020 return mode;
6023 gcc_unreachable ();
6027 return mode;
6030 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
6031 this may not agree with the mode that the type system has chosen for the
6032 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
6033 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
6035 static rtx
6036 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
6037 unsigned int regno)
6039 rtx tmp;
6041 if (orig_mode != BLKmode)
6042 tmp = gen_rtx_REG (orig_mode, regno);
6043 else
6045 tmp = gen_rtx_REG (mode, regno);
6046 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
6047 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
6050 return tmp;
6053 /* x86-64 register passing implementation. See x86-64 ABI for details. Goal
6054 of this code is to classify each 8 bytes of an incoming argument by the register
6055 class and assign registers accordingly. */
6057 /* Return the union class of CLASS1 and CLASS2.
6058 See the x86-64 PS ABI for details. */
6060 static enum x86_64_reg_class
6061 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
6063 /* Rule #1: If both classes are equal, this is the resulting class. */
6064 if (class1 == class2)
6065 return class1;
6067 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
6068 the other class. */
6069 if (class1 == X86_64_NO_CLASS)
6070 return class2;
6071 if (class2 == X86_64_NO_CLASS)
6072 return class1;
6074 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
6075 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
6076 return X86_64_MEMORY_CLASS;
6078 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
6079 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
6080 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
6081 return X86_64_INTEGERSI_CLASS;
6082 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
6083 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
6084 return X86_64_INTEGER_CLASS;
6086 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
6087 MEMORY is used. */
6088 if (class1 == X86_64_X87_CLASS
6089 || class1 == X86_64_X87UP_CLASS
6090 || class1 == X86_64_COMPLEX_X87_CLASS
6091 || class2 == X86_64_X87_CLASS
6092 || class2 == X86_64_X87UP_CLASS
6093 || class2 == X86_64_COMPLEX_X87_CLASS)
6094 return X86_64_MEMORY_CLASS;
6096 /* Rule #6: Otherwise class SSE is used. */
6097 return X86_64_SSE_CLASS;
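/* Worked example of the rules above (illustrative only): merging INTEGERSI
   with SSESF gives INTEGERSI (rule 4), INTEGER with SSE gives INTEGER
   (rule 4), X87 with anything gives MEMORY (rule 5), and SSE with SSEUP
   falls through to SSE (rule 6).  */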
6100 /* Classify the argument of type TYPE and mode MODE.
6101 CLASSES will be filled by the register class used to pass each word
6102 of the operand. The number of words is returned. In case the parameter
6103 should be passed in memory, 0 is returned. As a special case for zero
6104 sized containers, classes[0] will be NO_CLASS and 1 is returned.
6106 BIT_OFFSET is used internally for handling records and specifies offset
6107 of the offset in bits modulo 256 to avoid overflow cases.
6109 See the x86-64 PS ABI for details.
6112 static int
6113 classify_argument (enum machine_mode mode, const_tree type,
6114 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
6116 HOST_WIDE_INT bytes =
6117 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6118 int words
6119 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6121 /* Variable sized entities are always passed/returned in memory. */
6122 if (bytes < 0)
6123 return 0;
6125 if (mode != VOIDmode
6126 && targetm.calls.must_pass_in_stack (mode, type))
6127 return 0;
6129 /* Special case check for pointer to shared, on 64-bit target. */
6130 if (TARGET_64BIT && mode == TImode
6131 && type && TREE_CODE (type) == POINTER_TYPE
6132 && upc_shared_type_p (TREE_TYPE (type)))
6134 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6135 return 2;
6138 if (type && AGGREGATE_TYPE_P (type))
6140 int i;
6141 tree field;
6142 enum x86_64_reg_class subclasses[MAX_CLASSES];
6144 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
6145 if (bytes > 32)
6146 return 0;
6148 for (i = 0; i < words; i++)
6149 classes[i] = X86_64_NO_CLASS;
6151 /* Zero-sized arrays or structures are NO_CLASS. We return 0 to
6152 signal the memory class, so handle it as a special case. */
6153 if (!words)
6155 classes[0] = X86_64_NO_CLASS;
6156 return 1;
6159 /* Classify each field of record and merge classes. */
6160 switch (TREE_CODE (type))
6162 case RECORD_TYPE:
6163 /* And now merge the fields of structure. */
6164 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6166 if (TREE_CODE (field) == FIELD_DECL)
6168 int num;
6170 if (TREE_TYPE (field) == error_mark_node)
6171 continue;
6173 /* Bitfields are always classified as integer. Handle them
6174 early, since later code would consider them to be
6175 misaligned integers. */
6176 if (DECL_BIT_FIELD (field))
6178 for (i = (int_bit_position (field)
6179 + (bit_offset % 64)) / 8 / 8;
6180 i < ((int_bit_position (field) + (bit_offset % 64))
6181 + tree_low_cst (DECL_SIZE (field), 0)
6182 + 63) / 8 / 8; i++)
6183 classes[i] =
6184 merge_classes (X86_64_INTEGER_CLASS,
6185 classes[i]);
6187 else
6189 int pos;
6191 type = TREE_TYPE (field);
6193 /* Flexible array member is ignored. */
6194 if (TYPE_MODE (type) == BLKmode
6195 && TREE_CODE (type) == ARRAY_TYPE
6196 && TYPE_SIZE (type) == NULL_TREE
6197 && TYPE_DOMAIN (type) != NULL_TREE
6198 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
6199 == NULL_TREE))
6201 static bool warned;
6203 if (!warned && warn_psabi)
6205 warned = true;
6206 inform (input_location,
6207 "the ABI of passing struct with"
6208 " a flexible array member has"
6209 " changed in GCC 4.4");
6211 continue;
6213 num = classify_argument (TYPE_MODE (type), type,
6214 subclasses,
6215 (int_bit_position (field)
6216 + bit_offset) % 256);
6217 if (!num)
6218 return 0;
6219 pos = (int_bit_position (field)
6220 + (bit_offset % 64)) / 8 / 8;
6221 for (i = 0; i < num && (i + pos) < words; i++)
6222 classes[i + pos] =
6223 merge_classes (subclasses[i], classes[i + pos]);
6227 break;
6229 case ARRAY_TYPE:
6230 /* Arrays are handled as small records. */
6232 int num;
6233 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
6234 TREE_TYPE (type), subclasses, bit_offset);
6235 if (!num)
6236 return 0;
6238 /* The partial classes are now full classes. */
6239 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
6240 subclasses[0] = X86_64_SSE_CLASS;
6241 if (subclasses[0] == X86_64_INTEGERSI_CLASS
6242 && !((bit_offset % 64) == 0 && bytes == 4))
6243 subclasses[0] = X86_64_INTEGER_CLASS;
6245 for (i = 0; i < words; i++)
6246 classes[i] = subclasses[i % num];
6248 break;
6250 case UNION_TYPE:
6251 case QUAL_UNION_TYPE:
6252 /* Unions are similar to RECORD_TYPE but offset is always 0. */
6254 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6256 if (TREE_CODE (field) == FIELD_DECL)
6258 int num;
6260 if (TREE_TYPE (field) == error_mark_node)
6261 continue;
6263 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6264 TREE_TYPE (field), subclasses,
6265 bit_offset);
6266 if (!num)
6267 return 0;
6268 for (i = 0; i < num; i++)
6269 classes[i] = merge_classes (subclasses[i], classes[i]);
6272 break;
6274 default:
6275 gcc_unreachable ();
6278 if (words > 2)
6280 /* When size > 16 bytes, if the first one isn't
6281 X86_64_SSE_CLASS or any other ones aren't
6282 X86_64_SSEUP_CLASS, everything should be passed in
6283 memory. */
6284 if (classes[0] != X86_64_SSE_CLASS)
6285 return 0;
6287 for (i = 1; i < words; i++)
6288 if (classes[i] != X86_64_SSEUP_CLASS)
6289 return 0;
6292 /* Final merger cleanup. */
6293 for (i = 0; i < words; i++)
6295 /* If one class is MEMORY, everything should be passed in
6296 memory. */
6297 if (classes[i] == X86_64_MEMORY_CLASS)
6298 return 0;
6300 /* The X86_64_SSEUP_CLASS should be always preceded by
6301 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6302 if (classes[i] == X86_64_SSEUP_CLASS
6303 && classes[i - 1] != X86_64_SSE_CLASS
6304 && classes[i - 1] != X86_64_SSEUP_CLASS)
6306 /* The first one should never be X86_64_SSEUP_CLASS. */
6307 gcc_assert (i != 0);
6308 classes[i] = X86_64_SSE_CLASS;
6311 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6312 everything should be passed in memory. */
6313 if (classes[i] == X86_64_X87UP_CLASS
6314 && (classes[i - 1] != X86_64_X87_CLASS))
6316 static bool warned;
6318 /* The first one should never be X86_64_X87UP_CLASS. */
6319 gcc_assert (i != 0);
6320 if (!warned && warn_psabi)
6322 warned = true;
6323 inform (input_location,
6324 "the ABI of passing union with long double"
6325 " has changed in GCC 4.4");
6327 return 0;
6330 return words;
6333 /* Compute alignment needed. We align all types to natural boundaries with
6334 exception of XFmode that is aligned to 64bits. */
6335 if (mode != VOIDmode && mode != BLKmode)
6337 int mode_alignment = GET_MODE_BITSIZE (mode);
6339 if (mode == XFmode)
6340 mode_alignment = 128;
6341 else if (mode == XCmode)
6342 mode_alignment = 256;
6343 if (COMPLEX_MODE_P (mode))
6344 mode_alignment /= 2;
6345 /* Misaligned fields are always returned in memory. */
6346 if (bit_offset % mode_alignment)
6347 return 0;
6350 /* for V1xx modes, just use the base mode */
6351 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6352 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6353 mode = GET_MODE_INNER (mode);
6355 /* Classification of atomic types. */
6356 switch (mode)
6358 case SDmode:
6359 case DDmode:
6360 classes[0] = X86_64_SSE_CLASS;
6361 return 1;
6362 case TDmode:
6363 classes[0] = X86_64_SSE_CLASS;
6364 classes[1] = X86_64_SSEUP_CLASS;
6365 return 2;
6366 case DImode:
6367 case SImode:
6368 case HImode:
6369 case QImode:
6370 case CSImode:
6371 case CHImode:
6372 case CQImode:
6374 int size = (bit_offset % 64)+ (int) GET_MODE_BITSIZE (mode);
6376 if (size <= 32)
6378 classes[0] = X86_64_INTEGERSI_CLASS;
6379 return 1;
6381 else if (size <= 64)
6383 classes[0] = X86_64_INTEGER_CLASS;
6384 return 1;
6386 else if (size <= 64+32)
6388 classes[0] = X86_64_INTEGER_CLASS;
6389 classes[1] = X86_64_INTEGERSI_CLASS;
6390 return 2;
6392 else if (size <= 64+64)
6394 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6395 return 2;
6397 else
6398 gcc_unreachable ();
6400 case CDImode:
6401 case TImode:
6402 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6403 return 2;
6404 case COImode:
6405 case OImode:
6406 /* OImode shouldn't be used directly. */
6407 gcc_unreachable ();
6408 case CTImode:
6409 return 0;
6410 case SFmode:
6411 if (!(bit_offset % 64))
6412 classes[0] = X86_64_SSESF_CLASS;
6413 else
6414 classes[0] = X86_64_SSE_CLASS;
6415 return 1;
6416 case DFmode:
6417 classes[0] = X86_64_SSEDF_CLASS;
6418 return 1;
6419 case XFmode:
6420 classes[0] = X86_64_X87_CLASS;
6421 classes[1] = X86_64_X87UP_CLASS;
6422 return 2;
6423 case TFmode:
6424 classes[0] = X86_64_SSE_CLASS;
6425 classes[1] = X86_64_SSEUP_CLASS;
6426 return 2;
6427 case SCmode:
6428 classes[0] = X86_64_SSE_CLASS;
6429 if (!(bit_offset % 64))
6430 return 1;
6431 else
6433 static bool warned;
6435 if (!warned && warn_psabi)
6437 warned = true;
6438 inform (input_location,
6439 "the ABI of passing structure with complex float"
6440 " member has changed in GCC 4.4");
6442 classes[1] = X86_64_SSESF_CLASS;
6443 return 2;
6445 case DCmode:
6446 classes[0] = X86_64_SSEDF_CLASS;
6447 classes[1] = X86_64_SSEDF_CLASS;
6448 return 2;
6449 case XCmode:
6450 classes[0] = X86_64_COMPLEX_X87_CLASS;
6451 return 1;
6452 case TCmode:
6453 /* This mode is larger than 16 bytes. */
6454 return 0;
6455 case V8SFmode:
6456 case V8SImode:
6457 case V32QImode:
6458 case V16HImode:
6459 case V4DFmode:
6460 case V4DImode:
6461 classes[0] = X86_64_SSE_CLASS;
6462 classes[1] = X86_64_SSEUP_CLASS;
6463 classes[2] = X86_64_SSEUP_CLASS;
6464 classes[3] = X86_64_SSEUP_CLASS;
6465 return 4;
6466 case V4SFmode:
6467 case V4SImode:
6468 case V16QImode:
6469 case V8HImode:
6470 case V2DFmode:
6471 case V2DImode:
6472 classes[0] = X86_64_SSE_CLASS;
6473 classes[1] = X86_64_SSEUP_CLASS;
6474 return 2;
6475 case V1TImode:
6476 case V1DImode:
6477 case V2SFmode:
6478 case V2SImode:
6479 case V4HImode:
6480 case V8QImode:
6481 classes[0] = X86_64_SSE_CLASS;
6482 return 1;
6483 case BLKmode:
6484 case VOIDmode:
6485 return 0;
6486 default:
6487 gcc_assert (VECTOR_MODE_P (mode));
6489 if (bytes > 16)
6490 return 0;
6492 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6494 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6495 classes[0] = X86_64_INTEGERSI_CLASS;
6496 else
6497 classes[0] = X86_64_INTEGER_CLASS;
6498 classes[1] = X86_64_INTEGER_CLASS;
6499 return 1 + (bytes > 8);
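/* Worked example (illustrative only): for

     struct s { double d; long l; };

   the first eightbyte classifies as SSEDF and the second as INTEGER, so the
   function returns 2 and the struct travels in one SSE register plus one
   general register; a struct containing a long double field instead picks up
   the x87 classes and is therefore passed in memory.  */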
6503 /* Examine the argument and set the number of registers required in each
6504 class. Return 0 iff the parameter should be passed in memory. */
6505 static int
6506 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6507 int *int_nregs, int *sse_nregs)
6509 enum x86_64_reg_class regclass[MAX_CLASSES];
6510 int n = classify_argument (mode, type, regclass, 0);
6512 *int_nregs = 0;
6513 *sse_nregs = 0;
6514 if (!n)
6515 return 0;
6516 for (n--; n >= 0; n--)
6517 switch (regclass[n])
6519 case X86_64_INTEGER_CLASS:
6520 case X86_64_INTEGERSI_CLASS:
6521 (*int_nregs)++;
6522 break;
6523 case X86_64_SSE_CLASS:
6524 case X86_64_SSESF_CLASS:
6525 case X86_64_SSEDF_CLASS:
6526 (*sse_nregs)++;
6527 break;
6528 case X86_64_NO_CLASS:
6529 case X86_64_SSEUP_CLASS:
6530 break;
6531 case X86_64_X87_CLASS:
6532 case X86_64_X87UP_CLASS:
6533 if (!in_return)
6534 return 0;
6535 break;
6536 case X86_64_COMPLEX_X87_CLASS:
6537 return in_return ? 2 : 0;
6538 case X86_64_MEMORY_CLASS:
6539 gcc_unreachable ();
6541 return 1;
6544 /* Construct container for the argument used by GCC interface. See
6545 FUNCTION_ARG for the detailed description. */
6547 static rtx
6548 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6549 const_tree type, int in_return, int nintregs, int nsseregs,
6550 const int *intreg, int sse_regno)
6552 /* The following variables hold the static issued_error state. */
6553 static bool issued_sse_arg_error;
6554 static bool issued_sse_ret_error;
6555 static bool issued_x87_ret_error;
6557 enum machine_mode tmpmode;
6558 int bytes =
6559 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6560 enum x86_64_reg_class regclass[MAX_CLASSES];
6561 int n;
6562 int i;
6563 int nexps = 0;
6564 int needed_sseregs, needed_intregs;
6565 rtx exp[MAX_CLASSES];
6566 rtx ret;
6568 n = classify_argument (mode, type, regclass, 0);
6569 if (!n)
6570 return NULL;
6571 if (!examine_argument (mode, type, in_return, &needed_intregs,
6572 &needed_sseregs))
6573 return NULL;
6574 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6575 return NULL;
6577 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6578 some less clueful developer tries to use floating-point anyway. */
6579 if (needed_sseregs && !TARGET_SSE)
6581 if (in_return)
6583 if (!issued_sse_ret_error)
6585 error ("SSE register return with SSE disabled");
6586 issued_sse_ret_error = true;
6589 else if (!issued_sse_arg_error)
6591 error ("SSE register argument with SSE disabled");
6592 issued_sse_arg_error = true;
6594 return NULL;
6597 /* Likewise, error if the ABI requires us to return values in the
6598 x87 registers and the user specified -mno-80387. */
6599 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
6600 for (i = 0; i < n; i++)
6601 if (regclass[i] == X86_64_X87_CLASS
6602 || regclass[i] == X86_64_X87UP_CLASS
6603 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6605 if (!issued_x87_ret_error)
6607 error ("x87 register return with x87 disabled");
6608 issued_x87_ret_error = true;
6610 return NULL;
6613 /* First construct simple cases. Avoid SCmode, since we want to use
6614 single register to pass this type. */
6615 if (n == 1 && mode != SCmode)
6616 switch (regclass[0])
6618 case X86_64_INTEGER_CLASS:
6619 case X86_64_INTEGERSI_CLASS:
6620 return gen_rtx_REG (mode, intreg[0]);
6621 case X86_64_SSE_CLASS:
6622 case X86_64_SSESF_CLASS:
6623 case X86_64_SSEDF_CLASS:
6624 if (mode != BLKmode)
6625 return gen_reg_or_parallel (mode, orig_mode,
6626 SSE_REGNO (sse_regno));
6627 break;
6628 case X86_64_X87_CLASS:
6629 case X86_64_COMPLEX_X87_CLASS:
6630 return gen_rtx_REG (mode, FIRST_STACK_REG);
6631 case X86_64_NO_CLASS:
6632 /* Zero sized array, struct or class. */
6633 return NULL;
6634 default:
6635 gcc_unreachable ();
6637 if (n == 2
6638 && regclass[0] == X86_64_SSE_CLASS
6639 && regclass[1] == X86_64_SSEUP_CLASS
6640 && mode != BLKmode)
6641 return gen_reg_or_parallel (mode, orig_mode,
6642 SSE_REGNO (sse_regno));
6643 if (n == 4
6644 && regclass[0] == X86_64_SSE_CLASS
6645 && regclass[1] == X86_64_SSEUP_CLASS
6646 && regclass[2] == X86_64_SSEUP_CLASS
6647 && regclass[3] == X86_64_SSEUP_CLASS
6648 && mode != BLKmode)
6649 return gen_reg_or_parallel (mode, orig_mode,
6650 SSE_REGNO (sse_regno));
6651 if (n == 2
6652 && regclass[0] == X86_64_X87_CLASS
6653 && regclass[1] == X86_64_X87UP_CLASS)
6654 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6656 if (n == 2
6657 && regclass[0] == X86_64_INTEGER_CLASS
6658 && regclass[1] == X86_64_INTEGER_CLASS
6659 && (mode == CDImode || mode == TImode || mode == TFmode)
6660 && intreg[0] + 1 == intreg[1])
6661 return gen_rtx_REG (mode, intreg[0]);
6663 /* Otherwise figure out the entries of the PARALLEL. */
6664 for (i = 0; i < n; i++)
6666 int pos;
6668 switch (regclass[i])
6670 case X86_64_NO_CLASS:
6671 break;
6672 case X86_64_INTEGER_CLASS:
6673 case X86_64_INTEGERSI_CLASS:
6674 /* Merge TImodes on aligned occasions here too. */
6675 if (i * 8 + 8 > bytes)
6676 tmpmode
6677 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6678 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6679 tmpmode = SImode;
6680 else
6681 tmpmode = DImode;
6682 /* We've requested a size for which no integer mode
6683 exists; use DImode instead. */
6684 if (tmpmode == BLKmode)
6685 tmpmode = DImode;
6686 exp [nexps++]
6687 = gen_rtx_EXPR_LIST (VOIDmode,
6688 gen_rtx_REG (tmpmode, *intreg),
6689 GEN_INT (i*8));
6690 intreg++;
6691 break;
6692 case X86_64_SSESF_CLASS:
6693 exp [nexps++]
6694 = gen_rtx_EXPR_LIST (VOIDmode,
6695 gen_rtx_REG (SFmode,
6696 SSE_REGNO (sse_regno)),
6697 GEN_INT (i*8));
6698 sse_regno++;
6699 break;
6700 case X86_64_SSEDF_CLASS:
6701 exp [nexps++]
6702 = gen_rtx_EXPR_LIST (VOIDmode,
6703 gen_rtx_REG (DFmode,
6704 SSE_REGNO (sse_regno)),
6705 GEN_INT (i*8));
6706 sse_regno++;
6707 break;
6708 case X86_64_SSE_CLASS:
6709 pos = i;
6710 switch (n)
6712 case 1:
6713 tmpmode = DImode;
6714 break;
6715 case 2:
6716 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6718 tmpmode = TImode;
6719 i++;
6721 else
6722 tmpmode = DImode;
6723 break;
6724 case 4:
6725 gcc_assert (i == 0
6726 && regclass[1] == X86_64_SSEUP_CLASS
6727 && regclass[2] == X86_64_SSEUP_CLASS
6728 && regclass[3] == X86_64_SSEUP_CLASS);
6729 tmpmode = OImode;
6730 i += 3;
6731 break;
6732 default:
6733 gcc_unreachable ();
6735 exp [nexps++]
6736 = gen_rtx_EXPR_LIST (VOIDmode,
6737 gen_rtx_REG (tmpmode,
6738 SSE_REGNO (sse_regno)),
6739 GEN_INT (pos*8));
6740 sse_regno++;
6741 break;
6742 default:
6743 gcc_unreachable ();
6747 /* Empty aligned struct, union or class. */
6748 if (nexps == 0)
6749 return NULL;
6751 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6752 for (i = 0; i < nexps; i++)
6753 XVECEXP (ret, 0, i) = exp [i];
6754 return ret;
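/* As an illustration (a hypothetical example, not taken from this file):
   under the SysV x86-64 classification implemented above, a type such as

       struct s { double x; long y; };

   occupies two eightbytes classified SSE and INTEGER, so when passed as
   the first argument construct_container returns a PARALLEL placing the
   first eightbyte in %xmm0 and the second in %rdi.  */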
6757 /* Update the data in CUM to advance over an argument of mode MODE
6758 and data type TYPE. (TYPE is null for libcalls where that information
6759 may not be available.) */
6761 static void
6762 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6763 const_tree type, HOST_WIDE_INT bytes,
6764 HOST_WIDE_INT words)
6766 switch (mode)
6768 default:
6769 break;
6771 case BLKmode:
6772 if (bytes < 0)
6773 break;
6774 /* FALLTHRU */
6776 case DImode:
6777 case SImode:
6778 case HImode:
6779 case QImode:
6780 cum->words += words;
6781 cum->nregs -= words;
6782 cum->regno += words;
6784 if (cum->nregs <= 0)
6786 cum->nregs = 0;
6787 cum->regno = 0;
6789 break;
6791 case OImode:
6792 /* OImode shouldn't be used directly. */
6793 gcc_unreachable ();
6795 case DFmode:
6796 if (cum->float_in_sse < 2)
6797 break;
6798 case SFmode:
6799 if (cum->float_in_sse < 1)
6800 break;
6801 /* FALLTHRU */
6803 case V8SFmode:
6804 case V8SImode:
6805 case V32QImode:
6806 case V16HImode:
6807 case V4DFmode:
6808 case V4DImode:
6809 case TImode:
6810 case V16QImode:
6811 case V8HImode:
6812 case V4SImode:
6813 case V2DImode:
6814 case V4SFmode:
6815 case V2DFmode:
6816 if (!type || !AGGREGATE_TYPE_P (type))
6818 cum->sse_words += words;
6819 cum->sse_nregs -= 1;
6820 cum->sse_regno += 1;
6821 if (cum->sse_nregs <= 0)
6823 cum->sse_nregs = 0;
6824 cum->sse_regno = 0;
6827 break;
6829 case V8QImode:
6830 case V4HImode:
6831 case V2SImode:
6832 case V2SFmode:
6833 case V1TImode:
6834 case V1DImode:
6835 if (!type || !AGGREGATE_TYPE_P (type))
6837 cum->mmx_words += words;
6838 cum->mmx_nregs -= 1;
6839 cum->mmx_regno += 1;
6840 if (cum->mmx_nregs <= 0)
6842 cum->mmx_nregs = 0;
6843 cum->mmx_regno = 0;
6846 break;
6850 static void
6851 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6852 const_tree type, HOST_WIDE_INT words, bool named)
6854 int int_nregs, sse_nregs;
6856 /* Unnamed 256bit vector mode parameters are passed on stack. */
6857 if (!named && VALID_AVX256_REG_MODE (mode))
6858 return;
6860 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6861 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6863 cum->nregs -= int_nregs;
6864 cum->sse_nregs -= sse_nregs;
6865 cum->regno += int_nregs;
6866 cum->sse_regno += sse_nregs;
6868 else
6870 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6871 cum->words = (cum->words + align - 1) & ~(align - 1);
6872 cum->words += words;
6876 static void
6877 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6878 HOST_WIDE_INT words)
6880 /* Otherwise, this should be passed indirect. */
6881 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6883 cum->words += words;
6884 if (cum->nregs > 0)
6886 cum->nregs -= 1;
6887 cum->regno += 1;
6891 /* Update the data in CUM to advance over an argument of mode MODE and
6892 data type TYPE. (TYPE is null for libcalls where that information
6893 may not be available.) */
6895 static void
6896 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6897 const_tree type, bool named)
6899 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6900 HOST_WIDE_INT bytes, words;
6902 if (mode == BLKmode)
6903 bytes = int_size_in_bytes (type);
6904 else
6905 bytes = GET_MODE_SIZE (mode);
6906 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6908 if (type)
6909 mode = type_natural_mode (type, NULL);
6911 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6912 function_arg_advance_ms_64 (cum, bytes, words);
6913 else if (TARGET_64BIT)
6914 function_arg_advance_64 (cum, mode, type, words, named);
6915 else
6916 function_arg_advance_32 (cum, mode, type, bytes, words);
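/* For example (a sketch, assuming the SysV x86-64 ABI and no earlier
   arguments): advancing over the parameter list (int, double, int)
   consumes %rdi and %rsi from the integer registers and %xmm0 from the
   SSE registers, leaving cum->regno at 2 and cum->sse_regno at 1.  */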
6919 /* Define where to put the arguments to a function.
6920 Value is zero to push the argument on the stack,
6921 or a hard register in which to store the argument.
6923 MODE is the argument's machine mode.
6924 TYPE is the data type of the argument (as a tree).
6925 This is null for libcalls where that information may
6926 not be available.
6927 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6928 the preceding args and about the function being called.
6929 NAMED is nonzero if this argument is a named parameter
6930 (otherwise it is an extra parameter matching an ellipsis). */
6932 static rtx
6933 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6934 enum machine_mode orig_mode, const_tree type,
6935 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6937 static bool warnedsse, warnedmmx;
6939 /* Avoid the AL settings for the Unix64 ABI. */
6940 if (mode == VOIDmode)
6941 return constm1_rtx;
6943 switch (mode)
6945 default:
6946 break;
6948 case BLKmode:
6949 if (bytes < 0)
6950 break;
6951 /* FALLTHRU */
6952 case DImode:
6953 case SImode:
6954 case HImode:
6955 case QImode:
6956 if (words <= cum->nregs)
6958 int regno = cum->regno;
6960 /* Fastcall allocates the first two DWORD-sized (SImode) or
6961 smaller arguments to ECX and EDX, unless the argument is an
6962 aggregate type.  */
6963 if (cum->fastcall)
6965 if (mode == BLKmode
6966 || mode == DImode
6967 || (type && AGGREGATE_TYPE_P (type)))
6968 break;
6970 /* ECX, not EAX, is the first allocated register. */
6971 if (regno == AX_REG)
6972 regno = CX_REG;
6974 return gen_rtx_REG (mode, regno);
6976 break;
6978 case DFmode:
6979 if (cum->float_in_sse < 2)
6980 break;
6981 case SFmode:
6982 if (cum->float_in_sse < 1)
6983 break;
6984 /* FALLTHRU */
6985 case TImode:
6986 /* In 32bit, we pass TImode in xmm registers. */
6987 case V16QImode:
6988 case V8HImode:
6989 case V4SImode:
6990 case V2DImode:
6991 case V4SFmode:
6992 case V2DFmode:
6993 if (!type || !AGGREGATE_TYPE_P (type))
6995 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6997 warnedsse = true;
6998 warning (0, "SSE vector argument without SSE enabled "
6999 "changes the ABI");
7001 if (cum->sse_nregs)
7002 return gen_reg_or_parallel (mode, orig_mode,
7003 cum->sse_regno + FIRST_SSE_REG);
7005 break;
7007 case OImode:
7008 /* OImode shouldn't be used directly. */
7009 gcc_unreachable ();
7011 case V8SFmode:
7012 case V8SImode:
7013 case V32QImode:
7014 case V16HImode:
7015 case V4DFmode:
7016 case V4DImode:
7017 if (!type || !AGGREGATE_TYPE_P (type))
7019 if (cum->sse_nregs)
7020 return gen_reg_or_parallel (mode, orig_mode,
7021 cum->sse_regno + FIRST_SSE_REG);
7023 break;
7025 case V8QImode:
7026 case V4HImode:
7027 case V2SImode:
7028 case V2SFmode:
7029 case V1TImode:
7030 case V1DImode:
7031 if (!type || !AGGREGATE_TYPE_P (type))
7033 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
7035 warnedmmx = true;
7036 warning (0, "MMX vector argument without MMX enabled "
7037 "changes the ABI");
7039 if (cum->mmx_nregs)
7040 return gen_reg_or_parallel (mode, orig_mode,
7041 cum->mmx_regno + FIRST_MMX_REG);
7043 break;
7046 return NULL_RTX;
7049 static rtx
7050 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7051 enum machine_mode orig_mode, const_tree type, bool named)
7053 /* Handle a hidden AL argument containing number of registers
7054 for varargs x86-64 functions. */
7055 if (mode == VOIDmode)
7056 return GEN_INT (cum->maybe_vaarg
7057 ? (cum->sse_nregs < 0
7058 ? X86_64_SSE_REGPARM_MAX
7059 : cum->sse_regno)
7060 : -1);
7062 switch (mode)
7064 default:
7065 break;
7067 case V8SFmode:
7068 case V8SImode:
7069 case V32QImode:
7070 case V16HImode:
7071 case V4DFmode:
7072 case V4DImode:
7073 /* Unnamed 256bit vector mode parameters are passed on stack. */
7074 if (!named)
7075 return NULL;
7076 break;
7079 return construct_container (mode, orig_mode, type, 0, cum->nregs,
7080 cum->sse_nregs,
7081 &x86_64_int_parameter_registers [cum->regno],
7082 cum->sse_regno);
7085 static rtx
7086 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7087 enum machine_mode orig_mode, bool named,
7088 HOST_WIDE_INT bytes)
7090 unsigned int regno;
7092 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
7093 We use a value of -2 to specify that the current function call is MS ABI. */
7094 if (mode == VOIDmode)
7095 return GEN_INT (-2);
7097 /* If we've run out of registers, it goes on the stack. */
7098 if (cum->nregs == 0)
7099 return NULL_RTX;
7101 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
7103 /* Only floating point modes are passed in anything but integer regs. */
7104 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
7106 if (named)
7107 regno = cum->regno + FIRST_SSE_REG;
7108 else
7110 rtx t1, t2;
7112 /* Unnamed floating parameters are passed in both the
7113 SSE and integer registers. */
7114 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
7115 t2 = gen_rtx_REG (mode, regno);
7116 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
7117 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
7118 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
7121 /* Handle aggregate types passed in registers. */
7122 if (orig_mode == BLKmode)
7124 if (bytes > 0 && bytes <= 8)
7125 mode = (bytes > 4 ? DImode : SImode);
7126 if (mode == BLKmode)
7127 mode = DImode;
7130 return gen_reg_or_parallel (mode, orig_mode, regno);
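/* A sketch of the resulting convention (Microsoft x64 ABI): the first
   four arguments go in RCX, RDX, R8 and R9 (or XMM0-XMM3 for SFmode and
   DFmode), each argument consuming one slot regardless of its class, and
   an unnamed floating argument is described as living in both the SSE
   and the integer register so the callee may fetch it from either.  */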
7133 /* Return where to put the arguments to a function.
7134 Return zero to push the argument on the stack, or a hard register
7135 in which to store the argument.
7136 MODE is the argument's machine mode. TYPE is the data type of the
7137 argument. It is null for libcalls where that information may not be
7138 available. CUM gives information about the preceding args and about
7139 the function being called. NAMED is nonzero if this argument is a
7140 named parameter (otherwise it is an extra parameter matching an
7141 ellipsis). */
7143 static rtx
7144 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
7145 const_tree type, bool named)
7147 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7148 enum machine_mode mode = omode;
7149 HOST_WIDE_INT bytes, words;
7150 rtx arg;
7152 if (mode == BLKmode)
7153 bytes = int_size_in_bytes (type);
7154 else
7155 bytes = GET_MODE_SIZE (mode);
7156 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7158 /* To simplify the code below, represent vector types with a vector mode
7159 even if MMX/SSE are not active. */
7160 if (type && TREE_CODE (type) == VECTOR_TYPE)
7161 mode = type_natural_mode (type, cum);
7163 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7164 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
7165 else if (TARGET_64BIT)
7166 arg = function_arg_64 (cum, mode, omode, type, named);
7167 else
7168 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
7170 return arg;
7173 /* A C expression that indicates when an argument must be passed by
7174 reference. If nonzero for an argument, a copy of that argument is
7175 made in memory and a pointer to the argument is passed instead of
7176 the argument itself. The pointer is passed in whatever way is
7177 appropriate for passing a pointer to that type. */
7179 static bool
7180 ix86_pass_by_reference (cumulative_args_t cum_v, enum machine_mode mode,
7181 const_tree type, bool named ATTRIBUTE_UNUSED)
7183 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7185 /* See Windows x64 Software Convention. */
7186 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7188 int msize = (int) GET_MODE_SIZE (mode);
7189 if (type)
7191 /* Arrays are passed by reference. */
7192 if (TREE_CODE (type) == ARRAY_TYPE)
7193 return true;
7195 if (AGGREGATE_TYPE_P (type))
7197 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
7198 are passed by reference. */
7199 msize = int_size_in_bytes (type);
7203 /* __m128 is passed by reference. */
7204 switch (msize) {
7205 case 1: case 2: case 4: case 8:
7206 break;
7207 default:
7208 return true;
7211 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
7212 return true;
7214 return false;
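/* For instance, under the Microsoft x64 rules above a 16-byte aggregate
   such as __m128 (size not in {1, 2, 4, 8}) is replaced by a pointer to
   a caller-made copy, while on the SysV side only variable-sized types
   (int_size_in_bytes == -1) are forced through a pointer here.  */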
7217 /* Return true when TYPE should be 128bit aligned for 32bit argument
7218 passing ABI. XXX: This function is obsolete and is only used for
7219 checking psABI compatibility with previous versions of GCC. */
7221 static bool
7222 ix86_compat_aligned_value_p (const_tree type)
7224 enum machine_mode mode = TYPE_MODE (type);
7225 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
7226 || mode == TDmode
7227 || mode == TFmode
7228 || mode == TCmode)
7229 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
7230 return true;
7231 if (TYPE_ALIGN (type) < 128)
7232 return false;
7234 if (AGGREGATE_TYPE_P (type))
7236 /* Walk the aggregates recursively. */
7237 switch (TREE_CODE (type))
7239 case RECORD_TYPE:
7240 case UNION_TYPE:
7241 case QUAL_UNION_TYPE:
7243 tree field;
7245 /* Walk all the structure fields. */
7246 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7248 if (TREE_CODE (field) == FIELD_DECL
7249 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7250 return true;
7252 break;
7255 case ARRAY_TYPE:
7256 /* Just in case some languages pass arrays by value. */
7257 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7258 return true;
7259 break;
7261 default:
7262 gcc_unreachable ();
7265 return false;
7268 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7269 XXX: This function is obsolete and is only used for checking psABI
7270 compatibility with previous versions of GCC. */
7272 static unsigned int
7273 ix86_compat_function_arg_boundary (enum machine_mode mode,
7274 const_tree type, unsigned int align)
7276 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7277 natural boundaries. */
7278 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7280 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7281 make an exception for SSE modes since these require 128bit
7282 alignment.
7284 The handling here differs from field_alignment. ICC aligns MMX
7285 arguments to 4 byte boundaries, while structure fields are aligned
7286 to 8 byte boundaries. */
7287 if (!type)
7289 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7290 align = PARM_BOUNDARY;
7292 else
7294 if (!ix86_compat_aligned_value_p (type))
7295 align = PARM_BOUNDARY;
7298 if (align > BIGGEST_ALIGNMENT)
7299 align = BIGGEST_ALIGNMENT;
7300 return align;
7303 /* Return true when TYPE should be 128bit aligned for 32bit argument
7304 passing ABI. */
7306 static bool
7307 ix86_contains_aligned_value_p (const_tree type)
7309 enum machine_mode mode = TYPE_MODE (type);
7311 if (mode == XFmode || mode == XCmode)
7312 return false;
7314 if (TYPE_ALIGN (type) < 128)
7315 return false;
7317 if (AGGREGATE_TYPE_P (type))
7319 /* Walk the aggregates recursively. */
7320 switch (TREE_CODE (type))
7322 case RECORD_TYPE:
7323 case UNION_TYPE:
7324 case QUAL_UNION_TYPE:
7326 tree field;
7328 /* Walk all the structure fields. */
7329 for (field = TYPE_FIELDS (type);
7330 field;
7331 field = DECL_CHAIN (field))
7333 if (TREE_CODE (field) == FIELD_DECL
7334 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7335 return true;
7337 break;
7340 case ARRAY_TYPE:
7341 /* Just in case some languages pass arrays by value. */
7342 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7343 return true;
7344 break;
7346 default:
7347 gcc_unreachable ();
7350 else
7351 return TYPE_ALIGN (type) >= 128;
7353 return false;
7356 /* Gives the alignment boundary, in bits, of an argument with the
7357 specified mode and type. */
7359 static unsigned int
7360 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7362 unsigned int align;
7363 if (type)
7365 /* The main variant type is used for the call, so convert the
7366 argument type to its main variant. */
7367 type = TYPE_MAIN_VARIANT (type);
7368 align = TYPE_ALIGN (type);
7370 else
7371 align = GET_MODE_ALIGNMENT (mode);
7372 if (align < PARM_BOUNDARY)
7373 align = PARM_BOUNDARY;
7374 else
7376 static bool warned;
7377 unsigned int saved_align = align;
7379 if (!TARGET_64BIT)
7381 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7382 if (!type)
7384 if (mode == XFmode || mode == XCmode)
7385 align = PARM_BOUNDARY;
7387 else if (!ix86_contains_aligned_value_p (type))
7388 align = PARM_BOUNDARY;
7390 if (align < 128)
7391 align = PARM_BOUNDARY;
7394 if (warn_psabi
7395 && !warned
7396 && align != ix86_compat_function_arg_boundary (mode, type,
7397 saved_align))
7399 warned = true;
7400 inform (input_location,
7401 "The ABI for passing parameters with %d-byte"
7402 " alignment has changed in GCC 4.6",
7403 align / BITS_PER_UNIT);
7407 return align;
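/* Illustrative outcomes of the boundary computation above (32-bit,
   default options; names are only an example): a plain double stays at
   PARM_BOUNDARY (32 bits), while a 16-byte-aligned type such as __m128
   is kept at 128 bits, possibly triggering the -Wpsabi note about the
   GCC 4.6 change when the old rules would have chosen differently.  */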
7410 /* Return true if N is a possible register number of function value. */
7412 static bool
7413 ix86_function_value_regno_p (const unsigned int regno)
7415 switch (regno)
7417 case AX_REG:
7418 case DX_REG:
7419 return true;
7420 case DI_REG:
7421 case SI_REG:
7422 return TARGET_64BIT && ix86_abi != MS_ABI;
7424 /* Complex values are returned in %st(0)/%st(1) pair. */
7425 case ST0_REG:
7426 case ST1_REG:
7427 /* TODO: The function should depend on current function ABI but
7428 builtins.c would need updating then. Therefore we use the
7429 default ABI. */
7430 if (TARGET_64BIT && ix86_abi == MS_ABI)
7431 return false;
7432 return TARGET_FLOAT_RETURNS_IN_80387;
7434 /* Complex values are returned in %xmm0/%xmm1 pair. */
7435 case XMM0_REG:
7436 case XMM1_REG:
7437 return TARGET_SSE;
7439 case MM0_REG:
7440 if (TARGET_MACHO || TARGET_64BIT)
7441 return false;
7442 return TARGET_MMX;
7445 return false;
7448 /* Define how to find the value returned by a function.
7449 VALTYPE is the data type of the value (as a tree).
7450 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7451 otherwise, FUNC is 0. */
7453 static rtx
7454 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7455 const_tree fntype, const_tree fn)
7457 unsigned int regno;
7459 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7460 we normally prevent this case when mmx is not available. However
7461 some ABIs may require the result to be returned like DImode. */
7462 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7463 regno = FIRST_MMX_REG;
7465 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7466 we prevent this case when sse is not available. However some ABIs
7467 may require the result to be returned like integer TImode. */
7468 else if (mode == TImode
7469 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7470 regno = FIRST_SSE_REG;
7472 /* 32-byte vector modes in %ymm0. */
7473 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7474 regno = FIRST_SSE_REG;
7476 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7477 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7478 regno = FIRST_FLOAT_REG;
7479 else
7480 /* Most things go in %eax. */
7481 regno = AX_REG;
7483 /* Override FP return register with %xmm0 for local functions when
7484 SSE math is enabled or for functions with sseregparm attribute. */
7485 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7487 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7488 if ((sse_level >= 1 && mode == SFmode)
7489 || (sse_level == 2 && mode == DFmode))
7490 regno = FIRST_SSE_REG;
7493 /* OImode shouldn't be used directly. */
7494 gcc_assert (mode != OImode);
7496 return gen_rtx_REG (orig_mode, regno);
7499 static rtx
7500 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7501 const_tree valtype)
7503 rtx ret;
7505 /* Handle libcalls, which don't provide a type node. */
7506 if (valtype == NULL)
7508 unsigned int regno;
7510 switch (mode)
7512 case SFmode:
7513 case SCmode:
7514 case DFmode:
7515 case DCmode:
7516 case TFmode:
7517 case SDmode:
7518 case DDmode:
7519 case TDmode:
7520 regno = FIRST_SSE_REG;
7521 break;
7522 case XFmode:
7523 case XCmode:
7524 regno = FIRST_FLOAT_REG;
7525 break;
7526 case TCmode:
7527 return NULL;
7528 default:
7529 regno = AX_REG;
7532 return gen_rtx_REG (mode, regno);
7534 else if (POINTER_TYPE_P (valtype)
7535 && !upc_shared_type_p (TREE_TYPE (valtype)))
7537 /* Pointers are always returned in word_mode. */
7538 mode = word_mode;
7541 ret = construct_container (mode, orig_mode, valtype, 1,
7542 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7543 x86_64_int_return_registers, 0);
7545 /* For zero sized structures, construct_container returns NULL, but we
7546 need to keep rest of compiler happy by returning meaningful value. */
7547 if (!ret)
7548 ret = gen_rtx_REG (orig_mode, AX_REG);
7550 return ret;
7553 static rtx
7554 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode,
7555 const_tree valtype)
7557 unsigned int regno = AX_REG;
7559 if (TARGET_SSE)
7561 switch (GET_MODE_SIZE (mode))
7563 case 16:
7564 if (valtype != NULL_TREE
7565 && !VECTOR_INTEGER_TYPE_P (valtype)
7567 && !INTEGRAL_TYPE_P (valtype)
7568 && !VECTOR_FLOAT_TYPE_P (valtype))
7569 break;
7570 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7571 && !COMPLEX_MODE_P (mode))
7572 regno = FIRST_SSE_REG;
7573 break;
7574 case 8:
7575 case 4:
7576 if (mode == SFmode || mode == DFmode)
7577 regno = FIRST_SSE_REG;
7578 break;
7579 default:
7580 break;
7583 return gen_rtx_REG (orig_mode, regno);
7586 static rtx
7587 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7588 enum machine_mode orig_mode, enum machine_mode mode)
7590 const_tree fn, fntype;
7592 fn = NULL_TREE;
7593 if (fntype_or_decl && DECL_P (fntype_or_decl))
7594 fn = fntype_or_decl;
7595 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7597 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7598 return function_value_ms_64 (orig_mode, mode, valtype);
7599 else if (TARGET_64BIT)
7600 return function_value_64 (orig_mode, mode, valtype);
7601 else
7602 return function_value_32 (orig_mode, mode, fntype, fn);
7605 static rtx
7606 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7607 bool outgoing ATTRIBUTE_UNUSED)
7609 enum machine_mode mode, orig_mode;
7611 orig_mode = TYPE_MODE (valtype);
7612 mode = type_natural_mode (valtype, NULL);
7613 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7616 /* Pointer function arguments and return values are promoted to
7617 word_mode. */
7619 static enum machine_mode
7620 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7621 int *punsignedp, const_tree fntype,
7622 int for_return)
7624 if (type != NULL_TREE && POINTER_TYPE_P (type))
7626 if (upc_shared_type_p (TREE_TYPE (type)))
7628 *punsignedp = 1;
7629 return TYPE_MODE (upc_pts_rep_type_node);
7631 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7632 return word_mode;
7634 return default_promote_function_mode (type, mode, punsignedp, fntype,
7635 for_return);
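/* A short example of the promotion rule above (an assumption about the
   x32 case, not exercised here): with 32-bit pointers on a 64-bit
   target, a char * argument or return value is widened to word_mode
   (DImode) with unsigned extension, while a UPC pointer-to-shared keeps
   the representation mode of the UPC runtime's pointer type.  */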
7638 /* Return true if a structure, union or array with MODE containing FIELD
7639 should be accessed using BLKmode. */
7641 static bool
7642 ix86_member_type_forces_blk (const_tree field, enum machine_mode mode)
7644 /* Union with XFmode must be in BLKmode. */
7645 return (mode == XFmode
7646 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
7647 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
7651 ix86_libcall_value (enum machine_mode mode)
7653 return ix86_function_value_1 (NULL, NULL, mode, mode);
7656 /* Return true iff type is returned in memory. */
7658 static bool ATTRIBUTE_UNUSED
7659 return_in_memory_32 (const_tree type, enum machine_mode mode)
7661 HOST_WIDE_INT size;
7663 if (mode == BLKmode)
7664 return true;
7666 size = int_size_in_bytes (type);
7668 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7669 return false;
7671 if (VECTOR_MODE_P (mode) || mode == TImode)
7673 /* User-created vectors small enough to fit in EAX. */
7674 if (size < 8)
7675 return false;
7677 /* MMX/3dNow values are returned in MM0,
7678 except when it doesn't exist or the ABI prescribes otherwise. */
7679 if (size == 8)
7680 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7682 /* SSE values are returned in XMM0, except when it doesn't exist. */
7683 if (size == 16)
7684 return !TARGET_SSE;
7686 /* AVX values are returned in YMM0, except when it doesn't exist. */
7687 if (size == 32)
7688 return !TARGET_AVX;
7691 if (mode == XFmode)
7692 return false;
7694 if (size > 12)
7695 return true;
7697 /* OImode shouldn't be used directly. */
7698 gcc_assert (mode != OImode);
7700 return false;
7703 static bool ATTRIBUTE_UNUSED
7704 return_in_memory_64 (const_tree type, enum machine_mode mode)
7706 int needed_intregs, needed_sseregs;
7707 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7710 static bool ATTRIBUTE_UNUSED
7711 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7713 HOST_WIDE_INT size = int_size_in_bytes (type);
7715 /* __m128 is returned in xmm0. */
7716 if ((!type || VECTOR_INTEGER_TYPE_P (type) || INTEGRAL_TYPE_P (type)
7717 || VECTOR_FLOAT_TYPE_P (type))
7718 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7719 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7720 return false;
7722 /* Otherwise, the size must be exactly 1, 2, 4, or 8 bytes. */
7723 return size != 1 && size != 2 && size != 4 && size != 8;
7726 static bool
7727 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7729 #ifdef SUBTARGET_RETURN_IN_MEMORY
7730 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7731 #else
7732 const enum machine_mode mode = type_natural_mode (type, NULL);
7734 if (TARGET_64BIT)
7736 if (ix86_function_type_abi (fntype) == MS_ABI)
7737 return return_in_memory_ms_64 (type, mode);
7738 else
7739 return return_in_memory_64 (type, mode);
7741 else
7742 return return_in_memory_32 (type, mode);
7743 #endif
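/* Sketches of the 32-bit decisions above (hypothetical inputs): a
   BLKmode struct always goes to memory; an 8-byte vector is returned in
   %mm0 only when MMX is enabled and TARGET_VECT8_RETURNS is not set; and
   an XFmode long double is never forced to memory, since it is returned
   in %st(0).  */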
7746 /* When returning SSE vector types, we have a choice of either
7747 (1) being abi incompatible with a -march switch, or
7748 (2) generating an error.
7749 Given no good solution, I think the safest thing is one warning.
7750 The user won't be able to use -Werror, but....
7752 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7753 called in response to actually generating a caller or callee that
7754 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7755 via aggregate_value_p for general type probing from tree-ssa. */
7757 static rtx
7758 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7760 static bool warnedsse, warnedmmx;
7762 if (!TARGET_64BIT && type)
7764 /* Look at the return type of the function, not the function type. */
7765 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7767 if (!TARGET_SSE && !warnedsse)
7769 if (mode == TImode
7770 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7772 warnedsse = true;
7773 warning (0, "SSE vector return without SSE enabled "
7774 "changes the ABI");
7778 if (!TARGET_MMX && !warnedmmx)
7780 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7782 warnedmmx = true;
7783 warning (0, "MMX vector return without MMX enabled "
7784 "changes the ABI");
7789 return NULL;
7793 /* Create the va_list data type. */
7795 /* Returns the calling-convention-specific va_list data type.
7796 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7798 static tree
7799 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7801 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7803 /* For i386 we use plain pointer to argument area. */
7804 if (!TARGET_64BIT || abi == MS_ABI)
7805 return build_pointer_type (char_type_node);
7807 record = lang_hooks.types.make_type (RECORD_TYPE);
7808 type_decl = build_decl (BUILTINS_LOCATION,
7809 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7811 f_gpr = build_decl (BUILTINS_LOCATION,
7812 FIELD_DECL, get_identifier ("gp_offset"),
7813 unsigned_type_node);
7814 f_fpr = build_decl (BUILTINS_LOCATION,
7815 FIELD_DECL, get_identifier ("fp_offset"),
7816 unsigned_type_node);
7817 f_ovf = build_decl (BUILTINS_LOCATION,
7818 FIELD_DECL, get_identifier ("overflow_arg_area"),
7819 ptr_type_node);
7820 f_sav = build_decl (BUILTINS_LOCATION,
7821 FIELD_DECL, get_identifier ("reg_save_area"),
7822 ptr_type_node);
7824 va_list_gpr_counter_field = f_gpr;
7825 va_list_fpr_counter_field = f_fpr;
7827 DECL_FIELD_CONTEXT (f_gpr) = record;
7828 DECL_FIELD_CONTEXT (f_fpr) = record;
7829 DECL_FIELD_CONTEXT (f_ovf) = record;
7830 DECL_FIELD_CONTEXT (f_sav) = record;
7832 TYPE_STUB_DECL (record) = type_decl;
7833 TYPE_NAME (record) = type_decl;
7834 TYPE_FIELDS (record) = f_gpr;
7835 DECL_CHAIN (f_gpr) = f_fpr;
7836 DECL_CHAIN (f_fpr) = f_ovf;
7837 DECL_CHAIN (f_ovf) = f_sav;
7839 layout_type (record);
7841 /* The correct type is an array type of one element. */
7842 return build_array_type (record, build_index_type (size_zero_node));
7845 /* Setup the builtin va_list data type and for 64-bit the additional
7846 calling convention specific va_list data types. */
7848 static tree
7849 ix86_build_builtin_va_list (void)
7851 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7853 /* Initialize abi specific va_list builtin types. */
7854 if (TARGET_64BIT)
7856 tree t;
7857 if (ix86_abi == MS_ABI)
7859 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7860 if (TREE_CODE (t) != RECORD_TYPE)
7861 t = build_variant_type_copy (t);
7862 sysv_va_list_type_node = t;
7864 else
7866 t = ret;
7867 if (TREE_CODE (t) != RECORD_TYPE)
7868 t = build_variant_type_copy (t);
7869 sysv_va_list_type_node = t;
7871 if (ix86_abi != MS_ABI)
7873 t = ix86_build_builtin_va_list_abi (MS_ABI);
7874 if (TREE_CODE (t) != RECORD_TYPE)
7875 t = build_variant_type_copy (t);
7876 ms_va_list_type_node = t;
7878 else
7880 t = ret;
7881 if (TREE_CODE (t) != RECORD_TYPE)
7882 t = build_variant_type_copy (t);
7883 ms_va_list_type_node = t;
7887 return ret;
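/* The record built above corresponds to the familiar SysV x86-64
   va_list layout, roughly (a sketch; the compiler builds the type
   directly as trees above):

       typedef struct __va_list_tag {
         unsigned int gp_offset;
         unsigned int fp_offset;
         void *overflow_arg_area;
         void *reg_save_area;
       } __builtin_va_list[1];
 */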
7890 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7892 static void
7893 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7895 rtx save_area, mem;
7896 alias_set_type set;
7897 int i, max;
7899 /* GPR size of varargs save area. */
7900 if (cfun->va_list_gpr_size)
7901 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7902 else
7903 ix86_varargs_gpr_size = 0;
7905 /* FPR size of varargs save area. We don't need it if we don't pass
7906 anything in SSE registers. */
7907 if (TARGET_SSE && cfun->va_list_fpr_size)
7908 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7909 else
7910 ix86_varargs_fpr_size = 0;
7912 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7913 return;
7915 save_area = frame_pointer_rtx;
7916 set = get_varargs_alias_set ();
7918 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7919 if (max > X86_64_REGPARM_MAX)
7920 max = X86_64_REGPARM_MAX;
7922 for (i = cum->regno; i < max; i++)
7924 mem = gen_rtx_MEM (word_mode,
7925 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
7926 MEM_NOTRAP_P (mem) = 1;
7927 set_mem_alias_set (mem, set);
7928 emit_move_insn (mem,
7929 gen_rtx_REG (word_mode,
7930 x86_64_int_parameter_registers[i]));
7933 if (ix86_varargs_fpr_size)
7935 enum machine_mode smode;
7936 rtx label, test;
7938 /* Now emit code to save SSE registers. The AX parameter contains number
7939 of SSE parameter registers used to call this function, though all we
7940 actually check here is the zero/non-zero status. */
7942 label = gen_label_rtx ();
7943 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7944 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7945 label));
7947 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7948 we used movdqa (i.e. TImode) instead? Perhaps even better would
7949 be if we could determine the real mode of the data, via a hook
7950 into pass_stdarg. Ignore all that for now. */
7951 smode = V4SFmode;
7952 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7953 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7955 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7956 if (max > X86_64_SSE_REGPARM_MAX)
7957 max = X86_64_SSE_REGPARM_MAX;
7959 for (i = cum->sse_regno; i < max; ++i)
7961 mem = plus_constant (Pmode, save_area,
7962 i * 16 + ix86_varargs_gpr_size);
7963 mem = gen_rtx_MEM (smode, mem);
7964 MEM_NOTRAP_P (mem) = 1;
7965 set_mem_alias_set (mem, set);
7966 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7968 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7971 emit_label (label);
7975 static void
7976 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7978 alias_set_type set = get_varargs_alias_set ();
7979 int i;
7981 /* Reset to zero, as there might have been a SysV va_arg used
7982 before. */
7983 ix86_varargs_gpr_size = 0;
7984 ix86_varargs_fpr_size = 0;
7986 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7988 rtx reg, mem;
7990 mem = gen_rtx_MEM (Pmode,
7991 plus_constant (Pmode, virtual_incoming_args_rtx,
7992 i * UNITS_PER_WORD));
7993 MEM_NOTRAP_P (mem) = 1;
7994 set_mem_alias_set (mem, set);
7996 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7997 emit_move_insn (mem, reg);
8001 static void
8002 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
8003 tree type, int *pretend_size ATTRIBUTE_UNUSED,
8004 int no_rtl)
8006 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8007 CUMULATIVE_ARGS next_cum;
8008 tree fntype;
8010 /* This argument doesn't appear to be used anymore, which is good,
8011 because the old code here didn't suppress rtl generation. */
8012 gcc_assert (!no_rtl);
8014 if (!TARGET_64BIT)
8015 return;
8017 fntype = TREE_TYPE (current_function_decl);
8019 /* For varargs, we do not want to skip the dummy va_dcl argument.
8020 For stdargs, we do want to skip the last named argument. */
8021 next_cum = *cum;
8022 if (stdarg_p (fntype))
8023 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
8024 true);
8026 if (cum->call_abi == MS_ABI)
8027 setup_incoming_varargs_ms_64 (&next_cum);
8028 else
8029 setup_incoming_varargs_64 (&next_cum);
8032 /* Checks if TYPE is of kind va_list char *. */
8034 static bool
8035 is_va_list_char_pointer (tree type)
8037 tree canonic;
8039 /* For 32-bit it is always true. */
8040 if (!TARGET_64BIT)
8041 return true;
8042 canonic = ix86_canonical_va_list_type (type);
8043 return (canonic == ms_va_list_type_node
8044 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
8047 /* Implement va_start. */
8049 static void
8050 ix86_va_start (tree valist, rtx nextarg)
8052 HOST_WIDE_INT words, n_gpr, n_fpr;
8053 tree f_gpr, f_fpr, f_ovf, f_sav;
8054 tree gpr, fpr, ovf, sav, t;
8055 tree type;
8056 rtx ovf_rtx;
8058 if (flag_split_stack
8059 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8061 unsigned int scratch_regno;
8063 /* When we are splitting the stack, we can't refer to the stack
8064 arguments using internal_arg_pointer, because they may be on
8065 the old stack. The split stack prologue will arrange to
8066 leave a pointer to the old stack arguments in a scratch
8067 register, which we here copy to a pseudo-register. The split
8068 stack prologue can't set the pseudo-register directly because
8069 it (the prologue) runs before any registers have been saved. */
8071 scratch_regno = split_stack_prologue_scratch_regno ();
8072 if (scratch_regno != INVALID_REGNUM)
8074 rtx reg, seq;
8076 reg = gen_reg_rtx (Pmode);
8077 cfun->machine->split_stack_varargs_pointer = reg;
8079 start_sequence ();
8080 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
8081 seq = get_insns ();
8082 end_sequence ();
8084 push_topmost_sequence ();
8085 emit_insn_after (seq, entry_of_function ());
8086 pop_topmost_sequence ();
8090 /* Only 64bit target needs something special. */
8091 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
8093 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8094 std_expand_builtin_va_start (valist, nextarg);
8095 else
8097 rtx va_r, next;
8099 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
8100 next = expand_binop (ptr_mode, add_optab,
8101 cfun->machine->split_stack_varargs_pointer,
8102 crtl->args.arg_offset_rtx,
8103 NULL_RTX, 0, OPTAB_LIB_WIDEN);
8104 convert_move (va_r, next, 0);
8106 return;
8109 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
8110 f_fpr = DECL_CHAIN (f_gpr);
8111 f_ovf = DECL_CHAIN (f_fpr);
8112 f_sav = DECL_CHAIN (f_ovf);
8114 valist = build_simple_mem_ref (valist);
8115 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
8116 /* The following should be folded into the MEM_REF offset. */
8117 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
8118 f_gpr, NULL_TREE);
8119 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
8120 f_fpr, NULL_TREE);
8121 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
8122 f_ovf, NULL_TREE);
8123 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
8124 f_sav, NULL_TREE);
8126 /* Count number of gp and fp argument registers used. */
8127 words = crtl->args.info.words;
8128 n_gpr = crtl->args.info.regno;
8129 n_fpr = crtl->args.info.sse_regno;
8131 if (cfun->va_list_gpr_size)
8133 type = TREE_TYPE (gpr);
8134 t = build2 (MODIFY_EXPR, type,
8135 gpr, build_int_cst (type, n_gpr * 8));
8136 TREE_SIDE_EFFECTS (t) = 1;
8137 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8140 if (TARGET_SSE && cfun->va_list_fpr_size)
8142 type = TREE_TYPE (fpr);
8143 t = build2 (MODIFY_EXPR, type, fpr,
8144 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
8145 TREE_SIDE_EFFECTS (t) = 1;
8146 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8149 /* Find the overflow area. */
8150 type = TREE_TYPE (ovf);
8151 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8152 ovf_rtx = crtl->args.internal_arg_pointer;
8153 else
8154 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
8155 t = make_tree (type, ovf_rtx);
8156 if (words != 0)
8157 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
8158 t = build2 (MODIFY_EXPR, type, ovf, t);
8159 TREE_SIDE_EFFECTS (t) = 1;
8160 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8162 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
8164 /* Find the register save area.
8165 The function prologue saves it right above the stack frame. */
8166 type = TREE_TYPE (sav);
8167 t = make_tree (type, frame_pointer_rtx);
8168 if (!ix86_varargs_gpr_size)
8169 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
8170 t = build2 (MODIFY_EXPR, type, sav, t);
8171 TREE_SIDE_EFFECTS (t) = 1;
8172 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
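/* Numeric sketch of the va_start expansion above: for a function such
   as int f (int a, ...) one GP register has been used for the named
   argument, so va_start stores gp_offset = 8 and
   fp_offset = 8 * X86_64_REGPARM_MAX = 48 (no SSE registers consumed),
   points overflow_arg_area at the first stack argument, and points
   reg_save_area at the register save block set up by the prologue.  */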
8176 /* Implement va_arg. */
8178 static tree
8179 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
8180 gimple_seq *post_p)
8182 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
8183 tree f_gpr, f_fpr, f_ovf, f_sav;
8184 tree gpr, fpr, ovf, sav, t;
8185 int size, rsize;
8186 tree lab_false, lab_over = NULL_TREE;
8187 tree addr, t2;
8188 rtx container;
8189 int indirect_p = 0;
8190 tree ptrtype;
8191 enum machine_mode nat_mode;
8192 unsigned int arg_boundary;
8194 /* Only 64bit target needs something special. */
8195 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
8196 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
8198 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
8199 f_fpr = DECL_CHAIN (f_gpr);
8200 f_ovf = DECL_CHAIN (f_fpr);
8201 f_sav = DECL_CHAIN (f_ovf);
8203 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
8204 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
8205 valist = build_va_arg_indirect_ref (valist);
8206 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
8207 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
8208 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
8210 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
8211 if (indirect_p)
8212 type = build_pointer_type (type);
8213 size = int_size_in_bytes (type);
8214 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
8216 nat_mode = type_natural_mode (type, NULL);
8217 switch (nat_mode)
8219 case V8SFmode:
8220 case V8SImode:
8221 case V32QImode:
8222 case V16HImode:
8223 case V4DFmode:
8224 case V4DImode:
8225 /* Unnamed 256bit vector mode parameters are passed on stack. */
8226 if (!TARGET_64BIT_MS_ABI)
8228 container = NULL;
8229 break;
8232 default:
8233 container = construct_container (nat_mode, TYPE_MODE (type),
8234 type, 0, X86_64_REGPARM_MAX,
8235 X86_64_SSE_REGPARM_MAX, intreg,
8237 break;
8240 /* Pull the value out of the saved registers. */
8242 addr = create_tmp_var (ptr_type_node, "addr");
8244 if (container)
8246 int needed_intregs, needed_sseregs;
8247 bool need_temp;
8248 tree int_addr, sse_addr;
8250 lab_false = create_artificial_label (UNKNOWN_LOCATION);
8251 lab_over = create_artificial_label (UNKNOWN_LOCATION);
8253 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
8255 need_temp = (!REG_P (container)
8256 && ((needed_intregs && TYPE_ALIGN (type) > 64)
8257 || TYPE_ALIGN (type) > 128));
8259 /* If we are passing a structure, verify that it is a consecutive block
8260 in the register save area. If not, we need to do moves. */
8261 if (!need_temp && !REG_P (container))
8263 /* Verify that all registers are strictly consecutive. */
8264 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
8266 int i;
8268 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8270 rtx slot = XVECEXP (container, 0, i);
8271 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
8272 || INTVAL (XEXP (slot, 1)) != i * 16)
8273 need_temp = 1;
8276 else
8278 int i;
8280 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8282 rtx slot = XVECEXP (container, 0, i);
8283 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8284 || INTVAL (XEXP (slot, 1)) != i * 8)
8285 need_temp = 1;
8289 if (!need_temp)
8291 int_addr = addr;
8292 sse_addr = addr;
8294 else
8296 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8297 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8300 /* First ensure that we fit completely in registers. */
8301 if (needed_intregs)
8303 t = build_int_cst (TREE_TYPE (gpr),
8304 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8305 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8306 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8307 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8308 gimplify_and_add (t, pre_p);
8310 if (needed_sseregs)
8312 t = build_int_cst (TREE_TYPE (fpr),
8313 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8314 + X86_64_REGPARM_MAX * 8);
8315 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8316 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8317 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8318 gimplify_and_add (t, pre_p);
8321 /* Compute index to start of area used for integer regs. */
8322 if (needed_intregs)
8324 /* int_addr = gpr + sav; */
8325 t = fold_build_pointer_plus (sav, gpr);
8326 gimplify_assign (int_addr, t, pre_p);
8328 if (needed_sseregs)
8330 /* sse_addr = fpr + sav; */
8331 t = fold_build_pointer_plus (sav, fpr);
8332 gimplify_assign (sse_addr, t, pre_p);
8334 if (need_temp)
8336 int i, prev_size = 0;
8337 tree temp = create_tmp_var (type, "va_arg_tmp");
8339 /* addr = &temp; */
8340 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8341 gimplify_assign (addr, t, pre_p);
8343 for (i = 0; i < XVECLEN (container, 0); i++)
8345 rtx slot = XVECEXP (container, 0, i);
8346 rtx reg = XEXP (slot, 0);
8347 enum machine_mode mode = GET_MODE (reg);
8348 tree piece_type;
8349 tree addr_type;
8350 tree daddr_type;
8351 tree src_addr, src;
8352 int src_offset;
8353 tree dest_addr, dest;
8354 int cur_size = GET_MODE_SIZE (mode);
8356 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8357 prev_size = INTVAL (XEXP (slot, 1));
8358 if (prev_size + cur_size > size)
8360 cur_size = size - prev_size;
8361 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8362 if (mode == BLKmode)
8363 mode = QImode;
8365 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8366 if (mode == GET_MODE (reg))
8367 addr_type = build_pointer_type (piece_type);
8368 else
8369 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8370 true);
8371 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8372 true);
8374 if (SSE_REGNO_P (REGNO (reg)))
8376 src_addr = sse_addr;
8377 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8379 else
8381 src_addr = int_addr;
8382 src_offset = REGNO (reg) * 8;
8384 src_addr = fold_convert (addr_type, src_addr);
8385 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8387 dest_addr = fold_convert (daddr_type, addr);
8388 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8389 if (cur_size == GET_MODE_SIZE (mode))
8391 src = build_va_arg_indirect_ref (src_addr);
8392 dest = build_va_arg_indirect_ref (dest_addr);
8394 gimplify_assign (dest, src, pre_p);
8396 else
8398 tree copy
8399 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8400 3, dest_addr, src_addr,
8401 size_int (cur_size));
8402 gimplify_and_add (copy, pre_p);
8404 prev_size += cur_size;
8408 if (needed_intregs)
8410 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8411 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8412 gimplify_assign (gpr, t, pre_p);
8415 if (needed_sseregs)
8417 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8418 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8419 gimplify_assign (fpr, t, pre_p);
8422 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8424 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8427 /* ... otherwise out of the overflow area. */
8429 /* When the caller aligns a parameter on the stack, an alignment
8430 beyond MAX_SUPPORTED_STACK_ALIGNMENT is clamped to
8431 MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee with the
8432 caller here. */
8433 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8434 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8435 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8437 /* Care for on-stack alignment if needed. */
8438 if (arg_boundary <= 64 || size == 0)
8439 t = ovf;
8440 else
8442 HOST_WIDE_INT align = arg_boundary / 8;
8443 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8444 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8445 build_int_cst (TREE_TYPE (t), -align));
8448 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8449 gimplify_assign (addr, t, pre_p);
8451 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8452 gimplify_assign (unshare_expr (ovf), t, pre_p);
8454 if (container)
8455 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8457 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8458 addr = fold_convert (ptrtype, addr);
8460 if (indirect_p)
8461 addr = build_va_arg_indirect_ref (addr);
8462 return build_va_arg_indirect_ref (addr);
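/* A sketch of the code gimplified above for va_arg (ap, double): test
   fp_offset against (8 - 1 + 1) * 16 + 6 * 8 = 176; if it is smaller,
   load the value from reg_save_area + fp_offset and add 16 to
   fp_offset, otherwise take it from overflow_arg_area and advance that
   pointer by a word-rounded 8 bytes.  */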
8465 /* Return true if OPNUM's MEM should be matched
8466 in movabs* patterns. */
8468 bool
8469 ix86_check_movabs (rtx insn, int opnum)
8471 rtx set, mem;
8473 set = PATTERN (insn);
8474 if (GET_CODE (set) == PARALLEL)
8475 set = XVECEXP (set, 0, 0);
8476 gcc_assert (GET_CODE (set) == SET);
8477 mem = XEXP (set, opnum);
8478 while (GET_CODE (mem) == SUBREG)
8479 mem = SUBREG_REG (mem);
8480 gcc_assert (MEM_P (mem));
8481 return volatile_ok || !MEM_VOLATILE_P (mem);
8484 /* Initialize the table of extra 80387 mathematical constants. */
8486 static void
8487 init_ext_80387_constants (void)
8489 static const char * cst[5] =
8491 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8492 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8493 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8494 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8495 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8497 int i;
8499 for (i = 0; i < 5; i++)
8501 real_from_string (&ext_80387_constants_table[i], cst[i]);
8502 /* Ensure each constant is rounded to XFmode precision. */
8503 real_convert (&ext_80387_constants_table[i],
8504 XFmode, &ext_80387_constants_table[i]);
8507 ext_80387_constants_init = 1;
8510 /* Return non-zero if the constant is something that
8511 can be loaded with a special instruction. */
8514 standard_80387_constant_p (rtx x)
8516 enum machine_mode mode = GET_MODE (x);
8518 REAL_VALUE_TYPE r;
8520 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8521 return -1;
8523 if (x == CONST0_RTX (mode))
8524 return 1;
8525 if (x == CONST1_RTX (mode))
8526 return 2;
8528 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8530 /* For XFmode constants, try to find a special 80387 instruction when
8531 optimizing for size or on those CPUs that benefit from them. */
8532 if (mode == XFmode
8533 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8535 int i;
8537 if (! ext_80387_constants_init)
8538 init_ext_80387_constants ();
8540 for (i = 0; i < 5; i++)
8541 if (real_identical (&r, &ext_80387_constants_table[i]))
8542 return i + 3;
8545 /* Load of the constant -0.0 or -1.0 will be split as
8546 fldz;fchs or fld1;fchs sequence. */
8547 if (real_isnegzero (&r))
8548 return 8;
8549 if (real_identical (&r, &dconstm1))
8550 return 9;
8552 return 0;
8555 /* Return the opcode of the special instruction to be used to load
8556 the constant X. */
8558 const char *
8559 standard_80387_constant_opcode (rtx x)
8561 switch (standard_80387_constant_p (x))
8563 case 1:
8564 return "fldz";
8565 case 2:
8566 return "fld1";
8567 case 3:
8568 return "fldlg2";
8569 case 4:
8570 return "fldln2";
8571 case 5:
8572 return "fldl2e";
8573 case 6:
8574 return "fldl2t";
8575 case 7:
8576 return "fldpi";
8577 case 8:
8578 case 9:
8579 return "#";
8580 default:
8581 gcc_unreachable ();
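/* Example mapping for the two routines above: standard_80387_constant_p
   on CONST1_RTX (XFmode) yields 2, which standard_80387_constant_opcode
   turns into "fld1"; an XFmode constant equal to log10(2) yields 3 and
   thus "fldlg2", when XFmode constants are eligible (optimizing for
   size or TARGET_EXT_80387_CONSTANTS).  */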
8585 /* Return the CONST_DOUBLE representing the 80387 constant that is
8586 loaded by the specified special instruction. The argument IDX
8587 matches the return value from standard_80387_constant_p. */
8590 standard_80387_constant_rtx (int idx)
8592 int i;
8594 if (! ext_80387_constants_init)
8595 init_ext_80387_constants ();
8597 switch (idx)
8599 case 3:
8600 case 4:
8601 case 5:
8602 case 6:
8603 case 7:
8604 i = idx - 3;
8605 break;
8607 default:
8608 gcc_unreachable ();
8611 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8612 XFmode);
8615 /* Return 1 if X is all 0s and 2 if X is all 1s
8616 in a supported SSE/AVX vector mode. */
8619 standard_sse_constant_p (rtx x)
8621 enum machine_mode mode = GET_MODE (x);
8623 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8624 return 1;
8625 if (vector_all_ones_operand (x, mode))
8626 switch (mode)
8628 case V16QImode:
8629 case V8HImode:
8630 case V4SImode:
8631 case V2DImode:
8632 if (TARGET_SSE2)
8633 return 2;
8634 case V32QImode:
8635 case V16HImode:
8636 case V8SImode:
8637 case V4DImode:
8638 if (TARGET_AVX2)
8639 return 2;
8640 default:
8641 break;
8644 return 0;
8647 /* Return the opcode of the special instruction to be used to load
8648 the constant X. */
8650 const char *
8651 standard_sse_constant_opcode (rtx insn, rtx x)
8653 switch (standard_sse_constant_p (x))
8655 case 1:
8656 switch (get_attr_mode (insn))
8658 case MODE_TI:
8659 return "%vpxor\t%0, %d0";
8660 case MODE_V2DF:
8661 return "%vxorpd\t%0, %d0";
8662 case MODE_V4SF:
8663 return "%vxorps\t%0, %d0";
8665 case MODE_OI:
8666 return "vpxor\t%x0, %x0, %x0";
8667 case MODE_V4DF:
8668 return "vxorpd\t%x0, %x0, %x0";
8669 case MODE_V8SF:
8670 return "vxorps\t%x0, %x0, %x0";
8672 default:
8673 break;
8676 case 2:
8677 if (get_attr_mode (insn) == MODE_XI
8678 || get_attr_mode (insn) == MODE_V8DF
8679 || get_attr_mode (insn) == MODE_V16SF)
8680 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
8681 if (TARGET_AVX)
8682 return "vpcmpeqd\t%0, %0, %0";
8683 else
8684 return "pcmpeqd\t%0, %0";
8686 default:
8687 break;
8689 gcc_unreachable ();
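/* Illustration of the two routines above: an all-zero V4SF constant is
   class 1 and is emitted as "xorps %xmm0, %xmm0" (or the %v-prefixed
   AVX form), while an all-ones V4SI constant is class 2 and becomes
   "pcmpeqd %xmm0, %xmm0" ("vpcmpeqd" under AVX); the register numbers
   here are only an example.  */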
8692 /* Returns true if OP contains a symbol reference. */
8694 bool
8695 symbolic_reference_mentioned_p (rtx op)
8697 const char *fmt;
8698 int i;
8700 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8701 return true;
8703 fmt = GET_RTX_FORMAT (GET_CODE (op));
8704 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8706 if (fmt[i] == 'E')
8708 int j;
8710 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8711 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8712 return true;
8715 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8716 return true;
8719 return false;
8722 /* Return true if it is appropriate to emit `ret' instructions in the
8723 body of a function. Do this only if the epilogue is simple, needing a
8724 couple of insns. Prior to reloading, we can't tell how many registers
8725 must be saved, so return false then. Return false if there is no frame
8726 marker to de-allocate. */
8728 bool
8729 ix86_can_use_return_insn_p (void)
8731 struct ix86_frame frame;
8733 if (! reload_completed || frame_pointer_needed)
8734 return 0;
8736 /* Don't allow more than 32k pop, since that's all we can do
8737 with one instruction. */
8738 if (crtl->args.pops_args && crtl->args.size >= 32768)
8739 return 0;
8741 ix86_compute_frame_layout (&frame);
8742 return (frame.stack_pointer_offset == UNITS_PER_WORD
8743 && (frame.nregs + frame.nsseregs) == 0);
8746 /* Value should be nonzero if functions must have frame pointers.
8747 Zero means the frame pointer need not be set up (and parms may
8748 be accessed via the stack pointer) in functions that seem suitable. */
8750 static bool
8751 ix86_frame_pointer_required (void)
8753 /* If we accessed previous frames, then the generated code expects
8754 to be able to access the saved ebp value in our frame. */
8755 if (cfun->machine->accesses_prev_frame)
8756 return true;
8758 /* Several x86 OSes need a frame pointer for other reasons,
8759 usually pertaining to setjmp. */
8760 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8761 return true;
8763 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
8764 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8765 return true;
8767 /* Win64 SEH: very large frames need a frame pointer, as the maximum
8768 stack allocation is 4GB. */
8769 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
8770 return true;
8772 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8773 turns off the frame pointer by default. Turn it back on now if
8774 we've not got a leaf function. */
8775 if (TARGET_OMIT_LEAF_FRAME_POINTER
8776 && (!crtl->is_leaf
8777 || ix86_current_function_calls_tls_descriptor))
8778 return true;
8780 if (crtl->profile && !flag_fentry)
8781 return true;
8783 return false;
8786 /* Record that the current function accesses previous call frames. */
8788 void
8789 ix86_setup_frame_addresses (void)
8791 cfun->machine->accesses_prev_frame = 1;
8794 #ifndef USE_HIDDEN_LINKONCE
8795 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8796 # define USE_HIDDEN_LINKONCE 1
8797 # else
8798 # define USE_HIDDEN_LINKONCE 0
8799 # endif
8800 #endif
8802 static int pic_labels_used;
8804 /* Fills in the label name that should be used for a pc thunk for
8805 the given register. */
8807 static void
8808 get_pc_thunk_name (char name[32], unsigned int regno)
8810 gcc_assert (!TARGET_64BIT);
8812 if (USE_HIDDEN_LINKONCE)
8813 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8814 else
8815 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8819 /* This function generates code for -fpic that loads %ebx with
8820 the return address of the caller and then returns. */
8822 static void
8823 ix86_code_end (void)
8825 rtx xops[2];
8826 int regno;
8828 for (regno = AX_REG; regno <= SP_REG; regno++)
8830 char name[32];
8831 tree decl;
8833 if (!(pic_labels_used & (1 << regno)))
8834 continue;
8836 get_pc_thunk_name (name, regno);
8838 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8839 get_identifier (name),
8840 build_function_type_list (void_type_node, NULL_TREE));
8841 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8842 NULL_TREE, void_type_node);
8843 TREE_PUBLIC (decl) = 1;
8844 TREE_STATIC (decl) = 1;
8845 DECL_IGNORED_P (decl) = 1;
8847 #if TARGET_MACHO
8848 if (TARGET_MACHO)
8850 switch_to_section (darwin_sections[text_coal_section]);
8851 fputs ("\t.weak_definition\t", asm_out_file);
8852 assemble_name (asm_out_file, name);
8853 fputs ("\n\t.private_extern\t", asm_out_file);
8854 assemble_name (asm_out_file, name);
8855 putc ('\n', asm_out_file);
8856 ASM_OUTPUT_LABEL (asm_out_file, name);
8857 DECL_WEAK (decl) = 1;
8859 else
8860 #endif
8861 if (USE_HIDDEN_LINKONCE)
8863 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8865 targetm.asm_out.unique_section (decl, 0);
8866 switch_to_section (get_named_section (decl, NULL, 0));
8868 targetm.asm_out.globalize_label (asm_out_file, name);
8869 fputs ("\t.hidden\t", asm_out_file);
8870 assemble_name (asm_out_file, name);
8871 putc ('\n', asm_out_file);
8872 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8874 else
8876 switch_to_section (text_section);
8877 ASM_OUTPUT_LABEL (asm_out_file, name);
8880 DECL_INITIAL (decl) = make_node (BLOCK);
8881 current_function_decl = decl;
8882 init_function_start (decl);
8883 first_function_block_is_cold = false;
8884 /* Make sure unwind info is emitted for the thunk if needed. */
8885 final_start_function (emit_barrier (), asm_out_file, 1);
8887 /* Pad stack IP move with 4 instructions (two NOPs count
8888 as one instruction). */
8889 if (TARGET_PAD_SHORT_FUNCTION)
8891 int i = 8;
8893 while (i--)
8894 fputs ("\tnop\n", asm_out_file);
8897 xops[0] = gen_rtx_REG (Pmode, regno);
8898 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8899 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8900 output_asm_insn ("%!ret", NULL);
8901 final_end_function ();
8902 init_insn_lengths ();
8903 free_after_compilation (cfun);
8904 set_cfun (NULL);
8905 current_function_decl = NULL;
8908 if (flag_split_stack)
8909 file_end_indicate_split_stack ();
8912 /* Emit code for the SET_GOT patterns. */
8914 const char *
8915 output_set_got (rtx dest, rtx label)
8917 rtx xops[3];
8919 xops[0] = dest;
8921 if (TARGET_VXWORKS_RTP && flag_pic)
8923 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8924 xops[2] = gen_rtx_MEM (Pmode,
8925 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8926 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8928 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8929 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8930 an unadorned address. */
8931 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8932 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8933 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8934 return "";
8937 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8939 if (!flag_pic)
8941 if (TARGET_MACHO)
8942 /* We don't need a pic base, we're not producing pic. */
8943 gcc_unreachable ();
8945 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8946 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8947 targetm.asm_out.internal_label (asm_out_file, "L",
8948 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8950 else
8952 char name[32];
8953 get_pc_thunk_name (name, REGNO (dest));
8954 pic_labels_used |= 1 << REGNO (dest);
8956 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8957 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8958 output_asm_insn ("%!call\t%X2", xops);
8960 #if TARGET_MACHO
8961 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
8962 This is what will be referenced by the Mach-O PIC subsystem. */
8963 if (machopic_should_output_picbase_label () || !label)
8964 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8966 /* When we are restoring the pic base at the site of a nonlocal label,
8967 and we decided to emit the pic base above, we will still output a
8968 local label used for calculating the correction offset (even though
8969 the offset will be 0 in that case). */
8970 if (label)
8971 targetm.asm_out.internal_label (asm_out_file, "L",
8972 CODE_LABEL_NUMBER (label));
8973 #endif
8976 if (!TARGET_MACHO)
8977 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8979 return "";
8982 /* Generate a "push" pattern for input ARG. */
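/* Illustrative shape of the RTL built here (modes elided):
     (set (mem (pre_dec (reg sp))) arg)
   gen_pop below builds the mirror image using POST_INC.  */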
8984 static rtx
8985 gen_push (rtx arg)
8987 struct machine_function *m = cfun->machine;
8989 if (m->fs.cfa_reg == stack_pointer_rtx)
8990 m->fs.cfa_offset += UNITS_PER_WORD;
8991 m->fs.sp_offset += UNITS_PER_WORD;
8993 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8994 arg = gen_rtx_REG (word_mode, REGNO (arg));
8996 return gen_rtx_SET (VOIDmode,
8997 gen_rtx_MEM (word_mode,
8998 gen_rtx_PRE_DEC (Pmode,
8999 stack_pointer_rtx)),
9000 arg);
9003 /* Generate a "pop" pattern for input ARG. */
9005 static rtx
9006 gen_pop (rtx arg)
9008 if (REG_P (arg) && GET_MODE (arg) != word_mode)
9009 arg = gen_rtx_REG (word_mode, REGNO (arg));
9011 return gen_rtx_SET (VOIDmode,
9012 arg,
9013 gen_rtx_MEM (word_mode,
9014 gen_rtx_POST_INC (Pmode,
9015 stack_pointer_rtx)));
9018 /* Return >= 0 if there is an unused call-clobbered register available
9019 for the entire function. */
9021 static unsigned int
9022 ix86_select_alt_pic_regnum (void)
9024 if (crtl->is_leaf
9025 && !crtl->profile
9026 && !ix86_current_function_calls_tls_descriptor)
9028 int i, drap;
9029 /* Can't use the same register for both PIC and DRAP. */
9030 if (crtl->drap_reg)
9031 drap = REGNO (crtl->drap_reg);
9032 else
9033 drap = -1;
9034 for (i = 2; i >= 0; --i)
9035 if (i != drap && !df_regs_ever_live_p (i))
9036 return i;
9039 return INVALID_REGNUM;
9042 /* Return TRUE if we need to save REGNO. */
9044 static bool
9045 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
9047 if (pic_offset_table_rtx
9048 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
9049 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
9050 || crtl->profile
9051 || crtl->calls_eh_return
9052 || crtl->uses_const_pool
9053 || cfun->has_nonlocal_label))
9054 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
9056 if (crtl->calls_eh_return && maybe_eh_return)
9058 unsigned i;
9059 for (i = 0; ; i++)
9061 unsigned test = EH_RETURN_DATA_REGNO (i);
9062 if (test == INVALID_REGNUM)
9063 break;
9064 if (test == regno)
9065 return true;
9069 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
9070 return true;
9072 return (df_regs_ever_live_p (regno)
9073 && !call_used_regs[regno]
9074 && !fixed_regs[regno]
9075 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
9078 /* Return number of saved general purpose registers. */
9080 static int
9081 ix86_nsaved_regs (void)
9083 int nregs = 0;
9084 int regno;
9086 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9087 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9088 nregs ++;
9089 return nregs;
9092 /* Return number of saved SSE registers. */
9094 static int
9095 ix86_nsaved_sseregs (void)
9097 int nregs = 0;
9098 int regno;
9100 if (!TARGET_64BIT_MS_ABI)
9101 return 0;
9102 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9103 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9104 nregs ++;
9105 return nregs;
9108 /* Given FROM and TO register numbers, say whether this elimination is
9109 allowed. If stack alignment is needed, we can only replace argument
9110 pointer with hard frame pointer, or replace frame pointer with stack
9111 pointer. Otherwise, frame pointer elimination is automatically
9112 handled and all other eliminations are valid. */
9114 static bool
9115 ix86_can_eliminate (const int from, const int to)
9117 if (stack_realign_fp)
9118 return ((from == ARG_POINTER_REGNUM
9119 && to == HARD_FRAME_POINTER_REGNUM)
9120 || (from == FRAME_POINTER_REGNUM
9121 && to == STACK_POINTER_REGNUM));
9122 else
9123 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
9126 /* Return the offset between two registers, one to be eliminated, and the other
9127 its replacement, at the start of a routine. */
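/* With purely illustrative values hard_frame_pointer_offset == 8,
   frame_pointer_offset == 16 and stack_pointer_offset == 48, eliminating the
   soft frame pointer into the hard frame pointer yields 8 - 16 = -8, and
   eliminating it into the stack pointer yields 48 - 16 = 32.  */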
9129 HOST_WIDE_INT
9130 ix86_initial_elimination_offset (int from, int to)
9132 struct ix86_frame frame;
9133 ix86_compute_frame_layout (&frame);
9135 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
9136 return frame.hard_frame_pointer_offset;
9137 else if (from == FRAME_POINTER_REGNUM
9138 && to == HARD_FRAME_POINTER_REGNUM)
9139 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
9140 else
9142 gcc_assert (to == STACK_POINTER_REGNUM);
9144 if (from == ARG_POINTER_REGNUM)
9145 return frame.stack_pointer_offset;
9147 gcc_assert (from == FRAME_POINTER_REGNUM);
9148 return frame.stack_pointer_offset - frame.frame_pointer_offset;
9152 /* In a dynamically-aligned function, we can't know the offset from
9153 stack pointer to frame pointer, so we must ensure that setjmp
9154 eliminates fp against the hard fp (%ebp) rather than trying to
9155 index from %esp up to the top of the frame across a gap that is
9156 of unknown (at compile-time) size. */
9157 static rtx
9158 ix86_builtin_setjmp_frame_value (void)
9160 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
9163 /* When using -fsplit-stack, the allocation routines set a field in
9164 the TCB to the bottom of the stack plus this much space, measured
9165 in bytes. */
9167 #define SPLIT_STACK_AVAILABLE 256
9169 /* Fill in structure ix86_frame, describing the frame of the currently compiled function. */
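/* Rough picture of the layout computed below, from the CFA toward lower
   addresses: return address, optional pushed static chain, saved frame
   pointer (hard_frame_pointer_offset), GP register save area
   (reg_save_offset), SSE register save area (sse_reg_save_offset), va_arg
   register save area, local variables (frame_pointer_offset), outgoing
   argument area, and finally stack_pointer_offset at the end of the frame.
   The *_offset fields are distances from the CFA.  */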
9171 static void
9172 ix86_compute_frame_layout (struct ix86_frame *frame)
9174 unsigned HOST_WIDE_INT stack_alignment_needed;
9175 HOST_WIDE_INT offset;
9176 unsigned HOST_WIDE_INT preferred_alignment;
9177 HOST_WIDE_INT size = get_frame_size ();
9178 HOST_WIDE_INT to_allocate;
9180 frame->nregs = ix86_nsaved_regs ();
9181 frame->nsseregs = ix86_nsaved_sseregs ();
9183 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
9184 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
9186 /* The 64-bit MS ABI seems to require the stack alignment to always be 16, except
9187 for function prologues and leaf functions. */
9188 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
9189 && (!crtl->is_leaf || cfun->calls_alloca != 0
9190 || ix86_current_function_calls_tls_descriptor))
9192 preferred_alignment = 16;
9193 stack_alignment_needed = 16;
9194 crtl->preferred_stack_boundary = 128;
9195 crtl->stack_alignment_needed = 128;
9198 gcc_assert (!size || stack_alignment_needed);
9199 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
9200 gcc_assert (preferred_alignment <= stack_alignment_needed);
9202 /* For SEH we have to limit the amount of code movement into the prologue.
9203 At present we do this via a BLOCKAGE, at which point there's very little
9204 scheduling that can be done, which means that there's very little point
9205 in doing anything except PUSHs. */
9206 if (TARGET_SEH)
9207 cfun->machine->use_fast_prologue_epilogue = false;
9209 /* During reload iterations the number of saved registers can change.
9210 Recompute the value as needed. Do not recompute when the number of registers
9211 did not change, as reload makes multiple calls to this function and does not
9212 expect the decision to change within a single iteration. */
9213 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR)
9214 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
9216 int count = frame->nregs;
9217 struct cgraph_node *node = cgraph_get_node (current_function_decl);
9219 cfun->machine->use_fast_prologue_epilogue_nregs = count;
9221 /* The fast prologue uses move instead of push to save registers. This
9222 is significantly longer, but also executes faster as modern hardware
9223 can execute the moves in parallel, but can't do that for push/pop.
9225 Be careful about choosing which prologue to emit: when the function takes
9226 many instructions to execute, we may use the slow version, as well as when
9227 the function is known to be outside a hot spot (this is known only with
9228 feedback). Weight the size of the function by the number of registers to
9229 save, as it is cheap to use one or two push instructions but very slow to
9230 use many of them. */
9231 if (count)
9232 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
9233 if (node->frequency < NODE_FREQUENCY_NORMAL
9234 || (flag_branch_probabilities
9235 && node->frequency < NODE_FREQUENCY_HOT))
9236 cfun->machine->use_fast_prologue_epilogue = false;
9237 else
9238 cfun->machine->use_fast_prologue_epilogue
9239 = !expensive_function_p (count);
9242 frame->save_regs_using_mov
9243 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
9244 /* If static stack checking is enabled and done with probes,
9245 the registers need to be saved before allocating the frame. */
9246 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
9248 /* Skip return address. */
9249 offset = UNITS_PER_WORD;
9251 /* Skip pushed static chain. */
9252 if (ix86_static_chain_on_stack)
9253 offset += UNITS_PER_WORD;
9255 /* Skip saved base pointer. */
9256 if (frame_pointer_needed)
9257 offset += UNITS_PER_WORD;
9258 frame->hfp_save_offset = offset;
9260 /* The traditional frame pointer location is at the top of the frame. */
9261 frame->hard_frame_pointer_offset = offset;
9263 /* Register save area */
9264 offset += frame->nregs * UNITS_PER_WORD;
9265 frame->reg_save_offset = offset;
9267 /* On SEH target, registers are pushed just before the frame pointer
9268 location. */
9269 if (TARGET_SEH)
9270 frame->hard_frame_pointer_offset = offset;
9272 /* Align and set SSE register save area. */
9273 if (frame->nsseregs)
9275 /* The only ABI that has saved SSE registers (Win64) also has a
9276 16-byte aligned default stack, and thus we don't need to be
9277 within the re-aligned local stack frame to save them. */
9278 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
9279 offset = (offset + 16 - 1) & -16;
9280 offset += frame->nsseregs * 16;
9282 frame->sse_reg_save_offset = offset;
9284 /* The re-aligned stack starts here. Values before this point are not
9285 directly comparable with values below this point. In order to make
9286 sure that no value happens to be the same before and after, force
9287 the alignment computation below to add a non-zero value. */
9288 if (stack_realign_fp)
9289 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
9291 /* Va-arg area */
9292 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
9293 offset += frame->va_arg_size;
9295 /* Align start of frame for local function. */
9296 if (stack_realign_fp
9297 || offset != frame->sse_reg_save_offset
9298 || size != 0
9299 || !crtl->is_leaf
9300 || cfun->calls_alloca
9301 || ix86_current_function_calls_tls_descriptor)
9302 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
9304 /* Frame pointer points here. */
9305 frame->frame_pointer_offset = offset;
9307 offset += size;
9309 /* Add outgoing arguments area. Can be skipped if we eliminated
9310 all the function calls as dead code.
9311 Skipping is, however, impossible when the function calls alloca: the alloca
9312 expander assumes that the last crtl->outgoing_args_size bytes
9313 of the stack frame are unused. */
9314 if (ACCUMULATE_OUTGOING_ARGS
9315 && (!crtl->is_leaf || cfun->calls_alloca
9316 || ix86_current_function_calls_tls_descriptor))
9318 offset += crtl->outgoing_args_size;
9319 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9321 else
9322 frame->outgoing_arguments_size = 0;
9324 /* Align stack boundary. Only needed if we're calling another function
9325 or using alloca. */
9326 if (!crtl->is_leaf || cfun->calls_alloca
9327 || ix86_current_function_calls_tls_descriptor)
9328 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9330 /* We've reached end of stack frame. */
9331 frame->stack_pointer_offset = offset;
9333 /* Size prologue needs to allocate. */
9334 to_allocate = offset - frame->sse_reg_save_offset;
9336 if ((!to_allocate && frame->nregs <= 1)
9337 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9338 frame->save_regs_using_mov = false;
9340 if (ix86_using_red_zone ()
9341 && crtl->sp_is_unchanging
9342 && crtl->is_leaf
9343 && !ix86_current_function_calls_tls_descriptor)
9345 frame->red_zone_size = to_allocate;
9346 if (frame->save_regs_using_mov)
9347 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9348 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9349 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9351 else
9352 frame->red_zone_size = 0;
9353 frame->stack_pointer_offset -= frame->red_zone_size;
9355 /* The SEH frame pointer location is near the bottom of the frame.
9356 This is enforced by the fact that the difference between the
9357 stack pointer and the frame pointer is limited to 240 bytes in
9358 the unwind data structure. */
9359 if (TARGET_SEH)
9361 HOST_WIDE_INT diff;
9363 /* If we can leave the frame pointer where it is, do so. Also, returns
9364 the establisher frame for __builtin_frame_address (0). */
9365 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9366 if (diff <= SEH_MAX_FRAME_SIZE
9367 && (diff > 240 || (diff & 15) != 0)
9368 && !crtl->accesses_prior_frames)
9370 /* Ideally we'd determine what portion of the local stack frame
9371 (within the constraint of the lowest 240) is most heavily used.
9372 But without that complication, simply bias the frame pointer
9373 by 128 bytes so as to maximize the amount of the local stack
9374 frame that is addressable with 8-bit offsets. */
9375 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9380 /* This is semi-inlined memory_address_length, but simplified
9381 since we know that we're always dealing with reg+offset, and
9382 to avoid having to create and discard all that rtl. */
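/* A few illustrative encoding lengths, matching the rules below:
     0(%eax)    -> 0 bytes (no displacement, no SIB)
     0(%ebp)    -> 1 byte  (EBP/R13 need an explicit disp8 of zero)
     0(%esp)    -> 1 byte  (ESP/R12 need a SIB byte)
     -8(%ebp)   -> 1 byte  (disp8)
     -8(%esp)   -> 2 bytes (disp8 + SIB)
     1024(%eax) -> 4 bytes (disp32).  */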
9384 static inline int
9385 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9387 int len = 4;
9389 if (offset == 0)
9391 /* EBP and R13 cannot be encoded without an offset. */
9392 len = (regno == BP_REG || regno == R13_REG);
9394 else if (IN_RANGE (offset, -128, 127))
9395 len = 1;
9397 /* ESP and R12 must be encoded with a SIB byte. */
9398 if (regno == SP_REG || regno == R12_REG)
9399 len++;
9401 return len;
9404 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9405 The valid base registers are taken from CFUN->MACHINE->FS. */
9407 static rtx
9408 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9410 const struct machine_function *m = cfun->machine;
9411 rtx base_reg = NULL;
9412 HOST_WIDE_INT base_offset = 0;
9414 if (m->use_fast_prologue_epilogue)
9416 /* Choose the base register most likely to allow the most scheduling
9417 opportunities. Generally FP is valid throughout the function,
9418 while DRAP must be reloaded within the epilogue. But choose either
9419 over the SP due to increased encoding size. */
9421 if (m->fs.fp_valid)
9423 base_reg = hard_frame_pointer_rtx;
9424 base_offset = m->fs.fp_offset - cfa_offset;
9426 else if (m->fs.drap_valid)
9428 base_reg = crtl->drap_reg;
9429 base_offset = 0 - cfa_offset;
9431 else if (m->fs.sp_valid)
9433 base_reg = stack_pointer_rtx;
9434 base_offset = m->fs.sp_offset - cfa_offset;
9437 else
9439 HOST_WIDE_INT toffset;
9440 int len = 16, tlen;
9442 /* Choose the base register with the smallest address encoding.
9443 With a tie, choose FP > DRAP > SP. */
9444 if (m->fs.sp_valid)
9446 base_reg = stack_pointer_rtx;
9447 base_offset = m->fs.sp_offset - cfa_offset;
9448 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9450 if (m->fs.drap_valid)
9452 toffset = 0 - cfa_offset;
9453 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9454 if (tlen <= len)
9456 base_reg = crtl->drap_reg;
9457 base_offset = toffset;
9458 len = tlen;
9461 if (m->fs.fp_valid)
9463 toffset = m->fs.fp_offset - cfa_offset;
9464 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9465 if (tlen <= len)
9467 base_reg = hard_frame_pointer_rtx;
9468 base_offset = toffset;
9469 len = tlen;
9473 gcc_assert (base_reg != NULL);
9475 return plus_constant (Pmode, base_reg, base_offset);
9478 /* Emit code to save registers in the prologue. */
9480 static void
9481 ix86_emit_save_regs (void)
9483 unsigned int regno;
9484 rtx insn;
9486 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9487 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9489 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9490 RTX_FRAME_RELATED_P (insn) = 1;
9494 /* Emit a single register save at CFA - CFA_OFFSET. */
9496 static void
9497 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9498 HOST_WIDE_INT cfa_offset)
9500 struct machine_function *m = cfun->machine;
9501 rtx reg = gen_rtx_REG (mode, regno);
9502 rtx mem, addr, base, insn;
9504 addr = choose_baseaddr (cfa_offset);
9505 mem = gen_frame_mem (mode, addr);
9507 /* For SSE saves, we need to indicate the 128-bit alignment. */
9508 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9510 insn = emit_move_insn (mem, reg);
9511 RTX_FRAME_RELATED_P (insn) = 1;
9513 base = addr;
9514 if (GET_CODE (base) == PLUS)
9515 base = XEXP (base, 0);
9516 gcc_checking_assert (REG_P (base));
9518 /* When saving registers into a re-aligned local stack frame, avoid
9519 any tricky guessing by dwarf2out. */
9520 if (m->fs.realigned)
9522 gcc_checking_assert (stack_realign_drap);
9524 if (regno == REGNO (crtl->drap_reg))
9526 /* A bit of a hack. We force the DRAP register to be saved in
9527 the re-aligned stack frame, which provides us with a copy
9528 of the CFA that will last past the prologue. Install it. */
9529 gcc_checking_assert (cfun->machine->fs.fp_valid);
9530 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9531 cfun->machine->fs.fp_offset - cfa_offset);
9532 mem = gen_rtx_MEM (mode, addr);
9533 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9535 else
9537 /* The frame pointer is a stable reference within the
9538 aligned frame. Use it. */
9539 gcc_checking_assert (cfun->machine->fs.fp_valid);
9540 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9541 cfun->machine->fs.fp_offset - cfa_offset);
9542 mem = gen_rtx_MEM (mode, addr);
9543 add_reg_note (insn, REG_CFA_EXPRESSION,
9544 gen_rtx_SET (VOIDmode, mem, reg));
9548 /* The memory may not be relative to the current CFA register,
9549 which means that we may need to generate a new pattern for
9550 use by the unwind info. */
9551 else if (base != m->fs.cfa_reg)
9553 addr = plus_constant (Pmode, m->fs.cfa_reg,
9554 m->fs.cfa_offset - cfa_offset);
9555 mem = gen_rtx_MEM (mode, addr);
9556 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9560 /* Emit code to save registers using MOV insns.
9561 First register is stored at CFA - CFA_OFFSET. */
9562 static void
9563 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9565 unsigned int regno;
9567 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9568 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9570 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9571 cfa_offset -= UNITS_PER_WORD;
9575 /* Emit code to save SSE registers using MOV insns.
9576 First register is stored at CFA - CFA_OFFSET. */
9577 static void
9578 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9580 unsigned int regno;
9582 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9583 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9585 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9586 cfa_offset -= 16;
9590 static GTY(()) rtx queued_cfa_restores;
9592 /* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack
9593 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9594 Don't add the note if the previously saved value will be left untouched
9595 within the stack red zone until return, as unwinders can find the same value
9596 in the register and on the stack. */
9598 static void
9599 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9601 if (!crtl->shrink_wrapped
9602 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9603 return;
9605 if (insn)
9607 add_reg_note (insn, REG_CFA_RESTORE, reg);
9608 RTX_FRAME_RELATED_P (insn) = 1;
9610 else
9611 queued_cfa_restores
9612 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9615 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9617 static void
9618 ix86_add_queued_cfa_restore_notes (rtx insn)
9620 rtx last;
9621 if (!queued_cfa_restores)
9622 return;
9623 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9625 XEXP (last, 1) = REG_NOTES (insn);
9626 REG_NOTES (insn) = queued_cfa_restores;
9627 queued_cfa_restores = NULL_RTX;
9628 RTX_FRAME_RELATED_P (insn) = 1;
9631 /* Expand prologue or epilogue stack adjustment.
9632 The pattern exists to put a dependency on all ebp-based memory accesses.
9633 STYLE should be negative if instructions should be marked as frame related,
9634 zero if %r11 register is live and cannot be freely used and positive
9635 otherwise. */
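/* For example, the prologue below allocates its frame with a call like
     pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
                                GEN_INT (-allocate), -1,
                                m->fs.cfa_reg == stack_pointer_rtx);
   the negative style marks the adjustment as frame related.  */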
9637 static void
9638 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9639 int style, bool set_cfa)
9641 struct machine_function *m = cfun->machine;
9642 rtx insn;
9643 bool add_frame_related_expr = false;
9645 if (Pmode == SImode)
9646 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9647 else if (x86_64_immediate_operand (offset, DImode))
9648 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9649 else
9651 rtx tmp;
9652 /* r11 is used by indirect sibcall return as well, set before the
9653 epilogue and used after the epilogue. */
9654 if (style)
9655 tmp = gen_rtx_REG (DImode, R11_REG);
9656 else
9658 gcc_assert (src != hard_frame_pointer_rtx
9659 && dest != hard_frame_pointer_rtx);
9660 tmp = hard_frame_pointer_rtx;
9662 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9663 if (style < 0)
9664 add_frame_related_expr = true;
9666 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9669 insn = emit_insn (insn);
9670 if (style >= 0)
9671 ix86_add_queued_cfa_restore_notes (insn);
9673 if (set_cfa)
9675 rtx r;
9677 gcc_assert (m->fs.cfa_reg == src);
9678 m->fs.cfa_offset += INTVAL (offset);
9679 m->fs.cfa_reg = dest;
9681 r = gen_rtx_PLUS (Pmode, src, offset);
9682 r = gen_rtx_SET (VOIDmode, dest, r);
9683 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9684 RTX_FRAME_RELATED_P (insn) = 1;
9686 else if (style < 0)
9688 RTX_FRAME_RELATED_P (insn) = 1;
9689 if (add_frame_related_expr)
9691 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9692 r = gen_rtx_SET (VOIDmode, dest, r);
9693 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9697 if (dest == stack_pointer_rtx)
9699 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9700 bool valid = m->fs.sp_valid;
9702 if (src == hard_frame_pointer_rtx)
9704 valid = m->fs.fp_valid;
9705 ooffset = m->fs.fp_offset;
9707 else if (src == crtl->drap_reg)
9709 valid = m->fs.drap_valid;
9710 ooffset = 0;
9712 else
9714 /* Else there are two possibilities: SP itself, which we set
9715 up as the default above. Or EH_RETURN_STACKADJ_RTX, which we have
9716 taken care of by hand along the eh_return path. */
9717 gcc_checking_assert (src == stack_pointer_rtx
9718 || offset == const0_rtx);
9721 m->fs.sp_offset = ooffset - INTVAL (offset);
9722 m->fs.sp_valid = valid;
9726 /* Find an available register to be used as the dynamic realign argument
9727 pointer register. Such a register will be written in the prologue and
9728 used at the beginning of the body, so it must not be
9729 1. a parameter passing register.
9730 2. the GOT pointer.
9731 We reuse the static-chain register if it is available. Otherwise, we
9732 use DI for i386 and R13 for x86-64. We chose R13 since it has a
9733 longer encoding.
9735 Return: the regno of the chosen register. */
9737 static unsigned int
9738 find_drap_reg (void)
9740 tree decl = cfun->decl;
9742 if (TARGET_64BIT)
9744 /* Use R13 for a nested function or a function that needs a static chain.
9745 Since a function with a tail call may use any caller-saved
9746 register in the epilogue, the DRAP must not use a caller-saved
9747 register in that case. */
9748 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9749 return R13_REG;
9751 return R10_REG;
9753 else
9755 /* Use DI for a nested function or a function that needs a static chain.
9756 Since a function with a tail call may use any caller-saved
9757 register in the epilogue, the DRAP must not use a caller-saved
9758 register in that case. */
9759 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9760 return DI_REG;
9762 /* Reuse static chain register if it isn't used for parameter
9763 passing. */
9764 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9766 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9767 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9768 return CX_REG;
9770 return DI_REG;
9774 /* Return minimum incoming stack alignment. */
9776 static unsigned int
9777 ix86_minimum_incoming_stack_boundary (bool sibcall)
9779 unsigned int incoming_stack_boundary;
9781 /* Prefer the one specified at command line. */
9782 if (ix86_user_incoming_stack_boundary)
9783 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9784 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9785 when -mstackrealign is used, this is not a sibcall check, and the
9786 estimated stack alignment is 128 bits. */
9787 else if (!sibcall
9788 && !TARGET_64BIT
9789 && ix86_force_align_arg_pointer
9790 && crtl->stack_alignment_estimated == 128)
9791 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9792 else
9793 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9795 /* Incoming stack alignment can be changed on individual functions
9796 via force_align_arg_pointer attribute. We use the smallest
9797 incoming stack boundary. */
9798 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9799 && lookup_attribute (ix86_force_align_arg_pointer_string,
9800 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9801 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9803 /* The incoming stack frame has to be aligned at least at
9804 parm_stack_boundary. */
9805 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9806 incoming_stack_boundary = crtl->parm_stack_boundary;
9808 /* Stack at entrance of main is aligned by runtime. We use the
9809 smallest incoming stack boundary. */
9810 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9811 && DECL_NAME (current_function_decl)
9812 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9813 && DECL_FILE_SCOPE_P (current_function_decl))
9814 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9816 return incoming_stack_boundary;
9819 /* Update incoming stack boundary and estimated stack alignment. */
9821 static void
9822 ix86_update_stack_boundary (void)
9824 ix86_incoming_stack_boundary
9825 = ix86_minimum_incoming_stack_boundary (false);
9827 /* x86_64 vararg needs 16byte stack alignment for register save
9828 area. */
9829 if (TARGET_64BIT
9830 && cfun->stdarg
9831 && crtl->stack_alignment_estimated < 128)
9832 crtl->stack_alignment_estimated = 128;
9835 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9836 needed or an rtx for DRAP otherwise. */
9838 static rtx
9839 ix86_get_drap_rtx (void)
9841 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9842 crtl->need_drap = true;
9844 if (stack_realign_drap)
9846 /* Assign DRAP to vDRAP and return vDRAP. */
9847 unsigned int regno = find_drap_reg ();
9848 rtx drap_vreg;
9849 rtx arg_ptr;
9850 rtx seq, insn;
9852 arg_ptr = gen_rtx_REG (Pmode, regno);
9853 crtl->drap_reg = arg_ptr;
9855 start_sequence ();
9856 drap_vreg = copy_to_reg (arg_ptr);
9857 seq = get_insns ();
9858 end_sequence ();
9860 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9861 if (!optimize)
9863 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9864 RTX_FRAME_RELATED_P (insn) = 1;
9866 return drap_vreg;
9868 else
9869 return NULL;
9872 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9874 static rtx
9875 ix86_internal_arg_pointer (void)
9877 return virtual_incoming_args_rtx;
9880 struct scratch_reg {
9881 rtx reg;
9882 bool saved;
9885 /* Return a short-lived scratch register for use on function entry.
9886 In 32-bit mode, it is valid only after the registers are saved
9887 in the prologue. This register must be released by means of
9888 release_scratch_register_on_entry once it is dead. */
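/* For example, in 64-bit mode the scratch is always %r11; in 32-bit mode a
   plain cdecl function with no DRAP register gets %eax without needing a
   save/restore.  */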
9890 static void
9891 get_scratch_register_on_entry (struct scratch_reg *sr)
9893 int regno;
9895 sr->saved = false;
9897 if (TARGET_64BIT)
9899 /* We always use R11 in 64-bit mode. */
9900 regno = R11_REG;
9902 else
9904 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9905 bool fastcall_p
9906 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9907 bool thiscall_p
9908 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9909 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9910 int regparm = ix86_function_regparm (fntype, decl);
9911 int drap_regno
9912 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9914 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9915 for the static chain register. */
9916 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9917 && drap_regno != AX_REG)
9918 regno = AX_REG;
9919 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
9920 for the static chain register. */
9921 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
9922 regno = AX_REG;
9923 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
9924 regno = DX_REG;
9925 /* ecx is the static chain register. */
9926 else if (regparm < 3 && !fastcall_p && !thiscall_p
9927 && !static_chain_p
9928 && drap_regno != CX_REG)
9929 regno = CX_REG;
9930 else if (ix86_save_reg (BX_REG, true))
9931 regno = BX_REG;
9932 /* esi is the static chain register. */
9933 else if (!(regparm == 3 && static_chain_p)
9934 && ix86_save_reg (SI_REG, true))
9935 regno = SI_REG;
9936 else if (ix86_save_reg (DI_REG, true))
9937 regno = DI_REG;
9938 else
9940 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9941 sr->saved = true;
9945 sr->reg = gen_rtx_REG (Pmode, regno);
9946 if (sr->saved)
9948 rtx insn = emit_insn (gen_push (sr->reg));
9949 RTX_FRAME_RELATED_P (insn) = 1;
9953 /* Release a scratch register obtained from the preceding function. */
9955 static void
9956 release_scratch_register_on_entry (struct scratch_reg *sr)
9958 if (sr->saved)
9960 struct machine_function *m = cfun->machine;
9961 rtx x, insn = emit_insn (gen_pop (sr->reg));
9963 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9964 RTX_FRAME_RELATED_P (insn) = 1;
9965 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9966 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9967 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9968 m->fs.sp_offset -= UNITS_PER_WORD;
9972 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
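/* With the default STACK_CHECK_PROBE_INTERVAL_EXP of 12 this is 4096 bytes,
   i.e. one probe per page.  */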
9974 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9976 static void
9977 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9979 /* We skip the probe for the first interval + a small dope of 4 words and
9980 probe that many bytes past the specified size to maintain a protection
9981 area at the bottom of the stack. */
9982 const int dope = 4 * UNITS_PER_WORD;
9983 rtx size_rtx = GEN_INT (size), last;
9985 /* See if we have a constant small number of probes to generate. If so,
9986 that's the easy case. The run-time loop is made up of 11 insns in the
9987 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9988 for n # of intervals. */
9989 if (size <= 5 * PROBE_INTERVAL)
9991 HOST_WIDE_INT i, adjust;
9992 bool first_probe = true;
9994 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9995 values of N from 1 until it exceeds SIZE. If only one probe is
9996 needed, this will not generate any code. Then adjust and probe
9997 to PROBE_INTERVAL + SIZE. */
9998 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
10000 if (first_probe)
10002 adjust = 2 * PROBE_INTERVAL + dope;
10003 first_probe = false;
10005 else
10006 adjust = PROBE_INTERVAL;
10008 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10009 plus_constant (Pmode, stack_pointer_rtx,
10010 -adjust)));
10011 emit_stack_probe (stack_pointer_rtx);
10014 if (first_probe)
10015 adjust = size + PROBE_INTERVAL + dope;
10016 else
10017 adjust = size + PROBE_INTERVAL - i;
10019 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10020 plus_constant (Pmode, stack_pointer_rtx,
10021 -adjust)));
10022 emit_stack_probe (stack_pointer_rtx);
10024 /* Adjust back to account for the additional first interval. */
10025 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10026 plus_constant (Pmode, stack_pointer_rtx,
10027 PROBE_INTERVAL + dope)));
10030 /* Otherwise, do the same as above, but in a loop. Note that we must be
10031 extra careful with variables wrapping around because we might be at
10032 the very top (or the very bottom) of the address space and we have
10033 to be able to handle this case properly; in particular, we use an
10034 equality test for the loop condition. */
10035 else
10037 HOST_WIDE_INT rounded_size;
10038 struct scratch_reg sr;
10040 get_scratch_register_on_entry (&sr);
10043 /* Step 1: round SIZE to the previous multiple of the interval. */
10045 rounded_size = size & -PROBE_INTERVAL;
10048 /* Step 2: compute initial and final value of the loop counter. */
10050 /* SP = SP_0 + PROBE_INTERVAL. */
10051 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10052 plus_constant (Pmode, stack_pointer_rtx,
10053 - (PROBE_INTERVAL + dope))));
10055 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
10056 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
10057 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
10058 gen_rtx_PLUS (Pmode, sr.reg,
10059 stack_pointer_rtx)));
10062 /* Step 3: the loop
10064 while (SP != LAST_ADDR)
10066 SP = SP + PROBE_INTERVAL
10067 probe at SP
10070 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
10071 values of N from 1 until it is equal to ROUNDED_SIZE. */
10073 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
10076 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
10077 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
10079 if (size != rounded_size)
10081 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10082 plus_constant (Pmode, stack_pointer_rtx,
10083 rounded_size - size)));
10084 emit_stack_probe (stack_pointer_rtx);
10087 /* Adjust back to account for the additional first interval. */
10088 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10089 plus_constant (Pmode, stack_pointer_rtx,
10090 PROBE_INTERVAL + dope)));
10092 release_scratch_register_on_entry (&sr);
10095 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
10097 /* Even if the stack pointer isn't the CFA register, we need to correctly
10098 describe the adjustments made to it, in particular differentiate the
10099 frame-related ones from the frame-unrelated ones. */
10100 if (size > 0)
10102 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
10103 XVECEXP (expr, 0, 0)
10104 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10105 plus_constant (Pmode, stack_pointer_rtx, -size));
10106 XVECEXP (expr, 0, 1)
10107 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10108 plus_constant (Pmode, stack_pointer_rtx,
10109 PROBE_INTERVAL + dope + size));
10110 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
10111 RTX_FRAME_RELATED_P (last) = 1;
10113 cfun->machine->fs.sp_offset += size;
10116 /* Make sure nothing is scheduled before we are done. */
10117 emit_insn (gen_blockage ());
10120 /* Adjust the stack pointer up to REG while probing it. */
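/* The emitted loop looks roughly like this (AT&T syntax, 32-bit, with an
   illustrative scratch register holding LAST_ADDR and a 4096-byte interval):
       .LPSRL0: cmpl %ecx, %esp
                je   .LPSRE0
                subl $4096, %esp
                orl  $0, (%esp)
                jmp  .LPSRL0
       .LPSRE0:  */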
10122 const char *
10123 output_adjust_stack_and_probe (rtx reg)
10125 static int labelno = 0;
10126 char loop_lab[32], end_lab[32];
10127 rtx xops[2];
10129 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10130 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10132 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10134 /* Jump to END_LAB if SP == LAST_ADDR. */
10135 xops[0] = stack_pointer_rtx;
10136 xops[1] = reg;
10137 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10138 fputs ("\tje\t", asm_out_file);
10139 assemble_name_raw (asm_out_file, end_lab);
10140 fputc ('\n', asm_out_file);
10142 /* SP = SP + PROBE_INTERVAL. */
10143 xops[1] = GEN_INT (PROBE_INTERVAL);
10144 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10146 /* Probe at SP. */
10147 xops[1] = const0_rtx;
10148 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
10150 fprintf (asm_out_file, "\tjmp\t");
10151 assemble_name_raw (asm_out_file, loop_lab);
10152 fputc ('\n', asm_out_file);
10154 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10156 return "";
10159 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
10160 inclusive. These are offsets from the current stack pointer. */
10162 static void
10163 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
10165 /* See if we have a constant small number of probes to generate. If so,
10166 that's the easy case. The run-time loop is made up of 7 insns in the
10167 generic case while the compile-time loop is made up of n insns for n #
10168 of intervals. */
10169 if (size <= 7 * PROBE_INTERVAL)
10171 HOST_WIDE_INT i;
10173 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
10174 it exceeds SIZE. If only one probe is needed, this will not
10175 generate any code. Then probe at FIRST + SIZE. */
10176 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
10177 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10178 -(first + i)));
10180 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10181 -(first + size)));
10184 /* Otherwise, do the same as above, but in a loop. Note that we must be
10185 extra careful with variables wrapping around because we might be at
10186 the very top (or the very bottom) of the address space and we have
10187 to be able to handle this case properly; in particular, we use an
10188 equality test for the loop condition. */
10189 else
10191 HOST_WIDE_INT rounded_size, last;
10192 struct scratch_reg sr;
10194 get_scratch_register_on_entry (&sr);
10197 /* Step 1: round SIZE to the previous multiple of the interval. */
10199 rounded_size = size & -PROBE_INTERVAL;
10202 /* Step 2: compute initial and final value of the loop counter. */
10204 /* TEST_OFFSET = FIRST. */
10205 emit_move_insn (sr.reg, GEN_INT (-first));
10207 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
10208 last = first + rounded_size;
10211 /* Step 3: the loop
10213 while (TEST_ADDR != LAST_ADDR)
10215 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
10216 probe at TEST_ADDR
10219 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
10220 until it is equal to ROUNDED_SIZE. */
10222 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
10225 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
10226 that SIZE is equal to ROUNDED_SIZE. */
10228 if (size != rounded_size)
10229 emit_stack_probe (plus_constant (Pmode,
10230 gen_rtx_PLUS (Pmode,
10231 stack_pointer_rtx,
10232 sr.reg),
10233 rounded_size - size));
10235 release_scratch_register_on_entry (&sr);
10238 /* Make sure nothing is scheduled before we are done. */
10239 emit_insn (gen_blockage ());
10242 /* Probe a range of stack addresses from REG to END, inclusive. These are
10243 offsets from the current stack pointer. */
10245 const char *
10246 output_probe_stack_range (rtx reg, rtx end)
10248 static int labelno = 0;
10249 char loop_lab[32], end_lab[32];
10250 rtx xops[3];
10252 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10253 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10255 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10257 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
10258 xops[0] = reg;
10259 xops[1] = end;
10260 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10261 fputs ("\tje\t", asm_out_file);
10262 assemble_name_raw (asm_out_file, end_lab);
10263 fputc ('\n', asm_out_file);
10265 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
10266 xops[1] = GEN_INT (PROBE_INTERVAL);
10267 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10269 /* Probe at TEST_ADDR. */
10270 xops[0] = stack_pointer_rtx;
10271 xops[1] = reg;
10272 xops[2] = const0_rtx;
10273 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
10275 fprintf (asm_out_file, "\tjmp\t");
10276 assemble_name_raw (asm_out_file, loop_lab);
10277 fputc ('\n', asm_out_file);
10279 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10281 return "";
10284 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
10285 to be generated in correct form. */
10286 static void
10287 ix86_finalize_stack_realign_flags (void)
10289 /* Check if stack realignment is really needed after reload, and
10290 store the result in cfun. */
10291 unsigned int incoming_stack_boundary
10292 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
10293 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
10294 unsigned int stack_realign = (incoming_stack_boundary
10295 < (crtl->is_leaf
10296 ? crtl->max_used_stack_slot_alignment
10297 : crtl->stack_alignment_needed));
10299 if (crtl->stack_realign_finalized)
10301 /* After stack_realign_needed is finalized, we can no longer
10302 change it. */
10303 gcc_assert (crtl->stack_realign_needed == stack_realign);
10304 return;
10307 /* If the only reason for frame_pointer_needed is that we conservatively
10308 assumed stack realignment might be needed, but in the end nothing that
10309 needed the stack alignment had been spilled, clear frame_pointer_needed
10310 and say we don't need stack realignment. */
10311 if (stack_realign
10312 && !crtl->need_drap
10313 && frame_pointer_needed
10314 && crtl->is_leaf
10315 && flag_omit_frame_pointer
10316 && crtl->sp_is_unchanging
10317 && !ix86_current_function_calls_tls_descriptor
10318 && !crtl->accesses_prior_frames
10319 && !cfun->calls_alloca
10320 && !crtl->calls_eh_return
10321 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10322 && !ix86_frame_pointer_required ()
10323 && get_frame_size () == 0
10324 && ix86_nsaved_sseregs () == 0
10325 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10327 HARD_REG_SET set_up_by_prologue, prologue_used;
10328 basic_block bb;
10330 CLEAR_HARD_REG_SET (prologue_used);
10331 CLEAR_HARD_REG_SET (set_up_by_prologue);
10332 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10333 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10334 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10335 HARD_FRAME_POINTER_REGNUM);
10336 FOR_EACH_BB (bb)
10338 rtx insn;
10339 FOR_BB_INSNS (bb, insn)
10340 if (NONDEBUG_INSN_P (insn)
10341 && requires_stack_frame_p (insn, prologue_used,
10342 set_up_by_prologue))
10344 crtl->stack_realign_needed = stack_realign;
10345 crtl->stack_realign_finalized = true;
10346 return;
10350 frame_pointer_needed = false;
10351 stack_realign = false;
10352 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10353 crtl->stack_alignment_needed = incoming_stack_boundary;
10354 crtl->stack_alignment_estimated = incoming_stack_boundary;
10355 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10356 crtl->preferred_stack_boundary = incoming_stack_boundary;
10357 df_finish_pass (true);
10358 df_scan_alloc (NULL);
10359 df_scan_blocks ();
10360 df_compute_regs_ever_live (true);
10361 df_analyze ();
10364 crtl->stack_realign_needed = stack_realign;
10365 crtl->stack_realign_finalized = true;
10368 /* Expand the prologue into a bunch of separate insns. */
10370 void
10371 ix86_expand_prologue (void)
10373 struct machine_function *m = cfun->machine;
10374 rtx insn, t;
10375 bool pic_reg_used;
10376 struct ix86_frame frame;
10377 HOST_WIDE_INT allocate;
10378 bool int_registers_saved;
10379 bool sse_registers_saved;
10381 ix86_finalize_stack_realign_flags ();
10383 /* DRAP should not coexist with stack_realign_fp */
10384 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10386 memset (&m->fs, 0, sizeof (m->fs));
10388 /* Initialize CFA state for before the prologue. */
10389 m->fs.cfa_reg = stack_pointer_rtx;
10390 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10392 /* Track SP offset to the CFA. We continue tracking this after we've
10393 swapped the CFA register away from SP. In the case of re-alignment
10394 this is fudged; we're interested in offsets within the local frame. */
10395 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10396 m->fs.sp_valid = true;
10398 ix86_compute_frame_layout (&frame);
10400 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10402 /* We should have already generated an error for any use of
10403 ms_hook on a nested function. */
10404 gcc_checking_assert (!ix86_static_chain_on_stack);
10406 /* Check if profiling is active and we shall use the profiling-before-prologue
10407 variant. If so, issue a sorry. */
10408 if (crtl->profile && flag_fentry != 0)
10409 sorry ("ms_hook_prologue attribute isn%'t compatible "
10410 "with -mfentry for 32-bit");
10412 /* In ix86_asm_output_function_label we emitted:
10413 8b ff movl.s %edi,%edi
10414 55 push %ebp
10415 8b ec movl.s %esp,%ebp
10417 This matches the hookable function prologue in Win32 API
10418 functions in Microsoft Windows XP Service Pack 2 and newer.
10419 Wine uses this to enable Windows apps to hook the Win32 API
10420 functions provided by Wine.
10422 What that means is that we've already set up the frame pointer. */
10424 if (frame_pointer_needed
10425 && !(crtl->drap_reg && crtl->stack_realign_needed))
10427 rtx push, mov;
10429 /* We've decided to use the frame pointer already set up.
10430 Describe this to the unwinder by pretending that both
10431 push and mov insns happen right here.
10433 Putting the unwind info here at the end of the ms_hook
10434 is done so that we can make absolutely certain we get
10435 the required byte sequence at the start of the function,
10436 rather than relying on an assembler that can produce
10437 the exact encoding required.
10439 However it does mean (in the unpatched case) that we have
10440 a 1 insn window where the asynchronous unwind info is
10441 incorrect. However, if we placed the unwind info at
10442 its correct location we would have incorrect unwind info
10443 in the patched case. Which is probably all moot since
10444 I don't expect Wine generates dwarf2 unwind info for the
10445 system libraries that use this feature. */
10447 insn = emit_insn (gen_blockage ());
10449 push = gen_push (hard_frame_pointer_rtx);
10450 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10451 stack_pointer_rtx);
10452 RTX_FRAME_RELATED_P (push) = 1;
10453 RTX_FRAME_RELATED_P (mov) = 1;
10455 RTX_FRAME_RELATED_P (insn) = 1;
10456 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10457 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10459 /* Note that gen_push incremented m->fs.cfa_offset, even
10460 though we didn't emit the push insn here. */
10461 m->fs.cfa_reg = hard_frame_pointer_rtx;
10462 m->fs.fp_offset = m->fs.cfa_offset;
10463 m->fs.fp_valid = true;
10465 else
10467 /* The frame pointer is not needed so pop %ebp again.
10468 This leaves us with a pristine state. */
10469 emit_insn (gen_pop (hard_frame_pointer_rtx));
10473 /* The first insn of a function that accepts its static chain on the
10474 stack is to push the register that would be filled in by a direct
10475 call. This insn will be skipped by the trampoline. */
10476 else if (ix86_static_chain_on_stack)
10478 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10479 emit_insn (gen_blockage ());
10481 /* We don't want to interpret this push insn as a register save,
10482 only as a stack adjustment. The real copy of the register as
10483 a save will be done later, if needed. */
10484 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
10485 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10486 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10487 RTX_FRAME_RELATED_P (insn) = 1;
10490 /* Emit prologue code to adjust stack alignment and set up the DRAP, in case
10491 DRAP is needed and stack realignment is really needed after reload. */
10492 if (stack_realign_drap)
10494 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10496 /* Only need to push parameter pointer reg if it is caller saved. */
10497 if (!call_used_regs[REGNO (crtl->drap_reg)])
10499 /* Push arg pointer reg */
10500 insn = emit_insn (gen_push (crtl->drap_reg));
10501 RTX_FRAME_RELATED_P (insn) = 1;
10504 /* Grab the argument pointer. */
10505 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
10506 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10507 RTX_FRAME_RELATED_P (insn) = 1;
10508 m->fs.cfa_reg = crtl->drap_reg;
10509 m->fs.cfa_offset = 0;
10511 /* Align the stack. */
10512 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10513 stack_pointer_rtx,
10514 GEN_INT (-align_bytes)));
10515 RTX_FRAME_RELATED_P (insn) = 1;
10517 /* Replicate the return address on the stack so that return
10518 address can be reached via (argp - 1) slot. This is needed
10519 to implement macro RETURN_ADDR_RTX and intrinsic function
10520 expand_builtin_return_addr etc. */
10521 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
10522 t = gen_frame_mem (word_mode, t);
10523 insn = emit_insn (gen_push (t));
10524 RTX_FRAME_RELATED_P (insn) = 1;
10526 /* For the purposes of frame and register save area addressing,
10527 we've started over with a new frame. */
10528 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10529 m->fs.realigned = true;
10532 int_registers_saved = (frame.nregs == 0);
10533 sse_registers_saved = (frame.nsseregs == 0);
10535 if (frame_pointer_needed && !m->fs.fp_valid)
10537 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10538 slower on all targets. Also sdb doesn't like it. */
10539 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10540 RTX_FRAME_RELATED_P (insn) = 1;
10542 /* Push registers now, before setting the frame pointer
10543 on SEH target. */
10544 if (!int_registers_saved
10545 && TARGET_SEH
10546 && !frame.save_regs_using_mov)
10548 ix86_emit_save_regs ();
10549 int_registers_saved = true;
10550 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10553 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10555 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10556 RTX_FRAME_RELATED_P (insn) = 1;
10558 if (m->fs.cfa_reg == stack_pointer_rtx)
10559 m->fs.cfa_reg = hard_frame_pointer_rtx;
10560 m->fs.fp_offset = m->fs.sp_offset;
10561 m->fs.fp_valid = true;
10565 if (!int_registers_saved)
10567 /* If saving registers via PUSH, do so now. */
10568 if (!frame.save_regs_using_mov)
10570 ix86_emit_save_regs ();
10571 int_registers_saved = true;
10572 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10575 /* When using red zone we may start register saving before allocating
10576 the stack frame saving one cycle of the prologue. However, avoid
10577 doing this if we have to probe the stack; at least on x86_64 the
10578 stack probe can turn into a call that clobbers a red zone location. */
10579 else if (ix86_using_red_zone ()
10580 && (! TARGET_STACK_PROBE
10581 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10583 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10584 int_registers_saved = true;
10588 if (stack_realign_fp)
10590 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10591 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10593 /* The computation of the size of the re-aligned stack frame means
10594 that we must allocate the size of the register save area before
10595 performing the actual alignment. Otherwise we cannot guarantee
10596 that there's enough storage above the realignment point. */
10597 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10598 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10599 GEN_INT (m->fs.sp_offset
10600 - frame.sse_reg_save_offset),
10601 -1, false);
10603 /* Align the stack. */
10604 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10605 stack_pointer_rtx,
10606 GEN_INT (-align_bytes)));
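/* The AND above aligns the stack pointer downward: for example, with
   align_bytes == 32 it clears the low five bits of the stack pointer,
   as an "and $-32, %sp"-style instruction would. */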
10608 /* For the purposes of register save area addressing, the stack
10609 pointer is no longer valid. As for the value of sp_offset,
10610 see ix86_compute_frame_layout, which we need to match in order
10611 to pass verification of stack_pointer_offset at the end. */
10612 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10613 m->fs.sp_valid = false;
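/* Worked example of the rounding above: with sp_offset == 40 and
   align_bytes == 32, (40 + 32) & -32 == 64. Note that a full
   align_bytes is added even when sp_offset is already a multiple of
   align_bytes; per the comment above, this must agree with what
   ix86_compute_frame_layout computes. */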
10616 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10618 if (flag_stack_usage_info)
10620 /* We start to count from ARG_POINTER. */
10621 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10623 /* If it was realigned, take into account the fake frame. */
10624 if (stack_realign_drap)
10626 if (ix86_static_chain_on_stack)
10627 stack_size += UNITS_PER_WORD;
10629 if (!call_used_regs[REGNO (crtl->drap_reg)])
10630 stack_size += UNITS_PER_WORD;
10632 /* This over-estimates by 1 minimal-stack-alignment-unit but
10633 mitigates that by counting in the new return address slot. */
10634 current_function_dynamic_stack_size
10635 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10638 current_function_static_stack_size = stack_size;
10641 /* On SEH target with very large frame size, allocate an area to save
10642 SSE registers (as the very large allocation won't be described). */
10643 if (TARGET_SEH
10644 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
10645 && !sse_registers_saved)
10647 HOST_WIDE_INT sse_size =
10648 frame.sse_reg_save_offset - frame.reg_save_offset;
10650 gcc_assert (int_registers_saved);
10652 /* No need to do stack checking as the area will be immediately
10653 written. */
10654 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10655 GEN_INT (-sse_size), -1,
10656 m->fs.cfa_reg == stack_pointer_rtx);
10657 allocate -= sse_size;
10658 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10659 sse_registers_saved = true;
10662 /* The stack has already been decremented by the instruction calling us
10663 so probe if the size is non-negative to preserve the protection area. */
10664 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10666 /* We expect the registers to be saved when probes are used. */
10667 gcc_assert (int_registers_saved);
10669 if (STACK_CHECK_MOVING_SP)
10671 ix86_adjust_stack_and_probe (allocate);
10672 allocate = 0;
10674 else
10676 HOST_WIDE_INT size = allocate;
10678 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10679 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
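/* The clamp above (applied for 64-bit targets with very large frames)
   keeps size + STACK_CHECK_PROTECT below 2^31, presumably so the
   emitted probe offsets remain representable as 32-bit displacements. */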
10681 if (TARGET_STACK_PROBE)
10682 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10683 else
10684 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10688 if (allocate == 0)
10690 else if (!ix86_target_stack_probe ()
10691 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10693 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10694 GEN_INT (-allocate), -1,
10695 m->fs.cfa_reg == stack_pointer_rtx);
10697 else
10699 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10700 rtx r10 = NULL;
10701 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10702 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
10703 bool eax_live = false;
10704 bool r10_live = false;
10706 if (TARGET_64BIT)
10707 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10708 if (!TARGET_64BIT_MS_ABI)
10709 eax_live = ix86_eax_live_at_start_p ();
10711 /* Note that SEH directives need to continue tracking the stack
10712 pointer even after the frame pointer has been set up. */
10713 if (eax_live)
10715 insn = emit_insn (gen_push (eax));
10716 allocate -= UNITS_PER_WORD;
10717 if (sp_is_cfa_reg || TARGET_SEH)
10719 if (sp_is_cfa_reg)
10720 m->fs.cfa_offset += UNITS_PER_WORD;
10721 RTX_FRAME_RELATED_P (insn) = 1;
10725 if (r10_live)
10727 r10 = gen_rtx_REG (Pmode, R10_REG);
10728 insn = emit_insn (gen_push (r10));
10729 allocate -= UNITS_PER_WORD;
10730 if (sp_is_cfa_reg || TARGET_SEH)
10732 if (sp_is_cfa_reg)
10733 m->fs.cfa_offset += UNITS_PER_WORD;
10734 RTX_FRAME_RELATED_P (insn) = 1;
10738 emit_move_insn (eax, GEN_INT (allocate));
10739 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10741 /* Use the fact that AX still contains ALLOCATE. */
10742 adjust_stack_insn = (Pmode == DImode
10743 ? gen_pro_epilogue_adjust_stack_di_sub
10744 : gen_pro_epilogue_adjust_stack_si_sub);
10746 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10747 stack_pointer_rtx, eax));
10749 if (sp_is_cfa_reg || TARGET_SEH)
10751 if (sp_is_cfa_reg)
10752 m->fs.cfa_offset += allocate;
10753 RTX_FRAME_RELATED_P (insn) = 1;
10754 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10755 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10756 plus_constant (Pmode, stack_pointer_rtx,
10757 -allocate)));
10759 m->fs.sp_offset += allocate;
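/* If EAX and/or R10 were pushed above, those pushes happened before
   the large allocation, so their save slots now sit just above the
   newly allocated block, at CFA offsets sp_offset - allocate and
   sp_offset - allocate - UNITS_PER_WORD; the code below reloads them
   from there. */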
10761 if (r10_live && eax_live)
10763 t = choose_baseaddr (m->fs.sp_offset - allocate);
10764 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10765 gen_frame_mem (word_mode, t));
10766 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10767 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10768 gen_frame_mem (word_mode, t));
10770 else if (eax_live || r10_live)
10772 t = choose_baseaddr (m->fs.sp_offset - allocate);
10773 emit_move_insn (gen_rtx_REG (word_mode,
10774 (eax_live ? AX_REG : R10_REG)),
10775 gen_frame_mem (word_mode, t));
10778 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10780 /* If we haven't already set up the frame pointer, do so now. */
10781 if (frame_pointer_needed && !m->fs.fp_valid)
10783 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10784 GEN_INT (frame.stack_pointer_offset
10785 - frame.hard_frame_pointer_offset));
10786 insn = emit_insn (insn);
10787 RTX_FRAME_RELATED_P (insn) = 1;
10788 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10790 if (m->fs.cfa_reg == stack_pointer_rtx)
10791 m->fs.cfa_reg = hard_frame_pointer_rtx;
10792 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10793 m->fs.fp_valid = true;
10796 if (!int_registers_saved)
10797 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10798 if (!sse_registers_saved)
10799 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10801 pic_reg_used = false;
10802 /* We don't use pic-register for pe-coff target. */
10803 if (pic_offset_table_rtx
10804 && !TARGET_PECOFF
10805 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10806 || crtl->profile))
10808 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10810 if (alt_pic_reg_used != INVALID_REGNUM)
10811 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10813 pic_reg_used = true;
10816 if (pic_reg_used)
10818 if (TARGET_64BIT)
10820 if (ix86_cmodel == CM_LARGE_PIC)
10822 rtx label, tmp_reg;
10824 gcc_assert (Pmode == DImode);
10825 label = gen_label_rtx ();
10826 emit_label (label);
10827 LABEL_PRESERVE_P (label) = 1;
10828 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
10829 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10830 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
10831 label));
10832 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10833 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
10834 pic_offset_table_rtx, tmp_reg));
10836 else
10837 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10839 else
10841 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10842 RTX_FRAME_RELATED_P (insn) = 1;
10843 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10847 /* In the pic_reg_used case, make sure that the got load isn't deleted
10848 when mcount needs it. Blockage to avoid call movement across mcount
10849 call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10850 note. */
10851 if (crtl->profile && !flag_fentry && pic_reg_used)
10852 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10854 if (crtl->drap_reg && !crtl->stack_realign_needed)
10856 /* vDRAP is set up, but after reload it turns out stack realignment
10857 isn't necessary; here we emit the prologue to set up DRAP
10858 without the stack realignment adjustment. */
10859 t = choose_baseaddr (0);
10860 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10863 /* Prevent instructions from being scheduled into the register save push
10864 sequence when access to the red zone area is done through the frame pointer.
10865 The offset between the frame pointer and the stack pointer is calculated
10866 relative to the value of the stack pointer at the end of the function
10867 prologue, and moving instructions that access the red zone area via the
10868 frame pointer inside the push sequence violates this assumption. */
10869 if (frame_pointer_needed && frame.red_zone_size)
10870 emit_insn (gen_memory_blockage ());
10872 /* Emit cld instruction if stringops are used in the function. */
10873 if (TARGET_CLD && ix86_current_function_needs_cld)
10874 emit_insn (gen_cld ());
10876 /* SEH requires that the prologue end within 256 bytes of the start of
10877 the function. Prevent instruction schedules that would extend that.
10878 Further, prevent alloca modifications to the stack pointer from being
10879 combined with prologue modifications. */
10880 if (TARGET_SEH)
10881 emit_insn (gen_prologue_use (stack_pointer_rtx));
10884 /* Emit code to restore REG using a POP insn. */
10886 static void
10887 ix86_emit_restore_reg_using_pop (rtx reg)
10889 struct machine_function *m = cfun->machine;
10890 rtx insn = emit_insn (gen_pop (reg));
10892 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10893 m->fs.sp_offset -= UNITS_PER_WORD;
10895 if (m->fs.cfa_reg == crtl->drap_reg
10896 && REGNO (reg) == REGNO (crtl->drap_reg))
10898 /* Previously we'd represented the CFA as an expression
10899 like *(%ebp - 8). We've just popped that value from
10900 the stack, which means we need to reset the CFA to
10901 the drap register. This will remain until we restore
10902 the stack pointer. */
10903 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10904 RTX_FRAME_RELATED_P (insn) = 1;
10906 /* This means that the DRAP register is valid for addressing too. */
10907 m->fs.drap_valid = true;
10908 return;
10911 if (m->fs.cfa_reg == stack_pointer_rtx)
10913 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
10914 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10915 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10916 RTX_FRAME_RELATED_P (insn) = 1;
10918 m->fs.cfa_offset -= UNITS_PER_WORD;
10921 /* When the frame pointer is the CFA, and we pop it, we are
10922 swapping back to the stack pointer as the CFA. This happens
10923 for stack frames that don't allocate other data, so we assume
10924 the stack pointer is now pointing at the return address, i.e.
10925 the function entry state, which makes the offset be 1 word. */
10926 if (reg == hard_frame_pointer_rtx)
10928 m->fs.fp_valid = false;
10929 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10931 m->fs.cfa_reg = stack_pointer_rtx;
10932 m->fs.cfa_offset -= UNITS_PER_WORD;
10934 add_reg_note (insn, REG_CFA_DEF_CFA,
10935 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10936 GEN_INT (m->fs.cfa_offset)));
10937 RTX_FRAME_RELATED_P (insn) = 1;
10942 /* Emit code to restore saved registers using POP insns. */
10944 static void
10945 ix86_emit_restore_regs_using_pop (void)
10947 unsigned int regno;
10949 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10950 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10951 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
10954 /* Emit code and notes for the LEAVE instruction. */
10956 static void
10957 ix86_emit_leave (void)
10959 struct machine_function *m = cfun->machine;
10960 rtx insn = emit_insn (ix86_gen_leave ());
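/* "leave" behaves like "mov %hard_fp, %sp" followed by "pop %hard_fp",
   which is why the stack pointer becomes valid at fp_offset minus one
   word and the frame pointer ceases to be valid below. */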
10962 ix86_add_queued_cfa_restore_notes (insn);
10964 gcc_assert (m->fs.fp_valid);
10965 m->fs.sp_valid = true;
10966 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10967 m->fs.fp_valid = false;
10969 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10971 m->fs.cfa_reg = stack_pointer_rtx;
10972 m->fs.cfa_offset = m->fs.sp_offset;
10974 add_reg_note (insn, REG_CFA_DEF_CFA,
10975 plus_constant (Pmode, stack_pointer_rtx,
10976 m->fs.sp_offset));
10977 RTX_FRAME_RELATED_P (insn) = 1;
10979 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10980 m->fs.fp_offset);
10983 /* Emit code to restore saved registers using MOV insns.
10984 The first register is restored from CFA - CFA_OFFSET. */
10985 static void
10986 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10987 bool maybe_eh_return)
10989 struct machine_function *m = cfun->machine;
10990 unsigned int regno;
10992 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10993 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10995 rtx reg = gen_rtx_REG (word_mode, regno);
10996 rtx insn, mem;
10998 mem = choose_baseaddr (cfa_offset);
10999 mem = gen_frame_mem (word_mode, mem);
11000 insn = emit_move_insn (reg, mem);
11002 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
11004 /* Previously we'd represented the CFA as an expression
11005 like *(%ebp - 8). We've just loaded that value from
11006 the stack, which means we need to reset the CFA to
11007 the drap register. This will remain until we restore
11008 the stack pointer. */
11009 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
11010 RTX_FRAME_RELATED_P (insn) = 1;
11012 /* This means that the DRAP register is valid for addressing. */
11013 m->fs.drap_valid = true;
11015 else
11016 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
11018 cfa_offset -= UNITS_PER_WORD;
11022 /* Emit code to restore saved SSE registers using MOV insns.
11023 The first register is restored from CFA - CFA_OFFSET. */
11024 static void
11025 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
11026 bool maybe_eh_return)
11028 unsigned int regno;
11030 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11031 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
11033 rtx reg = gen_rtx_REG (V4SFmode, regno);
11034 rtx mem;
11036 mem = choose_baseaddr (cfa_offset);
11037 mem = gen_rtx_MEM (V4SFmode, mem);
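/* The SSE register save slots are expected to be 16-byte aligned, so
   the MEM is marked as 128-bit aligned below, which allows aligned
   vector moves to be used for the restore. */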
11038 set_mem_align (mem, 128);
11039 emit_move_insn (reg, mem);
11041 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
11043 cfa_offset -= 16;
11047 /* Restore function stack, frame, and registers. */
11049 void
11050 ix86_expand_epilogue (int style)
11052 struct machine_function *m = cfun->machine;
11053 struct machine_frame_state frame_state_save = m->fs;
11054 struct ix86_frame frame;
11055 bool restore_regs_via_mov;
11056 bool using_drap;
11058 ix86_finalize_stack_realign_flags ();
11059 ix86_compute_frame_layout (&frame);
11061 m->fs.sp_valid = (!frame_pointer_needed
11062 || (crtl->sp_is_unchanging
11063 && !stack_realign_fp));
11064 gcc_assert (!m->fs.sp_valid
11065 || m->fs.sp_offset == frame.stack_pointer_offset);
11067 /* The FP must be valid if the frame pointer is present. */
11068 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
11069 gcc_assert (!m->fs.fp_valid
11070 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
11072 /* We must have *some* valid pointer to the stack frame. */
11073 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
11075 /* The DRAP is never valid at this point. */
11076 gcc_assert (!m->fs.drap_valid);
11078 /* See the comment about red zone and frame
11079 pointer usage in ix86_expand_prologue. */
11080 if (frame_pointer_needed && frame.red_zone_size)
11081 emit_insn (gen_memory_blockage ());
11083 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
11084 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
11086 /* Determine the CFA offset of the end of the red-zone. */
11087 m->fs.red_zone_offset = 0;
11088 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
11090 /* The red-zone begins below the return address. */
11091 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
11093 /* When the register save area is in the aligned portion of
11094 the stack, determine the maximum runtime displacement that
11095 matches up with the aligned frame. */
11096 if (stack_realign_drap)
11097 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
11098 + UNITS_PER_WORD);
11101 /* Special care must be taken for the normal return case of a function
11102 using eh_return: the eax and edx registers are marked as saved, but
11103 not restored along this path. Adjust the save location to match. */
11104 if (crtl->calls_eh_return && style != 2)
11105 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
11107 /* EH_RETURN requires the use of moves to function properly. */
11108 if (crtl->calls_eh_return)
11109 restore_regs_via_mov = true;
11110 /* SEH requires the use of pops to identify the epilogue. */
11111 else if (TARGET_SEH)
11112 restore_regs_via_mov = false;
11113 /* If we're only restoring one register and sp is not valid, then
11114 use a move instruction to restore the register, since it's
11115 less work than reloading sp and popping the register. */
11116 else if (!m->fs.sp_valid && frame.nregs <= 1)
11117 restore_regs_via_mov = true;
11118 else if (TARGET_EPILOGUE_USING_MOVE
11119 && cfun->machine->use_fast_prologue_epilogue
11120 && (frame.nregs > 1
11121 || m->fs.sp_offset != frame.reg_save_offset))
11122 restore_regs_via_mov = true;
11123 else if (frame_pointer_needed
11124 && !frame.nregs
11125 && m->fs.sp_offset != frame.reg_save_offset)
11126 restore_regs_via_mov = true;
11127 else if (frame_pointer_needed
11128 && TARGET_USE_LEAVE
11129 && cfun->machine->use_fast_prologue_epilogue
11130 && frame.nregs == 1)
11131 restore_regs_via_mov = true;
11132 else
11133 restore_regs_via_mov = false;
11135 if (restore_regs_via_mov || frame.nsseregs)
11137 /* Ensure that the entire register save area is addressable via
11138 the stack pointer, if we will restore via sp. */
11139 if (TARGET_64BIT
11140 && m->fs.sp_offset > 0x7fffffff
11141 && !(m->fs.fp_valid || m->fs.drap_valid)
11142 && (frame.nsseregs + frame.nregs) != 0)
11144 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11145 GEN_INT (m->fs.sp_offset
11146 - frame.sse_reg_save_offset),
11147 style,
11148 m->fs.cfa_reg == stack_pointer_rtx);
11152 /* If there are any SSE registers to restore, then we have to do it
11153 via moves, since there's obviously no pop for SSE regs. */
11154 if (frame.nsseregs)
11155 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
11156 style == 2);
11158 if (restore_regs_via_mov)
11160 rtx t;
11162 if (frame.nregs)
11163 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
11165 /* eh_return epilogues need %ecx added to the stack pointer. */
11166 if (style == 2)
11168 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
11170 /* Stack align doesn't work with eh_return. */
11171 gcc_assert (!stack_realign_drap);
11172 /* Neither do regparm nested functions. */
11173 gcc_assert (!ix86_static_chain_on_stack);
11175 if (frame_pointer_needed)
11177 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
11178 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
11179 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
11181 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
11182 insn = emit_move_insn (hard_frame_pointer_rtx, t);
11184 /* Note that we use SA as a temporary CFA, as the return
11185 address is at the proper place relative to it. We
11186 pretend this happens at the FP restore insn because
11187 prior to this insn the FP would be stored at the wrong
11188 offset relative to SA, and after this insn we have no
11189 other reasonable register to use for the CFA. We don't
11190 bother resetting the CFA to the SP for the duration of
11191 the return insn. */
11192 add_reg_note (insn, REG_CFA_DEF_CFA,
11193 plus_constant (Pmode, sa, UNITS_PER_WORD));
11194 ix86_add_queued_cfa_restore_notes (insn);
11195 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
11196 RTX_FRAME_RELATED_P (insn) = 1;
11198 m->fs.cfa_reg = sa;
11199 m->fs.cfa_offset = UNITS_PER_WORD;
11200 m->fs.fp_valid = false;
11202 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
11203 const0_rtx, style, false);
11205 else
11207 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
11208 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
11209 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
11210 ix86_add_queued_cfa_restore_notes (insn);
11212 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
11213 if (m->fs.cfa_offset != UNITS_PER_WORD)
11215 m->fs.cfa_offset = UNITS_PER_WORD;
11216 add_reg_note (insn, REG_CFA_DEF_CFA,
11217 plus_constant (Pmode, stack_pointer_rtx,
11218 UNITS_PER_WORD));
11219 RTX_FRAME_RELATED_P (insn) = 1;
11222 m->fs.sp_offset = UNITS_PER_WORD;
11223 m->fs.sp_valid = true;
11226 else
11228 /* SEH requires that the function end with (1) a stack adjustment
11229 if necessary, (2) a sequence of pops, and (3) a return or
11230 jump instruction. Prevent insns from the function body from
11231 being scheduled into this sequence. */
11232 if (TARGET_SEH)
11234 /* Prevent a catch region from being adjacent to the standard
11235 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda
11236 nor several other flags that would be interesting to test are
11237 set up yet. */
11238 if (flag_non_call_exceptions)
11239 emit_insn (gen_nops (const1_rtx));
11240 else
11241 emit_insn (gen_blockage ());
11244 /* The first step is to deallocate the stack frame so that we can
11245 pop the registers. Also do it on the SEH target for a very large
11246 frame, as the emitted instructions aren't allowed by the ABI in
11247 epilogues. */
11248 if (!m->fs.sp_valid
11249 || (TARGET_SEH
11250 && (m->fs.sp_offset - frame.reg_save_offset
11251 >= SEH_MAX_FRAME_SIZE)))
11253 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
11254 GEN_INT (m->fs.fp_offset
11255 - frame.reg_save_offset),
11256 style, false);
11258 else if (m->fs.sp_offset != frame.reg_save_offset)
11260 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11261 GEN_INT (m->fs.sp_offset
11262 - frame.reg_save_offset),
11263 style,
11264 m->fs.cfa_reg == stack_pointer_rtx);
11267 ix86_emit_restore_regs_using_pop ();
11270 /* If we used a frame pointer and haven't already got rid of it,
11271 then do so now. */
11272 if (m->fs.fp_valid)
11274 /* If the stack pointer is valid and pointing at the frame
11275 pointer store address, then we only need a pop. */
11276 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
11277 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11278 /* Leave results in shorter dependency chains on CPUs that are
11279 able to grok it fast. */
11280 else if (TARGET_USE_LEAVE
11281 || optimize_bb_for_size_p (EXIT_BLOCK_PTR)
11282 || !cfun->machine->use_fast_prologue_epilogue)
11283 ix86_emit_leave ();
11284 else
11286 pro_epilogue_adjust_stack (stack_pointer_rtx,
11287 hard_frame_pointer_rtx,
11288 const0_rtx, style, !using_drap);
11289 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11293 if (using_drap)
11295 int param_ptr_offset = UNITS_PER_WORD;
11296 rtx insn;
11298 gcc_assert (stack_realign_drap);
11300 if (ix86_static_chain_on_stack)
11301 param_ptr_offset += UNITS_PER_WORD;
11302 if (!call_used_regs[REGNO (crtl->drap_reg)])
11303 param_ptr_offset += UNITS_PER_WORD;
11305 insn = emit_insn (gen_rtx_SET
11306 (VOIDmode, stack_pointer_rtx,
11307 gen_rtx_PLUS (Pmode,
11308 crtl->drap_reg,
11309 GEN_INT (-param_ptr_offset))));
11310 m->fs.cfa_reg = stack_pointer_rtx;
11311 m->fs.cfa_offset = param_ptr_offset;
11312 m->fs.sp_offset = param_ptr_offset;
11313 m->fs.realigned = false;
11315 add_reg_note (insn, REG_CFA_DEF_CFA,
11316 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11317 GEN_INT (param_ptr_offset)));
11318 RTX_FRAME_RELATED_P (insn) = 1;
11320 if (!call_used_regs[REGNO (crtl->drap_reg)])
11321 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
11324 /* At this point the stack pointer must be valid, and we must have
11325 restored all of the registers. We may not have deallocated the
11326 entire stack frame. We've delayed this until now because it may
11327 be possible to merge the local stack deallocation with the
11328 deallocation forced by ix86_static_chain_on_stack. */
11329 gcc_assert (m->fs.sp_valid);
11330 gcc_assert (!m->fs.fp_valid);
11331 gcc_assert (!m->fs.realigned);
11332 if (m->fs.sp_offset != UNITS_PER_WORD)
11334 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11335 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
11336 style, true);
11338 else
11339 ix86_add_queued_cfa_restore_notes (get_last_insn ());
11341 /* Sibcall epilogues don't want a return instruction. */
11342 if (style == 0)
11344 m->fs = frame_state_save;
11345 return;
11348 if (crtl->args.pops_args && crtl->args.size)
11350 rtx popc = GEN_INT (crtl->args.pops_args);
11352 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
11353 address, do an explicit add, and jump indirectly to the caller. */
11355 if (crtl->args.pops_args >= 65536)
11357 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11358 rtx insn;
11360 /* There is no "pascal" calling convention in any 64bit ABI. */
11361 gcc_assert (!TARGET_64BIT);
11363 insn = emit_insn (gen_pop (ecx));
11364 m->fs.cfa_offset -= UNITS_PER_WORD;
11365 m->fs.sp_offset -= UNITS_PER_WORD;
11367 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11368 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11369 add_reg_note (insn, REG_CFA_REGISTER,
11370 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11371 RTX_FRAME_RELATED_P (insn) = 1;
11373 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11374 popc, -1, true);
11375 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11377 else
11378 emit_jump_insn (gen_simple_return_pop_internal (popc));
11380 else
11381 emit_jump_insn (gen_simple_return_internal ());
11383 /* Restore the state back to the state from the prologue,
11384 so that it's correct for the next epilogue. */
11385 m->fs = frame_state_save;
11388 /* Reset from the function's potential modifications. */
11390 static void
11391 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11392 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11394 if (pic_offset_table_rtx)
11395 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11396 #if TARGET_MACHO
11397 /* Mach-O doesn't support labels at the end of objects, so if
11398 it looks like we might want one, insert a NOP. */
11400 rtx insn = get_last_insn ();
11401 rtx deleted_debug_label = NULL_RTX;
11402 while (insn
11403 && NOTE_P (insn)
11404 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11406 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
11407 notes; only set their CODE_LABEL_NUMBER to -1, as otherwise
11408 there would be code generation differences
11409 between -g and -g0. */
11410 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11411 deleted_debug_label = insn;
11412 insn = PREV_INSN (insn);
11414 if (insn
11415 && (LABEL_P (insn)
11416 || (NOTE_P (insn)
11417 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11418 fputs ("\tnop\n", file);
11419 else if (deleted_debug_label)
11420 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11421 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11422 CODE_LABEL_NUMBER (insn) = -1;
11424 #endif
11428 /* Return a scratch register to use in the split stack prologue. The
11429 split stack prologue is used for -fsplit-stack. It consists of the first
11430 instructions in the function, even before the regular prologue.
11431 The scratch register can be any caller-saved register which is not
11432 used for parameters or for the static chain. */
11434 static unsigned int
11435 split_stack_prologue_scratch_regno (void)
11437 if (TARGET_64BIT)
11438 return R11_REG;
11439 else
11441 bool is_fastcall, is_thiscall;
11442 int regparm;
11444 is_fastcall = (lookup_attribute ("fastcall",
11445 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11446 != NULL);
11447 is_thiscall = (lookup_attribute ("thiscall",
11448 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11449 != NULL);
11450 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11452 if (is_fastcall)
11454 if (DECL_STATIC_CHAIN (cfun->decl))
11456 sorry ("-fsplit-stack does not support fastcall with "
11457 "nested function");
11458 return INVALID_REGNUM;
11460 return AX_REG;
11462 else if (is_thiscall)
11464 if (!DECL_STATIC_CHAIN (cfun->decl))
11465 return DX_REG;
11466 return AX_REG;
11468 else if (regparm < 3)
11470 if (!DECL_STATIC_CHAIN (cfun->decl))
11471 return CX_REG;
11472 else
11474 if (regparm >= 2)
11476 sorry ("-fsplit-stack does not support 2 register "
11477 " parameters for a nested function");
11478 return INVALID_REGNUM;
11480 return DX_REG;
11483 else
11485 /* FIXME: We could make this work by pushing a register
11486 around the addition and comparison. */
11487 sorry ("-fsplit-stack does not support 3 register parameters");
11488 return INVALID_REGNUM;
11493 /* A SYMBOL_REF for the function which allocates new stack space for
11494 -fsplit-stack. */
11496 static GTY(()) rtx split_stack_fn;
11498 /* A SYMBOL_REF for the more stack function when using the large
11499 model. */
11501 static GTY(()) rtx split_stack_fn_large;
11503 /* Handle -fsplit-stack. These are the first instructions in the
11504 function, even before the regular prologue. */
11506 void
11507 ix86_expand_split_stack_prologue (void)
11509 struct ix86_frame frame;
11510 HOST_WIDE_INT allocate;
11511 unsigned HOST_WIDE_INT args_size;
11512 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11513 rtx scratch_reg = NULL_RTX;
11514 rtx varargs_label = NULL_RTX;
11515 rtx fn;
11517 gcc_assert (flag_split_stack && reload_completed);
11519 ix86_finalize_stack_realign_flags ();
11520 ix86_compute_frame_layout (&frame);
11521 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11523 /* This is the label we will branch to if we have enough stack
11524 space. We expect the basic block reordering pass to reverse this
11525 branch if optimizing, so that we branch in the unlikely case. */
11526 label = gen_label_rtx ();
11528 /* We need to compare the stack pointer minus the frame size with
11529 the stack boundary in the TCB. The stack boundary always gives
11530 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11531 can compare directly. Otherwise we need to do an addition. */
11533 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11534 UNSPEC_STACK_CHECK);
11535 limit = gen_rtx_CONST (Pmode, limit);
11536 limit = gen_rtx_MEM (Pmode, limit);
11537 if (allocate < SPLIT_STACK_AVAILABLE)
11538 current = stack_pointer_rtx;
11539 else
11541 unsigned int scratch_regno;
11542 rtx offset;
11544 /* We need a scratch register to hold the stack pointer minus
11545 the required frame size. Since this is the very start of the
11546 function, the scratch register can be any caller-saved
11547 register which is not used for parameters. */
11548 offset = GEN_INT (- allocate);
11549 scratch_regno = split_stack_prologue_scratch_regno ();
11550 if (scratch_regno == INVALID_REGNUM)
11551 return;
11552 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11553 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11555 /* We don't use ix86_gen_add3 in this case because it will
11556 want to split to lea, but when not optimizing the insn
11557 will not be split after this point. */
11558 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11559 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11560 offset)));
11562 else
11564 emit_move_insn (scratch_reg, offset);
11565 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11566 stack_pointer_rtx));
11568 current = scratch_reg;
11571 ix86_expand_branch (GEU, current, limit, label);
11572 jump_insn = get_last_insn ();
11573 JUMP_LABEL (jump_insn) = label;
11575 /* Mark the jump as very likely to be taken. */
11576 add_int_reg_note (jump_insn, REG_BR_PROB,
11577 REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100);
11579 if (split_stack_fn == NULL_RTX)
11580 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11581 fn = split_stack_fn;
11583 /* Get more stack space. We pass in the desired stack space and the
11584 size of the arguments to copy to the new stack. In 32-bit mode
11585 we push the parameters; __morestack will return on a new stack
11586 anyhow. In 64-bit mode we pass the parameters in r10 and
11587 r11. */
11588 allocate_rtx = GEN_INT (allocate);
11589 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11590 call_fusage = NULL_RTX;
11591 if (TARGET_64BIT)
11593 rtx reg10, reg11;
11595 reg10 = gen_rtx_REG (Pmode, R10_REG);
11596 reg11 = gen_rtx_REG (Pmode, R11_REG);
11598 /* If this function uses a static chain, it will be in %r10.
11599 Preserve it across the call to __morestack. */
11600 if (DECL_STATIC_CHAIN (cfun->decl))
11602 rtx rax;
11604 rax = gen_rtx_REG (word_mode, AX_REG);
11605 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11606 use_reg (&call_fusage, rax);
11609 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11610 && !TARGET_PECOFF)
11612 HOST_WIDE_INT argval;
11614 gcc_assert (Pmode == DImode);
11615 /* When using the large model we need to load the address
11616 into a register, and we've run out of registers. So we
11617 switch to a different calling convention, and we call a
11618 different function: __morestack_large_model. We pass the
11619 argument size in the upper 32 bits of r10 and pass the
11620 frame size in the lower 32 bits. */
11621 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11622 gcc_assert ((args_size & 0xffffffff) == args_size);
11624 if (split_stack_fn_large == NULL_RTX)
11625 split_stack_fn_large =
11626 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11628 if (ix86_cmodel == CM_LARGE_PIC)
11630 rtx label, x;
11632 label = gen_label_rtx ();
11633 emit_label (label);
11634 LABEL_PRESERVE_P (label) = 1;
11635 emit_insn (gen_set_rip_rex64 (reg10, label));
11636 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11637 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11638 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11639 UNSPEC_GOT);
11640 x = gen_rtx_CONST (Pmode, x);
11641 emit_move_insn (reg11, x);
11642 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11643 x = gen_const_mem (Pmode, x);
11644 emit_move_insn (reg11, x);
11646 else
11647 emit_move_insn (reg11, split_stack_fn_large);
11649 fn = reg11;
11651 argval = ((args_size << 16) << 16) + allocate;
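/* For illustration (hypothetical values): with args_size == 0x20 and
   allocate == 0x1000 this yields argval == 0x0000002000001000, so the
   high half of %r10 carries the argument size and the low half the
   frame size. The shift is written as two 16-bit shifts rather than a
   single shift by 32. */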
11652 emit_move_insn (reg10, GEN_INT (argval));
11654 else
11656 emit_move_insn (reg10, allocate_rtx);
11657 emit_move_insn (reg11, GEN_INT (args_size));
11658 use_reg (&call_fusage, reg11);
11661 use_reg (&call_fusage, reg10);
11663 else
11665 emit_insn (gen_push (GEN_INT (args_size)));
11666 emit_insn (gen_push (allocate_rtx));
11668 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11669 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11670 NULL_RTX, false);
11671 add_function_usage_to (call_insn, call_fusage);
11673 /* In order to make call/return prediction work right, we now need
11674 to execute a return instruction. See
11675 libgcc/config/i386/morestack.S for the details on how this works.
11677 For flow purposes gcc must not see this as a return
11678 instruction--we need control flow to continue at the subsequent
11679 label. Therefore, we use an unspec. */
11680 gcc_assert (crtl->args.pops_args < 65536);
11681 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11683 /* If we are in 64-bit mode and this function uses a static chain,
11684 we saved %r10 in %rax before calling __morestack. */
11685 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11686 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11687 gen_rtx_REG (word_mode, AX_REG));
11689 /* If this function calls va_start, we need to store a pointer to
11690 the arguments on the old stack, because they may not have been
11691 all copied to the new stack. At this point the old stack can be
11692 found at the frame pointer value used by __morestack, because
11693 __morestack has set that up before calling back to us. Here we
11694 store that pointer in a scratch register, and in
11695 ix86_expand_prologue we store the scratch register in a stack
11696 slot. */
11697 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11699 unsigned int scratch_regno;
11700 rtx frame_reg;
11701 int words;
11703 scratch_regno = split_stack_prologue_scratch_regno ();
11704 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11705 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11707 /* 64-bit:
11708 fp -> old fp value
11709 return address within this function
11710 return address of caller of this function
11711 stack arguments
11712 So we add three words to get to the stack arguments.
11714 32-bit:
11715 fp -> old fp value
11716 return address within this function
11717 first argument to __morestack
11718 second argument to __morestack
11719 return address of caller of this function
11720 stack arguments
11721 So we add five words to get to the stack arguments.
11723 words = TARGET_64BIT ? 3 : 5;
11724 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11725 gen_rtx_PLUS (Pmode, frame_reg,
11726 GEN_INT (words * UNITS_PER_WORD))));
11728 varargs_label = gen_label_rtx ();
11729 emit_jump_insn (gen_jump (varargs_label));
11730 JUMP_LABEL (get_last_insn ()) = varargs_label;
11732 emit_barrier ();
11735 emit_label (label);
11736 LABEL_NUSES (label) = 1;
11738 /* If this function calls va_start, we now have to set the scratch
11739 register for the case where we do not call __morestack. In this
11740 case we need to set it based on the stack pointer. */
11741 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11743 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11744 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11745 GEN_INT (UNITS_PER_WORD))));
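/* Without a call to __morestack the old stack is the current stack,
   so the incoming stack arguments start one word above the current
   stack pointer, just past the return address. */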
11747 emit_label (varargs_label);
11748 LABEL_NUSES (varargs_label) = 1;
11752 /* We may have to tell the dataflow pass that the split stack prologue
11753 is initializing a scratch register. */
11755 static void
11756 ix86_live_on_entry (bitmap regs)
11758 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11760 gcc_assert (flag_split_stack);
11761 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11765 /* Determine if op is a suitable SUBREG RTX for an address. */
11767 static bool
11768 ix86_address_subreg_operand (rtx op)
11770 enum machine_mode mode;
11772 if (!REG_P (op))
11773 return false;
11775 mode = GET_MODE (op);
11777 if (GET_MODE_CLASS (mode) != MODE_INT)
11778 return false;
11780 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11781 failures when the register is one word out of a two word structure. */
11782 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11783 return false;
11785 /* Allow only SUBREGs of non-eliminable hard registers. */
11786 return register_no_elim_operand (op, mode);
11789 /* Extract the parts of an RTL expression that is a valid memory address
11790 for an instruction. Return 0 if the structure of the address is
11791 grossly off. Return -1 if the address contains ASHIFT, so it is not
11792 strictly valid, but still used for computing the length of the lea instruction. */
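/* For illustration, an address such as
     (plus (plus (mult (reg bx) (const_int 4)) (reg si)) (const_int 12))
   decomposes into base = %esi, index = %ebx, scale = 4 and disp = 12,
   i.e. the operand written as 12(%esi,%ebx,4) in AT&T syntax. */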
11795 ix86_decompose_address (rtx addr, struct ix86_address *out)
11797 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11798 rtx base_reg, index_reg;
11799 HOST_WIDE_INT scale = 1;
11800 rtx scale_rtx = NULL_RTX;
11801 rtx tmp;
11802 int retval = 1;
11803 enum ix86_address_seg seg = SEG_DEFAULT;
11805 /* Allow zero-extended SImode addresses;
11806 they will be emitted with the addr32 prefix. */
11807 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11809 if (GET_CODE (addr) == ZERO_EXTEND
11810 && GET_MODE (XEXP (addr, 0)) == SImode)
11812 addr = XEXP (addr, 0);
11813 if (CONST_INT_P (addr))
11814 return 0;
11816 else if (GET_CODE (addr) == AND
11817 && const_32bit_mask (XEXP (addr, 1), DImode))
11819 addr = simplify_gen_subreg (SImode, XEXP (addr, 0), DImode, 0);
11820 if (addr == NULL_RTX)
11821 return 0;
11823 if (CONST_INT_P (addr))
11824 return 0;
11828 /* Allow SImode subregs of DImode addresses;
11829 they will be emitted with the addr32 prefix. */
11830 if (TARGET_64BIT && GET_MODE (addr) == SImode)
11832 if (GET_CODE (addr) == SUBREG
11833 && GET_MODE (SUBREG_REG (addr)) == DImode)
11835 addr = SUBREG_REG (addr);
11836 if (CONST_INT_P (addr))
11837 return 0;
11841 if (REG_P (addr))
11842 base = addr;
11843 else if (GET_CODE (addr) == SUBREG)
11845 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11846 base = addr;
11847 else
11848 return 0;
11850 else if (GET_CODE (addr) == PLUS)
11852 rtx addends[4], op;
11853 int n = 0, i;
11855 op = addr;
11858 if (n >= 4)
11859 return 0;
11860 addends[n++] = XEXP (op, 1);
11861 op = XEXP (op, 0);
11863 while (GET_CODE (op) == PLUS);
11864 if (n >= 4)
11865 return 0;
11866 addends[n] = op;
11868 for (i = n; i >= 0; --i)
11870 op = addends[i];
11871 switch (GET_CODE (op))
11873 case MULT:
11874 if (index)
11875 return 0;
11876 index = XEXP (op, 0);
11877 scale_rtx = XEXP (op, 1);
11878 break;
11880 case ASHIFT:
11881 if (index)
11882 return 0;
11883 index = XEXP (op, 0);
11884 tmp = XEXP (op, 1);
11885 if (!CONST_INT_P (tmp))
11886 return 0;
11887 scale = INTVAL (tmp);
11888 if ((unsigned HOST_WIDE_INT) scale > 3)
11889 return 0;
11890 scale = 1 << scale;
11891 break;
11893 case ZERO_EXTEND:
11894 op = XEXP (op, 0);
11895 if (GET_CODE (op) != UNSPEC)
11896 return 0;
11897 /* FALLTHRU */
11899 case UNSPEC:
11900 if (XINT (op, 1) == UNSPEC_TP
11901 && TARGET_TLS_DIRECT_SEG_REFS
11902 && seg == SEG_DEFAULT)
11903 seg = DEFAULT_TLS_SEG_REG;
11904 else
11905 return 0;
11906 break;
11908 case SUBREG:
11909 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11910 return 0;
11911 /* FALLTHRU */
11913 case REG:
11914 if (!base)
11915 base = op;
11916 else if (!index)
11917 index = op;
11918 else
11919 return 0;
11920 break;
11922 case CONST:
11923 case CONST_INT:
11924 case SYMBOL_REF:
11925 case LABEL_REF:
11926 if (disp)
11927 return 0;
11928 disp = op;
11929 break;
11931 default:
11932 return 0;
11936 else if (GET_CODE (addr) == MULT)
11938 index = XEXP (addr, 0); /* index*scale */
11939 scale_rtx = XEXP (addr, 1);
11941 else if (GET_CODE (addr) == ASHIFT)
11943 /* We're called for lea too, which implements ashift on occasion. */
11944 index = XEXP (addr, 0);
11945 tmp = XEXP (addr, 1);
11946 if (!CONST_INT_P (tmp))
11947 return 0;
11948 scale = INTVAL (tmp);
11949 if ((unsigned HOST_WIDE_INT) scale > 3)
11950 return 0;
11951 scale = 1 << scale;
11952 retval = -1;
11954 else if (CONST_INT_P (addr))
11956 if (!x86_64_immediate_operand (addr, VOIDmode))
11957 return 0;
11959 /* Constant addresses are sign extended to 64 bits; we have to
11960 prevent addresses from 0x80000000 to 0xffffffff in x32 mode. */
11961 if (TARGET_X32
11962 && val_signbit_known_set_p (SImode, INTVAL (addr)))
11963 return 0;
11965 disp = addr;
11967 else
11968 disp = addr; /* displacement */
11970 if (index)
11972 if (REG_P (index))
11974 else if (GET_CODE (index) == SUBREG
11975 && ix86_address_subreg_operand (SUBREG_REG (index)))
11977 else
11978 return 0;
11981 /* Address override works only on the (%reg) part of %fs:(%reg). */
11982 if (seg != SEG_DEFAULT
11983 && ((base && GET_MODE (base) != word_mode)
11984 || (index && GET_MODE (index) != word_mode)))
11985 return 0;
11987 /* Extract the integral value of scale. */
11988 if (scale_rtx)
11990 if (!CONST_INT_P (scale_rtx))
11991 return 0;
11992 scale = INTVAL (scale_rtx);
11995 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11996 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11998 /* Avoid useless 0 displacement. */
11999 if (disp == const0_rtx && (base || index))
12000 disp = NULL_RTX;
12002 /* Allow the arg pointer and stack pointer as index if there is no scaling. */
12003 if (base_reg && index_reg && scale == 1
12004 && (index_reg == arg_pointer_rtx
12005 || index_reg == frame_pointer_rtx
12006 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
12008 rtx tmp;
12009 tmp = base, base = index, index = tmp;
12010 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
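/* The stack pointer cannot be encoded as an index register in a SIB
   byte, and the arg and frame pointers may later be eliminated to it,
   so when no scaling is involved such a register is moved into the
   base slot instead. */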
12013 /* Special case: %ebp cannot be encoded as a base without a displacement.
12014 Similarly %r13. */
12015 if (!disp
12016 && base_reg
12017 && (base_reg == hard_frame_pointer_rtx
12018 || base_reg == frame_pointer_rtx
12019 || base_reg == arg_pointer_rtx
12020 || (REG_P (base_reg)
12021 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
12022 || REGNO (base_reg) == R13_REG))))
12023 disp = const0_rtx;
12025 /* Special case: on K6, [%esi] makes the instruction vector decoded.
12026 Avoid this by transforming to [%esi+0].
12027 Reload calls address legitimization without cfun defined, so we need
12028 to test cfun for being non-NULL. */
12029 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
12030 && base_reg && !index_reg && !disp
12031 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
12032 disp = const0_rtx;
12034 /* Special case: encode reg+reg instead of reg*2. */
12035 if (!base && index && scale == 2)
12036 base = index, base_reg = index_reg, scale = 1;
12038 /* Special case: scaling cannot be encoded without base or displacement. */
12039 if (!base && !disp && index && scale != 1)
12040 disp = const0_rtx;
12042 out->base = base;
12043 out->index = index;
12044 out->disp = disp;
12045 out->scale = scale;
12046 out->seg = seg;
12048 return retval;
12051 /* Return cost of the memory address x.
12052 For i386, it is better to use a complex address than let gcc copy
12053 the address into a reg and make a new pseudo. But not if the address
12054 requires two regs - that would mean more pseudos with longer
12055 lifetimes. */
12056 static int
12057 ix86_address_cost (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED,
12058 addr_space_t as ATTRIBUTE_UNUSED,
12059 bool speed ATTRIBUTE_UNUSED)
12061 struct ix86_address parts;
12062 int cost = 1;
12063 int ok = ix86_decompose_address (x, &parts);
12065 gcc_assert (ok);
12067 if (parts.base && GET_CODE (parts.base) == SUBREG)
12068 parts.base = SUBREG_REG (parts.base);
12069 if (parts.index && GET_CODE (parts.index) == SUBREG)
12070 parts.index = SUBREG_REG (parts.index);
12072 /* Attempt to minimize number of registers in the address. */
12073 if ((parts.base
12074 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
12075 || (parts.index
12076 && (!REG_P (parts.index)
12077 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
12078 cost++;
12080 if (parts.base
12081 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
12082 && parts.index
12083 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
12084 && parts.base != parts.index)
12085 cost++;
12087 /* AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
12088 since its predecode logic can't detect the length of instructions
12089 and it degenerates to vector decoding. Increase the cost of such
12090 addresses here. The penalty is minimally 2 cycles. It may be worthwhile
12091 to split such addresses or even refuse such addresses at all.
12093 Following addressing modes are affected:
12094 [base+scale*index]
12095 [scale*index+disp]
12096 [base+index]
12098 The first and last case may be avoidable by explicitly coding the zero in
12099 the memory address, but I don't have an AMD-K6 machine handy to check this
12100 theory. */
12102 if (TARGET_K6
12103 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
12104 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
12105 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
12106 cost += 10;
12108 return cost;
12111 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
12112 this is used to form addresses to local data when -fPIC is in
12113 use. */
12115 static bool
12116 darwin_local_data_pic (rtx disp)
12118 return (GET_CODE (disp) == UNSPEC
12119 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
12122 /* Determine if a given RTX is a valid constant. We already know this
12123 satisfies CONSTANT_P. */
12125 static bool
12126 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
12128 switch (GET_CODE (x))
12130 case CONST:
12131 x = XEXP (x, 0);
12133 if (GET_CODE (x) == PLUS)
12135 if (!CONST_INT_P (XEXP (x, 1)))
12136 return false;
12137 x = XEXP (x, 0);
12140 if (TARGET_MACHO && darwin_local_data_pic (x))
12141 return true;
12143 /* Only some unspecs are valid as "constants". */
12144 if (GET_CODE (x) == UNSPEC)
12145 switch (XINT (x, 1))
12147 case UNSPEC_GOT:
12148 case UNSPEC_GOTOFF:
12149 case UNSPEC_PLTOFF:
12150 return TARGET_64BIT;
12151 case UNSPEC_TPOFF:
12152 case UNSPEC_NTPOFF:
12153 x = XVECEXP (x, 0, 0);
12154 return (GET_CODE (x) == SYMBOL_REF
12155 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12156 case UNSPEC_DTPOFF:
12157 x = XVECEXP (x, 0, 0);
12158 return (GET_CODE (x) == SYMBOL_REF
12159 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
12160 default:
12161 return false;
12164 /* We must have drilled down to a symbol. */
12165 if (GET_CODE (x) == LABEL_REF)
12166 return true;
12167 if (GET_CODE (x) != SYMBOL_REF)
12168 return false;
12169 /* FALLTHRU */
12171 case SYMBOL_REF:
12172 /* TLS symbols are never valid. */
12173 if (SYMBOL_REF_TLS_MODEL (x))
12174 return false;
12176 /* DLLIMPORT symbols are never valid. */
12177 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
12178 && SYMBOL_REF_DLLIMPORT_P (x))
12179 return false;
12181 #if TARGET_MACHO
12182 /* mdynamic-no-pic */
12183 if (MACHO_DYNAMIC_NO_PIC_P)
12184 return machopic_symbol_defined_p (x);
12185 #endif
12186 break;
12188 case CONST_DOUBLE:
12189 if (GET_MODE (x) == TImode
12190 && x != CONST0_RTX (TImode)
12191 && !TARGET_64BIT)
12192 return false;
12193 break;
12195 case CONST_VECTOR:
12196 if (!standard_sse_constant_p (x))
12197 return false;
12199 default:
12200 break;
12203 /* Otherwise we handle everything else in the move patterns. */
12204 return true;
12207 /* Determine if it's legal to put X into the constant pool. This
12208 is not possible for the address of thread-local symbols, which
12209 is checked above. */
12211 static bool
12212 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
12214 /* We can always put integral constants and vectors in memory. */
12215 switch (GET_CODE (x))
12217 case CONST_INT:
12218 case CONST_DOUBLE:
12219 case CONST_VECTOR:
12220 return false;
12222 default:
12223 break;
12225 return !ix86_legitimate_constant_p (mode, x);
12228 /* Nonzero if the symbol is marked as dllimport or as a stub-variable,
12229 otherwise zero. */
12231 static bool
12232 is_imported_p (rtx x)
12234 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
12235 || GET_CODE (x) != SYMBOL_REF)
12236 return false;
12238 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
12242 /* Nonzero if the constant value X is a legitimate general operand
12243 when generating PIC code. It is given that flag_pic is on and
12244 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
12246 bool
12247 legitimate_pic_operand_p (rtx x)
12249 rtx inner;
12251 switch (GET_CODE (x))
12253 case CONST:
12254 inner = XEXP (x, 0);
12255 if (GET_CODE (inner) == PLUS
12256 && CONST_INT_P (XEXP (inner, 1)))
12257 inner = XEXP (inner, 0);
12259 /* Only some unspecs are valid as "constants". */
12260 if (GET_CODE (inner) == UNSPEC)
12261 switch (XINT (inner, 1))
12263 case UNSPEC_GOT:
12264 case UNSPEC_GOTOFF:
12265 case UNSPEC_PLTOFF:
12266 return TARGET_64BIT;
12267 case UNSPEC_TPOFF:
12268 x = XVECEXP (inner, 0, 0);
12269 return (GET_CODE (x) == SYMBOL_REF
12270 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12271 case UNSPEC_MACHOPIC_OFFSET:
12272 return legitimate_pic_address_disp_p (x);
12273 default:
12274 return false;
12276 /* FALLTHRU */
12278 case SYMBOL_REF:
12279 case LABEL_REF:
12280 return legitimate_pic_address_disp_p (x);
12282 default:
12283 return true;
12287 /* Determine if a given CONST RTX is a valid memory displacement
12288 in PIC mode. */
12290 bool
12291 legitimate_pic_address_disp_p (rtx disp)
12293 bool saw_plus;
12295 /* In 64bit mode we can allow direct addresses of symbols and labels
12296 when they are not dynamic symbols. */
12297 if (TARGET_64BIT)
12299 rtx op0 = disp, op1;
12301 switch (GET_CODE (disp))
12303 case LABEL_REF:
12304 return true;
12306 case CONST:
12307 if (GET_CODE (XEXP (disp, 0)) != PLUS)
12308 break;
12309 op0 = XEXP (XEXP (disp, 0), 0);
12310 op1 = XEXP (XEXP (disp, 0), 1);
12311 if (!CONST_INT_P (op1)
12312 || INTVAL (op1) >= 16*1024*1024
12313 || INTVAL (op1) < -16*1024*1024)
12314 break;
12315 if (GET_CODE (op0) == LABEL_REF)
12316 return true;
12317 if (GET_CODE (op0) == CONST
12318 && GET_CODE (XEXP (op0, 0)) == UNSPEC
12319 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
12320 return true;
12321 if (GET_CODE (op0) == UNSPEC
12322 && XINT (op0, 1) == UNSPEC_PCREL)
12323 return true;
12324 if (GET_CODE (op0) != SYMBOL_REF)
12325 break;
12326 /* FALLTHRU */
12328 case SYMBOL_REF:
12329 /* TLS references should always be enclosed in UNSPEC.
12330 A dllimported symbol always needs to be resolved. */
12331 if (SYMBOL_REF_TLS_MODEL (op0)
12332 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
12333 return false;
12335 if (TARGET_PECOFF)
12337 if (is_imported_p (op0))
12338 return true;
12340 if (SYMBOL_REF_FAR_ADDR_P (op0)
12341 || !SYMBOL_REF_LOCAL_P (op0))
12342 break;
12344 /* Function symbols need to be resolved only for the
12345 large model.
12346 For the small model we don't need to resolve anything
12347 here. */
12348 if ((ix86_cmodel != CM_LARGE_PIC
12349 && SYMBOL_REF_FUNCTION_P (op0))
12350 || ix86_cmodel == CM_SMALL_PIC)
12351 return true;
12352 /* Non-external symbols don't need to be resolved for
12353 the large and medium models. */
12354 if ((ix86_cmodel == CM_LARGE_PIC
12355 || ix86_cmodel == CM_MEDIUM_PIC)
12356 && !SYMBOL_REF_EXTERNAL_P (op0))
12357 return true;
12359 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
12360 && SYMBOL_REF_LOCAL_P (op0)
12361 && ix86_cmodel != CM_LARGE_PIC)
12362 return true;
12363 break;
12365 default:
12366 break;
12369 if (GET_CODE (disp) != CONST)
12370 return false;
12371 disp = XEXP (disp, 0);
12373 if (TARGET_64BIT)
12375 /* It is unsafe to allow PLUS expressions. This limits the allowed
12376 distance of GOT tables. We should not need these anyway. */
12377 if (GET_CODE (disp) != UNSPEC
12378 || (XINT (disp, 1) != UNSPEC_GOTPCREL
12379 && XINT (disp, 1) != UNSPEC_GOTOFF
12380 && XINT (disp, 1) != UNSPEC_PCREL
12381 && XINT (disp, 1) != UNSPEC_PLTOFF))
12382 return false;
12384 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
12385 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
12386 return false;
12387 return true;
12390 saw_plus = false;
12391 if (GET_CODE (disp) == PLUS)
12393 if (!CONST_INT_P (XEXP (disp, 1)))
12394 return false;
12395 disp = XEXP (disp, 0);
12396 saw_plus = true;
12399 if (TARGET_MACHO && darwin_local_data_pic (disp))
12400 return true;
12402 if (GET_CODE (disp) != UNSPEC)
12403 return false;
12405 switch (XINT (disp, 1))
12407 case UNSPEC_GOT:
12408 if (saw_plus)
12409 return false;
12410 /* We need to check for both symbols and labels because VxWorks loads
12411 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
12412 details. */
12413 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12414 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12415 case UNSPEC_GOTOFF:
12416 /* Refuse GOTOFF in 64-bit mode since it is always 64-bit when used.
12417 The ABI also specifies a 32-bit relocation, but we don't produce it
12418 in the small PIC model at all. */
12419 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12420 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12421 && !TARGET_64BIT)
12422 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12423 return false;
12424 case UNSPEC_GOTTPOFF:
12425 case UNSPEC_GOTNTPOFF:
12426 case UNSPEC_INDNTPOFF:
12427 if (saw_plus)
12428 return false;
12429 disp = XVECEXP (disp, 0, 0);
12430 return (GET_CODE (disp) == SYMBOL_REF
12431 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12432 case UNSPEC_NTPOFF:
12433 disp = XVECEXP (disp, 0, 0);
12434 return (GET_CODE (disp) == SYMBOL_REF
12435 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12436 case UNSPEC_DTPOFF:
12437 disp = XVECEXP (disp, 0, 0);
12438 return (GET_CODE (disp) == SYMBOL_REF
12439 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12442 return false;
12445 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Returns a value to
12446 replace the input X, or the original X if no replacement is called for.
12447 The output parameter *WIN is 1 if the calling macro should goto WIN,
12448 0 if it should not. */
12450 bool
12451 ix86_legitimize_reload_address (rtx x,
12452 enum machine_mode mode ATTRIBUTE_UNUSED,
12453 int opnum, int type,
12454 int ind_levels ATTRIBUTE_UNUSED)
12456 /* Reload can generate:
12458 (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12459 (reg:DI 97))
12460 (reg:DI 2 cx))
12462 This RTX is rejected by ix86_legitimate_address_p due to the
12463 non-strictness of base register 97. Following this rejection,
12464 reload pushes all three components into separate registers,
12465 creating an invalid memory address RTX.
12467 The following code reloads only the invalid part of the
12468 memory address RTX. */
12470 if (GET_CODE (x) == PLUS
12471 && REG_P (XEXP (x, 1))
12472 && GET_CODE (XEXP (x, 0)) == PLUS
12473 && REG_P (XEXP (XEXP (x, 0), 1)))
12475 rtx base, index;
12476 bool something_reloaded = false;
12478 base = XEXP (XEXP (x, 0), 1);
12479 if (!REG_OK_FOR_BASE_STRICT_P (base))
12481 push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12482 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12483 opnum, (enum reload_type) type);
12484 something_reloaded = true;
12487 index = XEXP (x, 1);
12488 if (!REG_OK_FOR_INDEX_STRICT_P (index))
12490 push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12491 INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12492 opnum, (enum reload_type) type);
12493 something_reloaded = true;
12496 gcc_assert (something_reloaded);
12497 return true;
12500 return false;
12503 /* Recognizes RTL expressions that are valid memory addresses for an
12504 instruction. The MODE argument is the machine mode for the MEM
12505 expression that wants to use this address.
12507 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
12508 convert common non-canonical forms to canonical form so that they will
12509 be recognized. */
12511 static bool
12512 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12513 rtx addr, bool strict)
12515 struct ix86_address parts;
12516 rtx base, index, disp;
12517 HOST_WIDE_INT scale;
12519 if (ix86_decompose_address (addr, &parts) <= 0)
12520 /* Decomposition failed. */
12521 return false;
12523 base = parts.base;
12524 index = parts.index;
12525 disp = parts.disp;
12526 scale = parts.scale;
12528 /* Validate base register. */
12529 if (base)
12531 rtx reg;
12533 if (REG_P (base))
12534 reg = base;
12535 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
12536 reg = SUBREG_REG (base);
12537 else
12538 /* Base is not a register. */
12539 return false;
12541 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12542 return false;
12544 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12545 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12546 /* Base is not valid. */
12547 return false;
12550 /* Validate index register. */
12551 if (index)
12553 rtx reg;
12555 if (REG_P (index))
12556 reg = index;
12557 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12558 reg = SUBREG_REG (index);
12559 else
12560 /* Index is not a register. */
12561 return false;
12563 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12564 return false;
12566 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12567 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12568 /* Index is not valid. */
12569 return false;
12572 /* Index and base should have the same mode. */
12573 if (base && index
12574 && GET_MODE (base) != GET_MODE (index))
12575 return false;
12577 /* Validate scale factor. */
12578 if (scale != 1)
12580 if (!index)
12581 /* Scale without index. */
12582 return false;
12584 if (scale != 2 && scale != 4 && scale != 8)
12585 /* Scale is not a valid multiplier. */
12586 return false;
12589 /* Validate displacement. */
12590 if (disp)
12592 if (GET_CODE (disp) == CONST
12593 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12594 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12595 switch (XINT (XEXP (disp, 0), 1))
12597 /* Refuse GOTOFF and GOT in 64bit mode since they are always 64bit when
12598 used. While the ABI also specifies 32bit relocations, we don't produce
12599 them at all and use IP-relative addressing instead. */
12600 case UNSPEC_GOT:
12601 case UNSPEC_GOTOFF:
12602 gcc_assert (flag_pic);
12603 if (!TARGET_64BIT)
12604 goto is_legitimate_pic;
12606 /* 64bit address unspec. */
12607 return false;
12609 case UNSPEC_GOTPCREL:
12610 case UNSPEC_PCREL:
12611 gcc_assert (flag_pic);
12612 goto is_legitimate_pic;
12614 case UNSPEC_GOTTPOFF:
12615 case UNSPEC_GOTNTPOFF:
12616 case UNSPEC_INDNTPOFF:
12617 case UNSPEC_NTPOFF:
12618 case UNSPEC_DTPOFF:
12619 break;
12621 case UNSPEC_STACK_CHECK:
12622 gcc_assert (flag_split_stack);
12623 break;
12625 default:
12626 /* Invalid address unspec. */
12627 return false;
12630 else if (SYMBOLIC_CONST (disp)
12631 && (flag_pic
12632 || (TARGET_MACHO
12633 #if TARGET_MACHO
12634 && MACHOPIC_INDIRECT
12635 && !machopic_operand_p (disp)
12636 #endif
12640 is_legitimate_pic:
12641 if (TARGET_64BIT && (index || base))
12643 /* foo@dtpoff(%rX) is ok. */
12644 if (GET_CODE (disp) != CONST
12645 || GET_CODE (XEXP (disp, 0)) != PLUS
12646 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12647 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12648 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12649 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12650 /* Non-constant pic memory reference. */
12651 return false;
12653 else if ((!TARGET_MACHO || flag_pic)
12654 && ! legitimate_pic_address_disp_p (disp))
12655 /* Displacement is an invalid pic construct. */
12656 return false;
12657 #if TARGET_MACHO
12658 else if (MACHO_DYNAMIC_NO_PIC_P
12659 && !ix86_legitimate_constant_p (Pmode, disp))
12660 /* Displacement must be referenced via non_lazy_pointer. */
12661 return false;
12662 #endif
12664 /* This code used to verify that a symbolic pic displacement
12665 includes the pic_offset_table_rtx register.
12667 While this is a good idea, unfortunately these constructs may
12668 be created by the "adds using lea" optimization for incorrect
12669 code like:
12671 int a;
12672 int foo(int i)
12674 return *(&a+i);
12677 This code is nonsensical, but results in addressing the
12678 GOT table with a pic_offset_table_rtx base. We can't
12679 just refuse it easily, since it gets matched by the
12680 "addsi3" pattern, which later gets split to an lea when
12681 the output register differs from the input. While this
12682 could be handled by a separate addsi pattern for this case
12683 that never results in an lea, disabling this test seems to
12684 be the easier and correct fix for the crash. */
12686 else if (GET_CODE (disp) != LABEL_REF
12687 && !CONST_INT_P (disp)
12688 && (GET_CODE (disp) != CONST
12689 || !ix86_legitimate_constant_p (Pmode, disp))
12690 && (GET_CODE (disp) != SYMBOL_REF
12691 || !ix86_legitimate_constant_p (Pmode, disp)))
12692 /* Displacement is not constant. */
12693 return false;
12694 else if (TARGET_64BIT
12695 && !x86_64_immediate_operand (disp, VOIDmode))
12696 /* Displacement is out of range. */
12697 return false;
12700 /* Everything looks valid. */
12701 return true;
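/* A few concrete examples of the checks above (illustrative only):
       base register + constant displacement            -- valid
       base + index scaled by 2, 4 or 8                 -- valid
       index scaled by 3                                -- rejected: scale
                                                           must be 1, 2, 4 or 8
       SImode base combined with a DImode index         -- rejected: base and
                                                           index modes must match  */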
12704 /* Determine if a given RTX is a valid constant address. */
12706 bool
12707 constant_address_p (rtx x)
12709 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12712 /* Return a unique alias set for the GOT. */
12714 static alias_set_type
12715 ix86_GOT_alias_set (void)
12717 static alias_set_type set = -1;
12718 if (set == -1)
12719 set = new_alias_set ();
12720 return set;
12723 /* Return a legitimate reference for ORIG (an address) using the
12724 register REG. If REG is 0, a new pseudo is generated.
12726 There are two types of references that must be handled:
12728 1. Global data references must load the address from the GOT, via
12729 the PIC reg. An insn is emitted to do this load, and the reg is
12730 returned.
12732 2. Static data references, constant pool addresses, and code labels
12733 compute the address as an offset from the GOT, whose base is in
12734 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12735 differentiate them from global data objects. The returned
12736 address is the PIC reg + an unspec constant.
12738 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12739 reg also appears in the address. */
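/* For example (a sketch of typical -m32 -fpic code on GNU/Linux targets;
   the exact registers depend on the surrounding code):

     1. global data:  movl  foo@GOT(%ebx), %eax     # address loaded from GOT
     2. static data:  leal  bar@GOTOFF(%ebx), %eax  # PIC reg + unspec constant  */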
12741 static rtx
12742 legitimize_pic_address (rtx orig, rtx reg)
12744 rtx addr = orig;
12745 rtx new_rtx = orig;
12747 #if TARGET_MACHO
12748 if (TARGET_MACHO && !TARGET_64BIT)
12750 if (reg == 0)
12751 reg = gen_reg_rtx (Pmode);
12752 /* Use the generic Mach-O PIC machinery. */
12753 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12755 #endif
12757 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12759 rtx tmp = legitimize_pe_coff_symbol (addr, true);
12760 if (tmp)
12761 return tmp;
12764 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12765 new_rtx = addr;
12766 else if (TARGET_64BIT && !TARGET_PECOFF
12767 && ix86_cmodel != CM_SMALL_PIC && gotoff_operand (addr, Pmode))
12769 rtx tmpreg;
12770 /* This symbol may be referenced via a displacement from the PIC
12771 base address (@GOTOFF). */
12773 if (reload_in_progress)
12774 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12775 if (GET_CODE (addr) == CONST)
12776 addr = XEXP (addr, 0);
12777 if (GET_CODE (addr) == PLUS)
12779 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12780 UNSPEC_GOTOFF);
12781 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12783 else
12784 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12785 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12786 if (!reg)
12787 tmpreg = gen_reg_rtx (Pmode);
12788 else
12789 tmpreg = reg;
12790 emit_move_insn (tmpreg, new_rtx);
12792 if (reg != 0)
12794 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12795 tmpreg, 1, OPTAB_DIRECT);
12796 new_rtx = reg;
12798 else
12799 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12801 else if (!TARGET_64BIT && !TARGET_PECOFF && gotoff_operand (addr, Pmode))
12803 /* This symbol may be referenced via a displacement from the PIC
12804 base address (@GOTOFF). */
12806 if (reload_in_progress)
12807 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12808 if (GET_CODE (addr) == CONST)
12809 addr = XEXP (addr, 0);
12810 if (GET_CODE (addr) == PLUS)
12812 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12813 UNSPEC_GOTOFF);
12814 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12816 else
12817 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12818 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12819 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12821 if (reg != 0)
12823 emit_move_insn (reg, new_rtx);
12824 new_rtx = reg;
12827 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12828 /* We can't use @GOTOFF for text labels on VxWorks;
12829 see gotoff_operand. */
12830 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12832 rtx tmp = legitimize_pe_coff_symbol (addr, true);
12833 if (tmp)
12834 return tmp;
12836 /* For x64 PE-COFF there is no GOT table, so we use the address
12837 directly. */
12838 if (TARGET_64BIT && TARGET_PECOFF)
12840 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12841 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12843 if (reg == 0)
12844 reg = gen_reg_rtx (Pmode);
12845 emit_move_insn (reg, new_rtx);
12846 new_rtx = reg;
12848 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12850 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12851 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12852 new_rtx = gen_const_mem (Pmode, new_rtx);
12853 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12855 if (reg == 0)
12856 reg = gen_reg_rtx (Pmode);
12857 /* Use gen_movsi directly, otherwise the address is loaded
12858 into a register for CSE. We don't want to CSE these addresses;
12859 instead we CSE addresses from the GOT table, so skip this. */
12860 emit_insn (gen_movsi (reg, new_rtx));
12861 new_rtx = reg;
12863 else
12865 /* This symbol must be referenced via a load from the
12866 Global Offset Table (@GOT). */
12868 if (reload_in_progress)
12869 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12870 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12871 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12872 if (TARGET_64BIT)
12873 new_rtx = force_reg (Pmode, new_rtx);
12874 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12875 new_rtx = gen_const_mem (Pmode, new_rtx);
12876 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12878 if (reg == 0)
12879 reg = gen_reg_rtx (Pmode);
12880 emit_move_insn (reg, new_rtx);
12881 new_rtx = reg;
12884 else
12886 if (CONST_INT_P (addr)
12887 && !x86_64_immediate_operand (addr, VOIDmode))
12889 if (reg)
12891 emit_move_insn (reg, addr);
12892 new_rtx = reg;
12894 else
12895 new_rtx = force_reg (Pmode, addr);
12897 else if (GET_CODE (addr) == CONST)
12899 addr = XEXP (addr, 0);
12901 /* We must match stuff we generated earlier. Assume the only
12902 unspecs that can get here are ours. Not that we could do
12903 anything with them anyway.... */
12904 if (GET_CODE (addr) == UNSPEC
12905 || (GET_CODE (addr) == PLUS
12906 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12907 return orig;
12908 gcc_assert (GET_CODE (addr) == PLUS);
12910 if (GET_CODE (addr) == PLUS)
12912 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12914 /* Check first to see if this is a constant offset from a @GOTOFF
12915 symbol reference. */
12916 if (!TARGET_PECOFF && gotoff_operand (op0, Pmode)
12917 && CONST_INT_P (op1))
12919 if (!TARGET_64BIT)
12921 if (reload_in_progress)
12922 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12923 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12924 UNSPEC_GOTOFF);
12925 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12926 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12927 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12929 if (reg != 0)
12931 emit_move_insn (reg, new_rtx);
12932 new_rtx = reg;
12935 else
12937 if (INTVAL (op1) < -16*1024*1024
12938 || INTVAL (op1) >= 16*1024*1024)
12940 if (!x86_64_immediate_operand (op1, Pmode))
12941 op1 = force_reg (Pmode, op1);
12942 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12946 else
12948 rtx base = legitimize_pic_address (op0, reg);
12949 enum machine_mode mode = GET_MODE (base);
12950 new_rtx
12951 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
12953 if (CONST_INT_P (new_rtx))
12955 if (INTVAL (new_rtx) < -16*1024*1024
12956 || INTVAL (new_rtx) >= 16*1024*1024)
12958 if (!x86_64_immediate_operand (new_rtx, mode))
12959 new_rtx = force_reg (mode, new_rtx);
12960 new_rtx
12961 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
12963 else
12964 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
12966 else
12968 if (GET_CODE (new_rtx) == PLUS
12969 && CONSTANT_P (XEXP (new_rtx, 1)))
12971 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
12972 new_rtx = XEXP (new_rtx, 1);
12974 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
12979 return new_rtx;
12982 /* Load the thread pointer. If TO_REG is true, force it into a register. */
12984 static rtx
12985 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
12987 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12989 if (GET_MODE (tp) != tp_mode)
12991 gcc_assert (GET_MODE (tp) == SImode);
12992 gcc_assert (tp_mode == DImode);
12994 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
12997 if (to_reg)
12998 tp = copy_to_mode_reg (tp_mode, tp);
13000 return tp;
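/* E.g. on x32, where ptr_mode is SImode, a request for a DImode thread
   pointer yields roughly
       (zero_extend:DI (unspec:SI [(const_int 0)] UNSPEC_TP))
   and the unspec later prints as a %fs/%gs-relative access.
   Illustrative sketch only.  */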
13003 /* Construct the SYMBOL_REF for the tls_get_addr function. */
13005 static GTY(()) rtx ix86_tls_symbol;
13007 static rtx
13008 ix86_tls_get_addr (void)
13010 if (!ix86_tls_symbol)
13012 const char *sym
13013 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
13014 ? "___tls_get_addr" : "__tls_get_addr");
13016 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
13019 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
13021 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
13022 UNSPEC_PLTOFF);
13023 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
13024 gen_rtx_CONST (Pmode, unspec));
13027 return ix86_tls_symbol;
13030 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
13032 static GTY(()) rtx ix86_tls_module_base_symbol;
13035 ix86_tls_module_base (void)
13037 if (!ix86_tls_module_base_symbol)
13039 ix86_tls_module_base_symbol
13040 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
13042 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
13043 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
13046 return ix86_tls_module_base_symbol;
13049 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
13050 false if we expect this to be used for a memory address and true if
13051 we expect to load the address into a register. */
13053 static rtx
13054 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
13056 rtx dest, base, off;
13057 rtx pic = NULL_RTX, tp = NULL_RTX;
13058 enum machine_mode tp_mode = Pmode;
13059 int type;
13061 switch (model)
13063 case TLS_MODEL_GLOBAL_DYNAMIC:
13064 dest = gen_reg_rtx (Pmode);
13066 if (!TARGET_64BIT)
13068 if (flag_pic && !TARGET_PECOFF)
13069 pic = pic_offset_table_rtx;
13070 else
13072 pic = gen_reg_rtx (Pmode);
13073 emit_insn (gen_set_got (pic));
13077 if (TARGET_GNU2_TLS)
13079 if (TARGET_64BIT)
13080 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
13081 else
13082 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
13084 tp = get_thread_pointer (Pmode, true);
13085 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
13087 if (GET_MODE (x) != Pmode)
13088 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13090 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
13092 else
13094 rtx caddr = ix86_tls_get_addr ();
13096 if (TARGET_64BIT)
13098 rtx rax = gen_rtx_REG (Pmode, AX_REG);
13099 rtx insns;
13101 start_sequence ();
13102 emit_call_insn
13103 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
13104 insns = get_insns ();
13105 end_sequence ();
13107 if (GET_MODE (x) != Pmode)
13108 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13110 RTL_CONST_CALL_P (insns) = 1;
13111 emit_libcall_block (insns, dest, rax, x);
13113 else
13114 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
13116 break;
13118 case TLS_MODEL_LOCAL_DYNAMIC:
13119 base = gen_reg_rtx (Pmode);
13121 if (!TARGET_64BIT)
13123 if (flag_pic)
13124 pic = pic_offset_table_rtx;
13125 else
13127 pic = gen_reg_rtx (Pmode);
13128 emit_insn (gen_set_got (pic));
13132 if (TARGET_GNU2_TLS)
13134 rtx tmp = ix86_tls_module_base ();
13136 if (TARGET_64BIT)
13137 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
13138 else
13139 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
13141 tp = get_thread_pointer (Pmode, true);
13142 set_unique_reg_note (get_last_insn (), REG_EQUAL,
13143 gen_rtx_MINUS (Pmode, tmp, tp));
13145 else
13147 rtx caddr = ix86_tls_get_addr ();
13149 if (TARGET_64BIT)
13151 rtx rax = gen_rtx_REG (Pmode, AX_REG);
13152 rtx insns, eqv;
13154 start_sequence ();
13155 emit_call_insn
13156 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
13157 insns = get_insns ();
13158 end_sequence ();
13160 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
13161 share the LD_BASE result with other LD model accesses. */
13162 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
13163 UNSPEC_TLS_LD_BASE);
13165 RTL_CONST_CALL_P (insns) = 1;
13166 emit_libcall_block (insns, base, rax, eqv);
13168 else
13169 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
13172 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
13173 off = gen_rtx_CONST (Pmode, off);
13175 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
13177 if (TARGET_GNU2_TLS)
13179 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
13181 if (GET_MODE (x) != Pmode)
13182 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13184 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
13186 break;
13188 case TLS_MODEL_INITIAL_EXEC:
13189 if (TARGET_64BIT)
13191 if (TARGET_SUN_TLS && !TARGET_X32)
13193 /* The Sun linker took the AMD64 TLS spec literally
13194 and can only handle %rax as destination of the
13195 initial executable code sequence. */
13197 dest = gen_reg_rtx (DImode);
13198 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
13199 return dest;
13202 /* Generate DImode references to avoid %fs:(%reg32)
13203 problems and linker IE->LE relaxation bug. */
13204 tp_mode = DImode;
13205 pic = NULL;
13206 type = UNSPEC_GOTNTPOFF;
13208 else if (flag_pic)
13210 if (reload_in_progress)
13211 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
13212 pic = pic_offset_table_rtx;
13213 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
13215 else if (!TARGET_ANY_GNU_TLS)
13217 pic = gen_reg_rtx (Pmode);
13218 emit_insn (gen_set_got (pic));
13219 type = UNSPEC_GOTTPOFF;
13221 else
13223 pic = NULL;
13224 type = UNSPEC_INDNTPOFF;
13227 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
13228 off = gen_rtx_CONST (tp_mode, off);
13229 if (pic)
13230 off = gen_rtx_PLUS (tp_mode, pic, off);
13231 off = gen_const_mem (tp_mode, off);
13232 set_mem_alias_set (off, ix86_GOT_alias_set ());
13234 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13236 base = get_thread_pointer (tp_mode,
13237 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13238 off = force_reg (tp_mode, off);
13239 return gen_rtx_PLUS (tp_mode, base, off);
13241 else
13243 base = get_thread_pointer (Pmode, true);
13244 dest = gen_reg_rtx (Pmode);
13245 emit_insn (ix86_gen_sub3 (dest, base, off));
13247 break;
13249 case TLS_MODEL_LOCAL_EXEC:
13250 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
13251 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13252 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
13253 off = gen_rtx_CONST (Pmode, off);
13255 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13257 base = get_thread_pointer (Pmode,
13258 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13259 return gen_rtx_PLUS (Pmode, base, off);
13261 else
13263 base = get_thread_pointer (Pmode, true);
13264 dest = gen_reg_rtx (Pmode);
13265 emit_insn (ix86_gen_sub3 (dest, base, off));
13267 break;
13269 default:
13270 gcc_unreachable ();
13273 return dest;
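/* Typical code generated for the models handled above on x86-64
   GNU/Linux (illustrative sketches; the exact sequences depend on the
   selected options and on linker relaxation):

     initial-exec:    movq  x@gottpoff(%rip), %rax
                      movq  %fs:(%rax), %rax
     local-exec:      movq  %fs:x@tpoff, %rax
     global-dynamic:  leaq  x@tlsgd(%rip), %rdi
                      call  __tls_get_addr@PLT            */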
13276 /* Create or return the unique __imp_DECL dllimport symbol corresponding
13277 to symbol DECL if BEIMPORT is true. Otherwise create or return the
13278 unique refptr-DECL symbol corresponding to symbol DECL. */
13280 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
13281 htab_t dllimport_map;
13283 static tree
13284 get_dllimport_decl (tree decl, bool beimport)
13286 struct tree_map *h, in;
13287 void **loc;
13288 const char *name;
13289 const char *prefix;
13290 size_t namelen, prefixlen;
13291 char *imp_name;
13292 tree to;
13293 rtx rtl;
13295 if (!dllimport_map)
13296 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
13298 in.hash = htab_hash_pointer (decl);
13299 in.base.from = decl;
13300 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
13301 h = (struct tree_map *) *loc;
13302 if (h)
13303 return h->to;
13305 *loc = h = ggc_alloc_tree_map ();
13306 h->hash = in.hash;
13307 h->base.from = decl;
13308 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
13309 VAR_DECL, NULL, ptr_type_node);
13310 DECL_ARTIFICIAL (to) = 1;
13311 DECL_IGNORED_P (to) = 1;
13312 DECL_EXTERNAL (to) = 1;
13313 TREE_READONLY (to) = 1;
13315 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
13316 name = targetm.strip_name_encoding (name);
13317 if (beimport)
13318 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
13319 ? "*__imp_" : "*__imp__";
13320 else
13321 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
13322 namelen = strlen (name);
13323 prefixlen = strlen (prefix);
13324 imp_name = (char *) alloca (namelen + prefixlen + 1);
13325 memcpy (imp_name, prefix, prefixlen);
13326 memcpy (imp_name + prefixlen, name, namelen + 1);
13328 name = ggc_alloc_string (imp_name, namelen + prefixlen);
13329 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
13330 SET_SYMBOL_REF_DECL (rtl, to);
13331 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
13332 if (!beimport)
13334 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
13335 #ifdef SUB_TARGET_RECORD_STUB
13336 SUB_TARGET_RECORD_STUB (name);
13337 #endif
13340 rtl = gen_const_mem (Pmode, rtl);
13341 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
13343 SET_DECL_RTL (to, rtl);
13344 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
13346 return to;
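/* Sketch: for a dllimport'ed symbol "foo" this builds an artificial
   read-only declaration whose DECL_RTL is a constant memory reference
   through an "__imp_"-prefixed symbol (e.g. "*__imp_foo"); the refptr
   variant used for far-addressed externals uses a "refptr."-prefixed
   symbol instead.  */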
13349 /* Expand SYMBOL into its corresponding far-address symbol.
13350 WANT_REG is true if we require the result be a register. */
13352 static rtx
13353 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
13355 tree imp_decl;
13356 rtx x;
13358 gcc_assert (SYMBOL_REF_DECL (symbol));
13359 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
13361 x = DECL_RTL (imp_decl);
13362 if (want_reg)
13363 x = force_reg (Pmode, x);
13364 return x;
13367 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
13368 true if we require the result be a register. */
13370 static rtx
13371 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
13373 tree imp_decl;
13374 rtx x;
13376 gcc_assert (SYMBOL_REF_DECL (symbol));
13377 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
13379 x = DECL_RTL (imp_decl);
13380 if (want_reg)
13381 x = force_reg (Pmode, x);
13382 return x;
13385 /* Expand ADDR into its corresponding dllimport or refptr symbol. INREG
13386 is true if we require the result to be a register. */
13388 static rtx
13389 legitimize_pe_coff_symbol (rtx addr, bool inreg)
13391 if (!TARGET_PECOFF)
13392 return NULL_RTX;
13394 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13396 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
13397 return legitimize_dllimport_symbol (addr, inreg);
13398 if (GET_CODE (addr) == CONST
13399 && GET_CODE (XEXP (addr, 0)) == PLUS
13400 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
13401 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
13403 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
13404 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
13408 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
13409 return NULL_RTX;
13410 if (GET_CODE (addr) == SYMBOL_REF
13411 && !is_imported_p (addr)
13412 && SYMBOL_REF_EXTERNAL_P (addr)
13413 && SYMBOL_REF_DECL (addr))
13414 return legitimize_pe_coff_extern_decl (addr, inreg);
13416 if (GET_CODE (addr) == CONST
13417 && GET_CODE (XEXP (addr, 0)) == PLUS
13418 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
13419 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
13420 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
13421 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
13423 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
13424 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
13426 return NULL_RTX;
13429 /* Try machine-dependent ways of modifying an illegitimate address
13430 to be legitimate. If we find one, return the new, valid address.
13431 This macro is used in only one place: `memory_address' in explow.c.
13433 OLDX is the address as it was before break_out_memory_refs was called.
13434 In some cases it is useful to look at this to decide what needs to be done.
13436 It is always safe for this macro to do nothing. It exists to recognize
13437 opportunities to optimize the output.
13439 For the 80386, we handle X+REG by loading X into a register R and
13440 using R+REG. R will go in a general reg and indexing will be used.
13441 However, if REG is a broken-out memory address or multiplication,
13442 nothing needs to be done because REG can certainly go in a general reg.
13444 When -fpic is used, special handling is needed for symbolic references.
13445 See comments by legitimize_pic_address in i386.c for details. */
13447 static rtx
13448 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
13449 enum machine_mode mode)
13451 int changed = 0;
13452 unsigned log;
13454 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
13455 if (log)
13456 return legitimize_tls_address (x, (enum tls_model) log, false);
13457 if (GET_CODE (x) == CONST
13458 && GET_CODE (XEXP (x, 0)) == PLUS
13459 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13460 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
13462 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
13463 (enum tls_model) log, false);
13464 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13467 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13469 rtx tmp = legitimize_pe_coff_symbol (x, true);
13470 if (tmp)
13471 return tmp;
13474 if (flag_pic && SYMBOLIC_CONST (x))
13475 return legitimize_pic_address (x, 0);
13477 #if TARGET_MACHO
13478 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
13479 return machopic_indirect_data_reference (x, 0);
13480 #endif
13482 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
13483 if (GET_CODE (x) == ASHIFT
13484 && CONST_INT_P (XEXP (x, 1))
13485 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
13487 changed = 1;
13488 log = INTVAL (XEXP (x, 1));
13489 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
13490 GEN_INT (1 << log));
13493 if (GET_CODE (x) == PLUS)
13495 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13497 if (GET_CODE (XEXP (x, 0)) == ASHIFT
13498 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13499 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
13501 changed = 1;
13502 log = INTVAL (XEXP (XEXP (x, 0), 1));
13503 XEXP (x, 0) = gen_rtx_MULT (Pmode,
13504 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
13505 GEN_INT (1 << log));
13508 if (GET_CODE (XEXP (x, 1)) == ASHIFT
13509 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13510 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13512 changed = 1;
13513 log = INTVAL (XEXP (XEXP (x, 1), 1));
13514 XEXP (x, 1) = gen_rtx_MULT (Pmode,
13515 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13516 GEN_INT (1 << log));
13519 /* Put multiply first if it isn't already. */
13520 if (GET_CODE (XEXP (x, 1)) == MULT)
13522 rtx tmp = XEXP (x, 0);
13523 XEXP (x, 0) = XEXP (x, 1);
13524 XEXP (x, 1) = tmp;
13525 changed = 1;
13528 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13529 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
13530 created by virtual register instantiation, register elimination, and
13531 similar optimizations. */
13532 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13534 changed = 1;
13535 x = gen_rtx_PLUS (Pmode,
13536 gen_rtx_PLUS (Pmode, XEXP (x, 0),
13537 XEXP (XEXP (x, 1), 0)),
13538 XEXP (XEXP (x, 1), 1));
13541 /* Canonicalize
13542 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13543 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
13544 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13545 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13546 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13547 && CONSTANT_P (XEXP (x, 1)))
13549 rtx constant;
13550 rtx other = NULL_RTX;
13552 if (CONST_INT_P (XEXP (x, 1)))
13554 constant = XEXP (x, 1);
13555 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13557 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13559 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13560 other = XEXP (x, 1);
13562 else
13563 constant = 0;
13565 if (constant)
13567 changed = 1;
13568 x = gen_rtx_PLUS (Pmode,
13569 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13570 XEXP (XEXP (XEXP (x, 0), 1), 0)),
13571 plus_constant (Pmode, other,
13572 INTVAL (constant)));
13576 if (changed && ix86_legitimate_address_p (mode, x, false))
13577 return x;
13579 if (GET_CODE (XEXP (x, 0)) == MULT)
13581 changed = 1;
13582 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13585 if (GET_CODE (XEXP (x, 1)) == MULT)
13587 changed = 1;
13588 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13591 if (changed
13592 && REG_P (XEXP (x, 1))
13593 && REG_P (XEXP (x, 0)))
13594 return x;
13596 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13598 changed = 1;
13599 x = legitimize_pic_address (x, 0);
13602 if (changed && ix86_legitimate_address_p (mode, x, false))
13603 return x;
13605 if (REG_P (XEXP (x, 0)))
13607 rtx temp = gen_reg_rtx (Pmode);
13608 rtx val = force_operand (XEXP (x, 1), temp);
13609 if (val != temp)
13611 val = convert_to_mode (Pmode, val, 1);
13612 emit_move_insn (temp, val);
13615 XEXP (x, 1) = temp;
13616 return x;
13619 else if (REG_P (XEXP (x, 1)))
13621 rtx temp = gen_reg_rtx (Pmode);
13622 rtx val = force_operand (XEXP (x, 0), temp);
13623 if (val != temp)
13625 val = convert_to_mode (Pmode, val, 1);
13626 emit_move_insn (temp, val);
13629 XEXP (x, 0) = temp;
13630 return x;
13634 return x;
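/* A small worked example of the canonicalizations above (illustrative):
   an address of the form (plus (ashift (reg) (const_int 2)) (reg)) is
   rewritten into (plus (mult (reg) (const_int 4)) (reg)), which matches
   the scaled-index form accepted by ix86_legitimate_address_p.  */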
13637 /* Print an integer constant expression in assembler syntax. Addition
13638 and subtraction are the only arithmetic that may appear in these
13639 expressions. FILE is the stdio stream to write to, X is the rtx, and
13640 CODE is the operand print code from the output string. */
13642 static void
13643 output_pic_addr_const (FILE *file, rtx x, int code)
13645 char buf[256];
13647 switch (GET_CODE (x))
13649 case PC:
13650 gcc_assert (flag_pic);
13651 putc ('.', file);
13652 break;
13654 case SYMBOL_REF:
13655 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13656 output_addr_const (file, x);
13657 else
13659 const char *name = XSTR (x, 0);
13661 /* Mark the decl as referenced so that cgraph will
13662 output the function. */
13663 if (SYMBOL_REF_DECL (x))
13664 mark_decl_referenced (SYMBOL_REF_DECL (x));
13666 #if TARGET_MACHO
13667 if (MACHOPIC_INDIRECT
13668 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13669 name = machopic_indirection_name (x, /*stub_p=*/true);
13670 #endif
13671 assemble_name (file, name);
13673 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
13674 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13675 fputs ("@PLT", file);
13676 break;
13678 case LABEL_REF:
13679 x = XEXP (x, 0);
13680 /* FALLTHRU */
13681 case CODE_LABEL:
13682 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13683 assemble_name (asm_out_file, buf);
13684 break;
13686 case CONST_INT:
13687 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13688 break;
13690 case CONST:
13691 /* This used to output parentheses around the expression,
13692 but that does not work on the 386 (either ATT or BSD assembler). */
13693 output_pic_addr_const (file, XEXP (x, 0), code);
13694 break;
13696 case CONST_DOUBLE:
13697 if (GET_MODE (x) == VOIDmode)
13699 /* We can use %d if the number is <32 bits and positive. */
13700 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13701 fprintf (file, "0x%lx%08lx",
13702 (unsigned long) CONST_DOUBLE_HIGH (x),
13703 (unsigned long) CONST_DOUBLE_LOW (x));
13704 else
13705 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13707 else
13708 /* We can't handle floating point constants;
13709 TARGET_PRINT_OPERAND must handle them. */
13710 output_operand_lossage ("floating constant misused");
13711 break;
13713 case PLUS:
13714 /* Some assemblers need integer constants to appear first. */
13715 if (CONST_INT_P (XEXP (x, 0)))
13717 output_pic_addr_const (file, XEXP (x, 0), code);
13718 putc ('+', file);
13719 output_pic_addr_const (file, XEXP (x, 1), code);
13721 else
13723 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13724 output_pic_addr_const (file, XEXP (x, 1), code);
13725 putc ('+', file);
13726 output_pic_addr_const (file, XEXP (x, 0), code);
13728 break;
13730 case MINUS:
13731 if (!TARGET_MACHO)
13732 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13733 output_pic_addr_const (file, XEXP (x, 0), code);
13734 putc ('-', file);
13735 output_pic_addr_const (file, XEXP (x, 1), code);
13736 if (!TARGET_MACHO)
13737 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13738 break;
13740 case UNSPEC:
13741 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13743 bool f = i386_asm_output_addr_const_extra (file, x);
13744 gcc_assert (f);
13745 break;
13748 gcc_assert (XVECLEN (x, 0) == 1);
13749 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13750 switch (XINT (x, 1))
13752 case UNSPEC_GOT:
13753 fputs ("@GOT", file);
13754 break;
13755 case UNSPEC_GOTOFF:
13756 fputs ("@GOTOFF", file);
13757 break;
13758 case UNSPEC_PLTOFF:
13759 fputs ("@PLTOFF", file);
13760 break;
13761 case UNSPEC_PCREL:
13762 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13763 "(%rip)" : "[rip]", file);
13764 break;
13765 case UNSPEC_GOTPCREL:
13766 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13767 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13768 break;
13769 case UNSPEC_GOTTPOFF:
13770 /* FIXME: This might be @TPOFF in Sun ld too. */
13771 fputs ("@gottpoff", file);
13772 break;
13773 case UNSPEC_TPOFF:
13774 fputs ("@tpoff", file);
13775 break;
13776 case UNSPEC_NTPOFF:
13777 if (TARGET_64BIT)
13778 fputs ("@tpoff", file);
13779 else
13780 fputs ("@ntpoff", file);
13781 break;
13782 case UNSPEC_DTPOFF:
13783 fputs ("@dtpoff", file);
13784 break;
13785 case UNSPEC_GOTNTPOFF:
13786 if (TARGET_64BIT)
13787 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13788 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13789 else
13790 fputs ("@gotntpoff", file);
13791 break;
13792 case UNSPEC_INDNTPOFF:
13793 fputs ("@indntpoff", file);
13794 break;
13795 #if TARGET_MACHO
13796 case UNSPEC_MACHOPIC_OFFSET:
13797 putc ('-', file);
13798 machopic_output_function_base_name (file);
13799 break;
13800 #endif
13801 default:
13802 output_operand_lossage ("invalid UNSPEC as operand");
13803 break;
13805 break;
13807 default:
13808 output_operand_lossage ("invalid expression as operand");
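/* Example outputs (illustrative): an operand of the form
   (const (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF)) is printed as
   "foo@GOTOFF", while with -m64 an UNSPEC_GOTPCREL operand prints as
   "foo@GOTPCREL(%rip)" in AT&T syntax.  */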
13812 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13813 We need to emit DTP-relative relocations. */
13815 static void ATTRIBUTE_UNUSED
13816 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13818 fputs (ASM_LONG, file);
13819 output_addr_const (file, x);
13820 fputs ("@dtpoff", file);
13821 switch (size)
13823 case 4:
13824 break;
13825 case 8:
13826 fputs (", 0", file);
13827 break;
13828 default:
13829 gcc_unreachable ();
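/* E.g. assuming ASM_LONG is the ".long" directive, a 4-byte request
   emits something like
       .long   foo@dtpoff
   and an 8-byte request
       .long   foo@dtpoff, 0        */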
13833 /* Return true if X is a representation of the PIC register. This copes
13834 with calls from ix86_find_base_term, where the register might have
13835 been replaced by a cselib value. */
13837 static bool
13838 ix86_pic_register_p (rtx x)
13840 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13841 return (pic_offset_table_rtx
13842 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13843 else
13844 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13847 /* Helper function for ix86_delegitimize_address.
13848 Attempt to delegitimize TLS local-exec accesses. */
13850 static rtx
13851 ix86_delegitimize_tls_address (rtx orig_x)
13853 rtx x = orig_x, unspec;
13854 struct ix86_address addr;
13856 if (!TARGET_TLS_DIRECT_SEG_REFS)
13857 return orig_x;
13858 if (MEM_P (x))
13859 x = XEXP (x, 0);
13860 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13861 return orig_x;
13862 if (ix86_decompose_address (x, &addr) == 0
13863 || addr.seg != DEFAULT_TLS_SEG_REG
13864 || addr.disp == NULL_RTX
13865 || GET_CODE (addr.disp) != CONST)
13866 return orig_x;
13867 unspec = XEXP (addr.disp, 0);
13868 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13869 unspec = XEXP (unspec, 0);
13870 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13871 return orig_x;
13872 x = XVECEXP (unspec, 0, 0);
13873 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13874 if (unspec != XEXP (addr.disp, 0))
13875 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13876 if (addr.index)
13878 rtx idx = addr.index;
13879 if (addr.scale != 1)
13880 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13881 x = gen_rtx_PLUS (Pmode, idx, x);
13883 if (addr.base)
13884 x = gen_rtx_PLUS (Pmode, addr.base, x);
13885 if (MEM_P (orig_x))
13886 x = replace_equiv_address_nv (orig_x, x);
13887 return x;
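/* Sketch: a local-exec access such as
       (mem (plus (reg) (const (unspec [(symbol_ref "x")] UNSPEC_NTPOFF))))
   using the TLS segment register is rewritten back into a reference to
   "x" (plus any base, index and constant offset), which makes the debug
   output more readable.  */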
13890 /* In the name of slightly smaller debug output, and to cater to
13891 general assembler lossage, recognize PIC+GOTOFF and turn it back
13892 into a direct symbol reference.
13894 On Darwin, this is necessary to avoid a crash, because Darwin
13895 has a different PIC label for each routine but the DWARF debugging
13896 information is not associated with any particular routine, so it's
13897 necessary to remove references to the PIC label from RTL stored by
13898 the DWARF output code. */
13900 static rtx
13901 ix86_delegitimize_address (rtx x)
13903 rtx orig_x = delegitimize_mem_from_attrs (x);
13904 /* addend is NULL or some rtx if x is something+GOTOFF where
13905 something doesn't include the PIC register. */
13906 rtx addend = NULL_RTX;
13907 /* reg_addend is NULL or a multiple of some register. */
13908 rtx reg_addend = NULL_RTX;
13909 /* const_addend is NULL or a const_int. */
13910 rtx const_addend = NULL_RTX;
13911 /* This is the result, or NULL. */
13912 rtx result = NULL_RTX;
13914 x = orig_x;
13916 if (MEM_P (x))
13917 x = XEXP (x, 0);
13919 if (TARGET_64BIT)
13921 if (GET_CODE (x) == CONST
13922 && GET_CODE (XEXP (x, 0)) == PLUS
13923 && GET_MODE (XEXP (x, 0)) == Pmode
13924 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13925 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13926 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13928 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13929 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13930 if (MEM_P (orig_x))
13931 x = replace_equiv_address_nv (orig_x, x);
13932 return x;
13935 if (GET_CODE (x) == CONST
13936 && GET_CODE (XEXP (x, 0)) == UNSPEC
13937 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
13938 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
13939 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
13941 x = XVECEXP (XEXP (x, 0), 0, 0);
13942 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13944 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13945 GET_MODE (x), 0);
13946 if (x == NULL_RTX)
13947 return orig_x;
13949 return x;
13952 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
13953 return ix86_delegitimize_tls_address (orig_x);
13955 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
13956 and -mcmodel=medium -fpic. */
13959 if (GET_CODE (x) != PLUS
13960 || GET_CODE (XEXP (x, 1)) != CONST)
13961 return ix86_delegitimize_tls_address (orig_x);
13963 if (ix86_pic_register_p (XEXP (x, 0)))
13964 /* %ebx + GOT/GOTOFF */
13966 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13968 /* %ebx + %reg * scale + GOT/GOTOFF */
13969 reg_addend = XEXP (x, 0);
13970 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13971 reg_addend = XEXP (reg_addend, 1);
13972 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13973 reg_addend = XEXP (reg_addend, 0);
13974 else
13976 reg_addend = NULL_RTX;
13977 addend = XEXP (x, 0);
13980 else
13981 addend = XEXP (x, 0);
13983 x = XEXP (XEXP (x, 1), 0);
13984 if (GET_CODE (x) == PLUS
13985 && CONST_INT_P (XEXP (x, 1)))
13987 const_addend = XEXP (x, 1);
13988 x = XEXP (x, 0);
13991 if (GET_CODE (x) == UNSPEC
13992 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13993 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
13994 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
13995 && !MEM_P (orig_x) && !addend)))
13996 result = XVECEXP (x, 0, 0);
13998 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
13999 && !MEM_P (orig_x))
14000 result = XVECEXP (x, 0, 0);
14002 if (! result)
14003 return ix86_delegitimize_tls_address (orig_x);
14005 if (const_addend)
14006 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
14007 if (reg_addend)
14008 result = gen_rtx_PLUS (Pmode, reg_addend, result);
14009 if (addend)
14011 /* If the rest of original X doesn't involve the PIC register, add
14012 addend and subtract pic_offset_table_rtx. This can happen e.g.
14013 for code like:
14014 leal (%ebx, %ecx, 4), %ecx
14016 movl foo@GOTOFF(%ecx), %edx
14017 in which case we return (%ecx - %ebx) + foo. */
14018 if (pic_offset_table_rtx)
14019 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
14020 pic_offset_table_rtx),
14021 result);
14022 else
14023 return orig_x;
14025 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
14027 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
14028 if (result == NULL_RTX)
14029 return orig_x;
14031 return result;
14034 /* If X is a machine specific address (i.e. a symbol or label being
14035 referenced as a displacement from the GOT implemented using an
14036 UNSPEC), then return the base term. Otherwise return X. */
14039 ix86_find_base_term (rtx x)
14041 rtx term;
14043 if (TARGET_64BIT)
14045 if (GET_CODE (x) != CONST)
14046 return x;
14047 term = XEXP (x, 0);
14048 if (GET_CODE (term) == PLUS
14049 && (CONST_INT_P (XEXP (term, 1))
14050 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
14051 term = XEXP (term, 0);
14052 if (GET_CODE (term) != UNSPEC
14053 || (XINT (term, 1) != UNSPEC_GOTPCREL
14054 && XINT (term, 1) != UNSPEC_PCREL))
14055 return x;
14057 return XVECEXP (term, 0, 0);
14060 return ix86_delegitimize_address (x);
14063 static void
14064 put_condition_code (enum rtx_code code, enum machine_mode mode, bool reverse,
14065 bool fp, FILE *file)
14067 const char *suffix;
14069 if (mode == CCFPmode || mode == CCFPUmode)
14071 code = ix86_fp_compare_code_to_integer (code);
14072 mode = CCmode;
14074 if (reverse)
14075 code = reverse_condition (code);
14077 switch (code)
14079 case EQ:
14080 switch (mode)
14082 case CCAmode:
14083 suffix = "a";
14084 break;
14086 case CCCmode:
14087 suffix = "c";
14088 break;
14090 case CCOmode:
14091 suffix = "o";
14092 break;
14094 case CCSmode:
14095 suffix = "s";
14096 break;
14098 default:
14099 suffix = "e";
14101 break;
14102 case NE:
14103 switch (mode)
14105 case CCAmode:
14106 suffix = "na";
14107 break;
14109 case CCCmode:
14110 suffix = "nc";
14111 break;
14113 case CCOmode:
14114 suffix = "no";
14115 break;
14117 case CCSmode:
14118 suffix = "ns";
14119 break;
14121 default:
14122 suffix = "ne";
14124 break;
14125 case GT:
14126 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
14127 suffix = "g";
14128 break;
14129 case GTU:
14130 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
14131 Those same assemblers have the same but opposite lossage on cmov. */
14132 if (mode == CCmode)
14133 suffix = fp ? "nbe" : "a";
14134 else
14135 gcc_unreachable ();
14136 break;
14137 case LT:
14138 switch (mode)
14140 case CCNOmode:
14141 case CCGOCmode:
14142 suffix = "s";
14143 break;
14145 case CCmode:
14146 case CCGCmode:
14147 suffix = "l";
14148 break;
14150 default:
14151 gcc_unreachable ();
14153 break;
14154 case LTU:
14155 if (mode == CCmode)
14156 suffix = "b";
14157 else if (mode == CCCmode)
14158 suffix = "c";
14159 else
14160 gcc_unreachable ();
14161 break;
14162 case GE:
14163 switch (mode)
14165 case CCNOmode:
14166 case CCGOCmode:
14167 suffix = "ns";
14168 break;
14170 case CCmode:
14171 case CCGCmode:
14172 suffix = "ge";
14173 break;
14175 default:
14176 gcc_unreachable ();
14178 break;
14179 case GEU:
14180 if (mode == CCmode)
14181 suffix = fp ? "nb" : "ae";
14182 else if (mode == CCCmode)
14183 suffix = "nc";
14184 else
14185 gcc_unreachable ();
14186 break;
14187 case LE:
14188 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
14189 suffix = "le";
14190 break;
14191 case LEU:
14192 if (mode == CCmode)
14193 suffix = "be";
14194 else
14195 gcc_unreachable ();
14196 break;
14197 case UNORDERED:
14198 suffix = fp ? "u" : "p";
14199 break;
14200 case ORDERED:
14201 suffix = fp ? "nu" : "np";
14202 break;
14203 default:
14204 gcc_unreachable ();
14206 fputs (suffix, file);
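/* For instance, directly from the table above: (GTU, CCmode) yields the
   suffix "a" for an integer setcc/cmov and "nbe" for fcmov, while
   (LTU, CCCmode) yields "c".  */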
14209 /* Print the name of register X to FILE based on its machine mode and number.
14210 If CODE is 'w', pretend the mode is HImode.
14211 If CODE is 'b', pretend the mode is QImode.
14212 If CODE is 'k', pretend the mode is SImode.
14213 If CODE is 'q', pretend the mode is DImode.
14214 If CODE is 'x', pretend the mode is V4SFmode.
14215 If CODE is 't', pretend the mode is V8SFmode.
14216 If CODE is 'g', pretend the mode is V16SFmode.
14217 If CODE is 'h', pretend the reg is the 'high' byte register.
14218 If CODE is 'y', print "st(0)" instead of "st", if the reg is a stack op.
14219 If CODE is 'd', duplicate the operand for an AVX instruction.
14222 void
14223 print_reg (rtx x, int code, FILE *file)
14225 const char *reg;
14226 unsigned int regno;
14227 bool duplicated = code == 'd' && TARGET_AVX;
14229 if (ASSEMBLER_DIALECT == ASM_ATT)
14230 putc ('%', file);
14232 if (x == pc_rtx)
14234 gcc_assert (TARGET_64BIT);
14235 fputs ("rip", file);
14236 return;
14239 regno = true_regnum (x);
14240 gcc_assert (regno != ARG_POINTER_REGNUM
14241 && regno != FRAME_POINTER_REGNUM
14242 && regno != FLAGS_REG
14243 && regno != FPSR_REG
14244 && regno != FPCR_REG);
14246 if (code == 'w' || MMX_REG_P (x))
14247 code = 2;
14248 else if (code == 'b')
14249 code = 1;
14250 else if (code == 'k')
14251 code = 4;
14252 else if (code == 'q')
14253 code = 8;
14254 else if (code == 'y')
14255 code = 3;
14256 else if (code == 'h')
14257 code = 0;
14258 else if (code == 'x')
14259 code = 16;
14260 else if (code == 't')
14261 code = 32;
14262 else if (code == 'g')
14263 code = 64;
14264 else
14265 code = GET_MODE_SIZE (GET_MODE (x));
14267 /* Irritatingly, AMD extended registers use a different naming convention
14268 from the normal registers: "r%d[bwd]". */
14269 if (REX_INT_REGNO_P (regno))
14271 gcc_assert (TARGET_64BIT);
14272 putc ('r', file);
14273 fprint_ul (file, regno - FIRST_REX_INT_REG + 8);
14274 switch (code)
14276 case 0:
14277 error ("extended registers have no high halves");
14278 break;
14279 case 1:
14280 putc ('b', file);
14281 break;
14282 case 2:
14283 putc ('w', file);
14284 break;
14285 case 4:
14286 putc ('d', file);
14287 break;
14288 case 8:
14289 /* no suffix */
14290 break;
14291 default:
14292 error ("unsupported operand size for extended register");
14293 break;
14295 return;
14298 reg = NULL;
14299 switch (code)
14301 case 3:
14302 if (STACK_TOP_P (x))
14304 reg = "st(0)";
14305 break;
14307 /* FALLTHRU */
14308 case 8:
14309 case 4:
14310 case 12:
14311 if (! ANY_FP_REG_P (x) && ! ANY_BND_REG_P (x))
14312 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
14313 /* FALLTHRU */
14314 case 16:
14315 case 2:
14316 normal:
14317 reg = hi_reg_name[regno];
14318 break;
14319 case 1:
14320 if (regno >= ARRAY_SIZE (qi_reg_name))
14321 goto normal;
14322 reg = qi_reg_name[regno];
14323 break;
14324 case 0:
14325 if (regno >= ARRAY_SIZE (qi_high_reg_name))
14326 goto normal;
14327 reg = qi_high_reg_name[regno];
14328 break;
14329 case 32:
14330 if (SSE_REG_P (x))
14332 gcc_assert (!duplicated);
14333 putc ('y', file);
14334 fputs (hi_reg_name[regno] + 1, file);
14335 return;
14337 case 64:
14338 if (SSE_REG_P (x))
14340 gcc_assert (!duplicated);
14341 putc ('z', file);
14342 fputs (hi_reg_name[REGNO (x)] + 1, file);
14343 return;
14345 break;
14346 default:
14347 gcc_unreachable ();
14350 fputs (reg, file);
14351 if (duplicated)
14353 if (ASSEMBLER_DIALECT == ASM_ATT)
14354 fprintf (file, ", %%%s", reg);
14355 else
14356 fprintf (file, ", %s", reg);
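/* Examples (AT&T dialect): register AX printed with code 'b' gives
   "%al", with 'w' "%ax", with 'k' "%eax" and with 'q' "%rax"; the
   extended register R8 with code 'k' prints as "%r8d".  */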
14360 /* Locate some local-dynamic symbol still in use by this function
14361 so that we can print its name in some tls_local_dynamic_base
14362 pattern. */
14364 static int
14365 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
14367 rtx x = *px;
14369 if (GET_CODE (x) == SYMBOL_REF
14370 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
14372 cfun->machine->some_ld_name = XSTR (x, 0);
14373 return 1;
14376 return 0;
14379 static const char *
14380 get_some_local_dynamic_name (void)
14382 rtx insn;
14384 if (cfun->machine->some_ld_name)
14385 return cfun->machine->some_ld_name;
14387 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
14388 if (NONDEBUG_INSN_P (insn)
14389 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
14390 return cfun->machine->some_ld_name;
14392 return NULL;
14395 /* Meaning of CODE:
14396 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
14397 C -- print opcode suffix for set/cmov insn.
14398 c -- like C, but print reversed condition
14399 F,f -- likewise, but for floating-point.
14400 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
14401 otherwise nothing
14402 R -- print the prefix for register names.
14403 z -- print the opcode suffix for the size of the current operand.
14404 Z -- likewise, with special suffixes for x87 instructions.
14405 * -- print a star (in certain assembler syntax)
14406 A -- print an absolute memory reference.
14407 E -- print address with DImode register names if TARGET_64BIT.
14408 w -- print the operand as if it's a "word" (HImode) even if it isn't.
14409 s -- print a shift double count, followed by the assembler's argument
14410 delimiter.
14411 b -- print the QImode name of the register for the indicated operand.
14412 %b0 would print %al if operands[0] is reg 0.
14413 w -- likewise, print the HImode name of the register.
14414 k -- likewise, print the SImode name of the register.
14415 q -- likewise, print the DImode name of the register.
14416 x -- likewise, print the V4SFmode name of the register.
14417 t -- likewise, print the V8SFmode name of the register.
14418 g -- likewise, print the V16SFmode name of the register.
14419 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
14420 y -- print "st(0)" instead of "st" as a register.
14421 d -- print duplicated register operand for AVX instruction.
14422 D -- print condition for SSE cmp instruction.
14423 P -- if PIC, print an @PLT suffix.
14424 p -- print raw symbol name.
14425 X -- don't print any sort of PIC '@' suffix for a symbol.
14426 & -- print some in-use local-dynamic symbol name.
14427 H -- print a memory address offset by 8; used for sse high-parts
14428 Y -- print condition for XOP pcom* instruction.
14429 + -- print a branch hint as 'cs' or 'ds' prefix
14430 ; -- print a semicolon (after prefixes due to bug in older gas).
14431 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
14432 @ -- print a segment register of thread base pointer load
14433 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
14434 ! -- print MPX prefix for jxx/call/ret instructions if required.
14437 void
14438 ix86_print_operand (FILE *file, rtx x, int code)
14440 if (code)
14442 switch (code)
14444 case 'A':
14445 switch (ASSEMBLER_DIALECT)
14447 case ASM_ATT:
14448 putc ('*', file);
14449 break;
14451 case ASM_INTEL:
14452 /* Intel syntax. For absolute addresses, registers should not
14453 be surrounded by brackets. */
14454 if (!REG_P (x))
14456 putc ('[', file);
14457 ix86_print_operand (file, x, 0);
14458 putc (']', file);
14459 return;
14461 break;
14463 default:
14464 gcc_unreachable ();
14467 ix86_print_operand (file, x, 0);
14468 return;
14470 case 'E':
14471 /* Wrap address in an UNSPEC to declare special handling. */
14472 if (TARGET_64BIT)
14473 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
14475 output_address (x);
14476 return;
14478 case 'L':
14479 if (ASSEMBLER_DIALECT == ASM_ATT)
14480 putc ('l', file);
14481 return;
14483 case 'W':
14484 if (ASSEMBLER_DIALECT == ASM_ATT)
14485 putc ('w', file);
14486 return;
14488 case 'B':
14489 if (ASSEMBLER_DIALECT == ASM_ATT)
14490 putc ('b', file);
14491 return;
14493 case 'Q':
14494 if (ASSEMBLER_DIALECT == ASM_ATT)
14495 putc ('l', file);
14496 return;
14498 case 'S':
14499 if (ASSEMBLER_DIALECT == ASM_ATT)
14500 putc ('s', file);
14501 return;
14503 case 'T':
14504 if (ASSEMBLER_DIALECT == ASM_ATT)
14505 putc ('t', file);
14506 return;
14508 case 'O':
14509 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14510 if (ASSEMBLER_DIALECT != ASM_ATT)
14511 return;
14513 switch (GET_MODE_SIZE (GET_MODE (x)))
14515 case 2:
14516 putc ('w', file);
14517 break;
14519 case 4:
14520 putc ('l', file);
14521 break;
14523 case 8:
14524 putc ('q', file);
14525 break;
14527 default:
14528 output_operand_lossage
14529 ("invalid operand size for operand code 'O'");
14530 return;
14533 putc ('.', file);
14534 #endif
14535 return;
14537 case 'z':
14538 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14540 /* Opcodes don't get size suffixes if using Intel opcodes. */
14541 if (ASSEMBLER_DIALECT == ASM_INTEL)
14542 return;
14544 switch (GET_MODE_SIZE (GET_MODE (x)))
14546 case 1:
14547 putc ('b', file);
14548 return;
14550 case 2:
14551 putc ('w', file);
14552 return;
14554 case 4:
14555 putc ('l', file);
14556 return;
14558 case 8:
14559 putc ('q', file);
14560 return;
14562 default:
14563 output_operand_lossage
14564 ("invalid operand size for operand code 'z'");
14565 return;
14569 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14570 warning
14571 (0, "non-integer operand used with operand code 'z'");
14572 /* FALLTHRU */
14574 case 'Z':
14575 /* 387 opcodes don't get size suffixes if using Intel opcodes. */
14576 if (ASSEMBLER_DIALECT == ASM_INTEL)
14577 return;
14579 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14581 switch (GET_MODE_SIZE (GET_MODE (x)))
14583 case 2:
14584 #ifdef HAVE_AS_IX86_FILDS
14585 putc ('s', file);
14586 #endif
14587 return;
14589 case 4:
14590 putc ('l', file);
14591 return;
14593 case 8:
14594 #ifdef HAVE_AS_IX86_FILDQ
14595 putc ('q', file);
14596 #else
14597 fputs ("ll", file);
14598 #endif
14599 return;
14601 default:
14602 break;
14605 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14607 /* 387 opcodes don't get size suffixes
14608 if the operands are registers. */
14609 if (STACK_REG_P (x))
14610 return;
14612 switch (GET_MODE_SIZE (GET_MODE (x)))
14614 case 4:
14615 putc ('s', file);
14616 return;
14618 case 8:
14619 putc ('l', file);
14620 return;
14622 case 12:
14623 case 16:
14624 putc ('t', file);
14625 return;
14627 default:
14628 break;
14631 else
14633 output_operand_lossage
14634 ("invalid operand type used with operand code 'Z'");
14635 return;
14638 output_operand_lossage
14639 ("invalid operand size for operand code 'Z'");
14640 return;
14642 case 'd':
14643 case 'b':
14644 case 'w':
14645 case 'k':
14646 case 'q':
14647 case 'h':
14648 case 't':
14649 case 'g':
14650 case 'y':
14651 case 'x':
14652 case 'X':
14653 case 'P':
14654 case 'p':
14655 break;
14657 case 's':
14658 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14660 ix86_print_operand (file, x, 0);
14661 fputs (", ", file);
14663 return;
14665 case 'Y':
14666 switch (GET_CODE (x))
14668 case NE:
14669 fputs ("neq", file);
14670 break;
14671 case EQ:
14672 fputs ("eq", file);
14673 break;
14674 case GE:
14675 case GEU:
14676 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14677 break;
14678 case GT:
14679 case GTU:
14680 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14681 break;
14682 case LE:
14683 case LEU:
14684 fputs ("le", file);
14685 break;
14686 case LT:
14687 case LTU:
14688 fputs ("lt", file);
14689 break;
14690 case UNORDERED:
14691 fputs ("unord", file);
14692 break;
14693 case ORDERED:
14694 fputs ("ord", file);
14695 break;
14696 case UNEQ:
14697 fputs ("ueq", file);
14698 break;
14699 case UNGE:
14700 fputs ("nlt", file);
14701 break;
14702 case UNGT:
14703 fputs ("nle", file);
14704 break;
14705 case UNLE:
14706 fputs ("ule", file);
14707 break;
14708 case UNLT:
14709 fputs ("ult", file);
14710 break;
14711 case LTGT:
14712 fputs ("une", file);
14713 break;
14714 default:
14715 output_operand_lossage ("operand is not a condition code, "
14716 "invalid operand code 'Y'");
14717 return;
14719 return;
14721 case 'D':
14722 /* Little bit of braindamage here. The SSE compare instructions
14723 use completely different names for the comparisons than the
14724 fp conditional moves do. */
14725 switch (GET_CODE (x))
14727 case UNEQ:
14728 if (TARGET_AVX)
14730 fputs ("eq_us", file);
14731 break;
14733 case EQ:
14734 fputs ("eq", file);
14735 break;
14736 case UNLT:
14737 if (TARGET_AVX)
14739 fputs ("nge", file);
14740 break;
14742 case LT:
14743 fputs ("lt", file);
14744 break;
14745 case UNLE:
14746 if (TARGET_AVX)
14748 fputs ("ngt", file);
14749 break;
14751 case LE:
14752 fputs ("le", file);
14753 break;
14754 case UNORDERED:
14755 fputs ("unord", file);
14756 break;
14757 case LTGT:
14758 if (TARGET_AVX)
14760 fputs ("neq_oq", file);
14761 break;
14763 case NE:
14764 fputs ("neq", file);
14765 break;
14766 case GE:
14767 if (TARGET_AVX)
14769 fputs ("ge", file);
14770 break;
14772 case UNGE:
14773 fputs ("nlt", file);
14774 break;
14775 case GT:
14776 if (TARGET_AVX)
14778 fputs ("gt", file);
14779 break;
14781 case UNGT:
14782 fputs ("nle", file);
14783 break;
14784 case ORDERED:
14785 fputs ("ord", file);
14786 break;
14787 default:
14788 output_operand_lossage ("operand is not a condition code, "
14789 "invalid operand code 'D'");
14790 return;
14792 return;
14794 case 'F':
14795 case 'f':
14796 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14797 if (ASSEMBLER_DIALECT == ASM_ATT)
14798 putc ('.', file);
14799 #endif
14801 case 'C':
14802 case 'c':
14803 if (!COMPARISON_P (x))
14805 output_operand_lossage ("operand is not a condition code, "
14806 "invalid operand code '%c'", code);
14807 return;
14809 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
14810 code == 'c' || code == 'f',
14811 code == 'F' || code == 'f',
14812 file);
14813 return;
14815 case 'H':
14816 if (!offsettable_memref_p (x))
14818 output_operand_lossage ("operand is not an offsettable memory "
14819 "reference, invalid operand code 'H'");
14820 return;
14822 /* It doesn't actually matter what mode we use here, as we're
14823 only going to use this for printing. */
14824 x = adjust_address_nv (x, DImode, 8);
14825 /* Output 'qword ptr' for intel assembler dialect. */
14826 if (ASSEMBLER_DIALECT == ASM_INTEL)
14827 code = 'q';
14828 break;
14830 case 'K':
14831 gcc_assert (CONST_INT_P (x));
14833 if (INTVAL (x) & IX86_HLE_ACQUIRE)
14834 #ifdef HAVE_AS_IX86_HLE
14835 fputs ("xacquire ", file);
14836 #else
14837 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
14838 #endif
14839 else if (INTVAL (x) & IX86_HLE_RELEASE)
14840 #ifdef HAVE_AS_IX86_HLE
14841 fputs ("xrelease ", file);
14842 #else
14843 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
14844 #endif
14845 /* We do not want to print the value of the operand. */
14846 return;
14848 case '*':
14849 if (ASSEMBLER_DIALECT == ASM_ATT)
14850 putc ('*', file);
14851 return;
14853 case '&':
14855 const char *name = get_some_local_dynamic_name ();
14856 if (name == NULL)
14857 output_operand_lossage ("'%%&' used without any "
14858 "local dynamic TLS references");
14859 else
14860 assemble_name (file, name);
14861 return;
14864 case '+':
14866 rtx x;
14868 if (!optimize
14869 || optimize_function_for_size_p (cfun)
14870 || !TARGET_BRANCH_PREDICTION_HINTS)
14871 return;
14873 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14874 if (x)
14876 int pred_val = XINT (x, 0);
14878 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14879 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14881 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14882 bool cputaken
14883 = final_forward_branch_p (current_output_insn) == 0;
14885 /* Emit hints only in the case default branch prediction
14886 heuristics would fail. */
14887 if (taken != cputaken)
14889 /* We use 3e (DS) prefix for taken branches and
14890 2e (CS) prefix for not taken branches. */
14891 if (taken)
14892 fputs ("ds ; ", file);
14893 else
14894 fputs ("cs ; ", file);
14898 return;
14901 case ';':
14902 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14903 putc (';', file);
14904 #endif
14905 return;
14907 case '@':
14908 if (ASSEMBLER_DIALECT == ASM_ATT)
14909 putc ('%', file);
14911 /* The kernel uses a different segment register for performance
14912 reasons; a system call would not have to trash the userspace
14913 segment register, which would be expensive. */
14914 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14915 fputs ("fs", file);
14916 else
14917 fputs ("gs", file);
14918 return;
14920 case '~':
14921 putc (TARGET_AVX2 ? 'i' : 'f', file);
14922 return;
14924 case '^':
14925 if (TARGET_64BIT && Pmode != word_mode)
14926 fputs ("addr32 ", file);
14927 return;
14929 case '!':
14930 if (ix86_bnd_prefixed_insn_p (NULL_RTX))
14931 fputs ("bnd ", file);
14932 return;
14934 default:
14935 output_operand_lossage ("invalid operand code '%c'", code);
14939 if (REG_P (x))
14940 print_reg (x, code, file);
14942 else if (MEM_P (x))
14944 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14945 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14946 && GET_MODE (x) != BLKmode)
14948 const char * size;
14949 switch (GET_MODE_SIZE (GET_MODE (x)))
14951 case 1: size = "BYTE"; break;
14952 case 2: size = "WORD"; break;
14953 case 4: size = "DWORD"; break;
14954 case 8: size = "QWORD"; break;
14955 case 12: size = "TBYTE"; break;
14956 case 16:
14957 if (GET_MODE (x) == XFmode)
14958 size = "TBYTE";
14959 else
14960 size = "XMMWORD";
14961 break;
14962 case 32: size = "YMMWORD"; break;
14963 case 64: size = "ZMMWORD"; break;
14964 default:
14965 gcc_unreachable ();
14968 /* Check for explicit size override (codes 'b', 'w', 'k',
14969 'q' and 'x') */
14970 if (code == 'b')
14971 size = "BYTE";
14972 else if (code == 'w')
14973 size = "WORD";
14974 else if (code == 'k')
14975 size = "DWORD";
14976 else if (code == 'q')
14977 size = "QWORD";
14978 else if (code == 'x')
14979 size = "XMMWORD";
14981 fputs (size, file);
14982 fputs (" PTR ", file);
14985 x = XEXP (x, 0);
14986 /* Avoid (%rip) for call operands. */
14987 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14988 && !CONST_INT_P (x))
14989 output_addr_const (file, x);
14990 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14991 output_operand_lossage ("invalid constraints for operand");
14992 else
14993 output_address (x);
14996 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14998 REAL_VALUE_TYPE r;
14999 long l;
15001 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
15002 REAL_VALUE_TO_TARGET_SINGLE (r, l);
15004 if (ASSEMBLER_DIALECT == ASM_ATT)
15005 putc ('$', file);
15006 /* Sign extend 32bit SFmode immediate to 8 bytes. */
15007 if (code == 'q')
15008 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
15009 (unsigned long long) (int) l);
15010 else
15011 fprintf (file, "0x%08x", (unsigned int) l);
15014 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
15016 REAL_VALUE_TYPE r;
15017 long l[2];
15019 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
15020 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
15022 if (ASSEMBLER_DIALECT == ASM_ATT)
15023 putc ('$', file);
15024 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
15027 /* These float cases don't actually occur as immediate operands. */
15028 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
15030 char dstr[30];
15032 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
15033 fputs (dstr, file);
15036 else
15038 /* We have patterns that allow zero sets of memory, for instance.
15039 In 64-bit mode, we should probably support all 8-byte vectors,
15040 since we can in fact encode that into an immediate. */
15041 if (GET_CODE (x) == CONST_VECTOR)
15043 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
15044 x = const0_rtx;
15047 if (code != 'P' && code != 'p')
15049 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
15051 if (ASSEMBLER_DIALECT == ASM_ATT)
15052 putc ('$', file);
15054 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
15055 || GET_CODE (x) == LABEL_REF)
15057 if (ASSEMBLER_DIALECT == ASM_ATT)
15058 putc ('$', file);
15059 else
15060 fputs ("OFFSET FLAT:", file);
15063 if (CONST_INT_P (x))
15064 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
15065 else if (flag_pic || MACHOPIC_INDIRECT)
15066 output_pic_addr_const (file, x, code);
15067 else
15068 output_addr_const (file, x);
15072 static bool
15073 ix86_print_operand_punct_valid_p (unsigned char code)
15075 return (code == '@' || code == '*' || code == '+' || code == '&'
15076 || code == ';' || code == '~' || code == '^' || code == '!');
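/* Usage note (illustrative, assuming typical i386.md templates): the
   punctuation codes above are written directly in output templates,
   e.g. '%;' right after a "lock" or "rep" prefix so that a ';' separator
   is emitted only for assemblers that need one, and '%^' in front of an
   address so that "addr32 " is emitted for x32 (TARGET_64BIT with
   Pmode != word_mode).  */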
15079 /* Print a memory operand whose address is ADDR. */
15081 static void
15082 ix86_print_operand_address (FILE *file, rtx addr)
15084 struct ix86_address parts;
15085 rtx base, index, disp;
15086 int scale;
15087 int ok;
15088 bool vsib = false;
15089 int code = 0;
15091 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
15093 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
15094 gcc_assert (parts.index == NULL_RTX);
15095 parts.index = XVECEXP (addr, 0, 1);
15096 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
15097 addr = XVECEXP (addr, 0, 0);
15098 vsib = true;
15100 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
15102 gcc_assert (TARGET_64BIT);
15103 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
15104 code = 'q';
15106 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
15108 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
15109 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
15110 if (parts.base != NULL_RTX)
15112 parts.index = parts.base;
15113 parts.scale = 1;
15115 parts.base = XVECEXP (addr, 0, 0);
15116 addr = XVECEXP (addr, 0, 0);
15118 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
15120 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
15121 gcc_assert (parts.index == NULL_RTX);
15122 parts.index = XVECEXP (addr, 0, 1);
15123 addr = XVECEXP (addr, 0, 0);
15125 else
15126 ok = ix86_decompose_address (addr, &parts);
15128 gcc_assert (ok);
15130 base = parts.base;
15131 index = parts.index;
15132 disp = parts.disp;
15133 scale = parts.scale;
15135 switch (parts.seg)
15137 case SEG_DEFAULT:
15138 break;
15139 case SEG_FS:
15140 case SEG_GS:
15141 if (ASSEMBLER_DIALECT == ASM_ATT)
15142 putc ('%', file);
15143 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
15144 break;
15145 default:
15146 gcc_unreachable ();
15149 /* Use one byte shorter RIP relative addressing for 64bit mode. */
15150 if (TARGET_64BIT && !base && !index)
15152 rtx symbol = disp;
15154 if (GET_CODE (disp) == CONST
15155 && GET_CODE (XEXP (disp, 0)) == PLUS
15156 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
15157 symbol = XEXP (XEXP (disp, 0), 0);
15159 if (GET_CODE (symbol) == LABEL_REF
15160 || (GET_CODE (symbol) == SYMBOL_REF
15161 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
15162 base = pc_rtx;
15164 if (!base && !index)
15166 /* A displacement-only address requires special attention. */
15168 if (CONST_INT_P (disp))
15170 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
15171 fputs ("ds:", file);
15172 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
15174 else if (flag_pic)
15175 output_pic_addr_const (file, disp, 0);
15176 else
15177 output_addr_const (file, disp);
15179 else
15181 /* Print SImode register names to force addr32 prefix. */
15182 if (SImode_address_operand (addr, VOIDmode))
15184 #ifdef ENABLE_CHECKING
15185 gcc_assert (TARGET_64BIT);
15186 switch (GET_CODE (addr))
15188 case SUBREG:
15189 gcc_assert (GET_MODE (addr) == SImode);
15190 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
15191 break;
15192 case ZERO_EXTEND:
15193 case AND:
15194 gcc_assert (GET_MODE (addr) == DImode);
15195 break;
15196 default:
15197 gcc_unreachable ();
15199 #endif
15200 gcc_assert (!code);
15201 code = 'k';
15203 else if (code == 0
15204 && TARGET_X32
15205 && disp
15206 && CONST_INT_P (disp)
15207 && INTVAL (disp) < -16*1024*1024)
15209 /* X32 runs in 64-bit mode, where displacement, DISP, in
15210 address DISP(%r64), is encoded as 32-bit immediate sign-
15211 extended from 32-bit to 64-bit. For -0x40000300(%r64),
15212 address is %r64 + 0xffffffffbffffd00. When %r64 <
15213 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
15214 which is invalid for x32. The correct address is %r64
15215 - 0x40000300 == 0xf7ffdd64. To properly encode
15216 -0x40000300(%r64) for x32, we zero-extend negative
15217 displacement by forcing addr32 prefix which truncates
15218 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
15219 zero-extend all negative displacements, including -1(%rsp).
15220 However, for small negative displacements, sign-extension
15221 won't cause overflow. We only zero-extend negative
15222 displacements if they are < -16*1024*1024, which is also used
15223 to check legitimate address displacements for PIC. */
15224 code = 'k';
15227 if (ASSEMBLER_DIALECT == ASM_ATT)
15229 if (disp)
15231 if (flag_pic)
15232 output_pic_addr_const (file, disp, 0);
15233 else if (GET_CODE (disp) == LABEL_REF)
15234 output_asm_label (disp);
15235 else
15236 output_addr_const (file, disp);
15239 putc ('(', file);
15240 if (base)
15241 print_reg (base, code, file);
15242 if (index)
15244 putc (',', file);
15245 print_reg (index, vsib ? 0 : code, file);
15246 if (scale != 1 || vsib)
15247 fprintf (file, ",%d", scale);
15249 putc (')', file);
15251 else
15253 rtx offset = NULL_RTX;
15255 if (disp)
15257 /* Pull out the offset of a symbol; print any symbol itself. */
15258 if (GET_CODE (disp) == CONST
15259 && GET_CODE (XEXP (disp, 0)) == PLUS
15260 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
15262 offset = XEXP (XEXP (disp, 0), 1);
15263 disp = gen_rtx_CONST (VOIDmode,
15264 XEXP (XEXP (disp, 0), 0));
15267 if (flag_pic)
15268 output_pic_addr_const (file, disp, 0);
15269 else if (GET_CODE (disp) == LABEL_REF)
15270 output_asm_label (disp);
15271 else if (CONST_INT_P (disp))
15272 offset = disp;
15273 else
15274 output_addr_const (file, disp);
15277 putc ('[', file);
15278 if (base)
15280 print_reg (base, code, file);
15281 if (offset)
15283 if (INTVAL (offset) >= 0)
15284 putc ('+', file);
15285 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
15288 else if (offset)
15289 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
15290 else
15291 putc ('0', file);
15293 if (index)
15295 putc ('+', file);
15296 print_reg (index, vsib ? 0 : code, file);
15297 if (scale != 1 || vsib)
15298 fprintf (file, "*%d", scale);
15300 putc (']', file);
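/* Worked example (illustrative): for an address with base %ebp, index
   %ebx, scale 4 and displacement -8, the AT&T branch above prints
   "-8(%ebp,%ebx,4)" while the Intel branch prints "[ebp-8+ebx*4]".  */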
15305 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
15307 static bool
15308 i386_asm_output_addr_const_extra (FILE *file, rtx x)
15310 rtx op;
15312 if (GET_CODE (x) != UNSPEC)
15313 return false;
15315 op = XVECEXP (x, 0, 0);
15316 switch (XINT (x, 1))
15318 case UNSPEC_GOTTPOFF:
15319 output_addr_const (file, op);
15320 /* FIXME: This might be @TPOFF in Sun ld. */
15321 fputs ("@gottpoff", file);
15322 break;
15323 case UNSPEC_TPOFF:
15324 output_addr_const (file, op);
15325 fputs ("@tpoff", file);
15326 break;
15327 case UNSPEC_NTPOFF:
15328 output_addr_const (file, op);
15329 if (TARGET_64BIT)
15330 fputs ("@tpoff", file);
15331 else
15332 fputs ("@ntpoff", file);
15333 break;
15334 case UNSPEC_DTPOFF:
15335 output_addr_const (file, op);
15336 fputs ("@dtpoff", file);
15337 break;
15338 case UNSPEC_GOTNTPOFF:
15339 output_addr_const (file, op);
15340 if (TARGET_64BIT)
15341 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
15342 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
15343 else
15344 fputs ("@gotntpoff", file);
15345 break;
15346 case UNSPEC_INDNTPOFF:
15347 output_addr_const (file, op);
15348 fputs ("@indntpoff", file);
15349 break;
15350 #if TARGET_MACHO
15351 case UNSPEC_MACHOPIC_OFFSET:
15352 output_addr_const (file, op);
15353 putc ('-', file);
15354 machopic_output_function_base_name (file);
15355 break;
15356 #endif
15358 case UNSPEC_STACK_CHECK:
15360 int offset;
15362 gcc_assert (flag_split_stack);
15364 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
15365 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
15366 #else
15367 gcc_unreachable ();
15368 #endif
15370 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
15372 break;
15374 default:
15375 return false;
15378 return true;
15381 /* Split one or more double-mode RTL references into pairs of half-mode
15382 references. The RTL can be REG, offsettable MEM, integer constant, or
15383 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
15384 split and "num" is its length. lo_half and hi_half are output arrays
15385 that parallel "operands". */
15387 void
15388 split_double_mode (enum machine_mode mode, rtx operands[],
15389 int num, rtx lo_half[], rtx hi_half[])
15391 enum machine_mode half_mode;
15392 unsigned int byte;
15394 switch (mode)
15396 case TImode:
15397 half_mode = DImode;
15398 break;
15399 case DImode:
15400 half_mode = SImode;
15401 break;
15402 default:
15403 gcc_unreachable ();
15406 byte = GET_MODE_SIZE (half_mode);
15408 while (num--)
15410 rtx op = operands[num];
15412 /* simplify_subreg refuses to split volatile memory addresses,
15413 but we still have to handle them. */
15414 if (MEM_P (op))
15416 lo_half[num] = adjust_address (op, half_mode, 0);
15417 hi_half[num] = adjust_address (op, half_mode, byte);
15419 else
15421 lo_half[num] = simplify_gen_subreg (half_mode, op,
15422 GET_MODE (op) == VOIDmode
15423 ? mode : GET_MODE (op), 0);
15424 hi_half[num] = simplify_gen_subreg (half_mode, op,
15425 GET_MODE (op) == VOIDmode
15426 ? mode : GET_MODE (op), byte);
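/* Example (illustrative): splitting a DImode pseudo on a 32-bit target
   produces two SImode subregs at byte offsets 0 and 4, while a DImode
   MEM is split with adjust_address into MEM+0 and MEM+4, which also
   copes with the volatile case mentioned above.  */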
15431 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
15432 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
15433 is the expression of the binary operation. The output may either be
15434 emitted here, or returned to the caller, like all output_* functions.
15436 There is no guarantee that the operands are the same mode, as they
15437 might be within FLOAT or FLOAT_EXTEND expressions. */
15439 #ifndef SYSV386_COMPAT
15440 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
15441 wants to fix the assemblers because that causes incompatibility
15442 with gcc. No-one wants to fix gcc because that causes
15443 incompatibility with assemblers... You can use the option of
15444 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
15445 #define SYSV386_COMPAT 1
15446 #endif
15448 const char *
15449 output_387_binary_op (rtx insn, rtx *operands)
15451 static char buf[40];
15452 const char *p;
15453 const char *ssep;
15454 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
15456 #ifdef ENABLE_CHECKING
15457 /* Even if we do not want to check the inputs, this documents the input
15458 constraints, which helps in understanding the following code. */
15459 if (STACK_REG_P (operands[0])
15460 && ((REG_P (operands[1])
15461 && REGNO (operands[0]) == REGNO (operands[1])
15462 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
15463 || (REG_P (operands[2])
15464 && REGNO (operands[0]) == REGNO (operands[2])
15465 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
15466 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
15467 ; /* ok */
15468 else
15469 gcc_assert (is_sse);
15470 #endif
15472 switch (GET_CODE (operands[3]))
15474 case PLUS:
15475 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15476 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15477 p = "fiadd";
15478 else
15479 p = "fadd";
15480 ssep = "vadd";
15481 break;
15483 case MINUS:
15484 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15485 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15486 p = "fisub";
15487 else
15488 p = "fsub";
15489 ssep = "vsub";
15490 break;
15492 case MULT:
15493 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15494 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15495 p = "fimul";
15496 else
15497 p = "fmul";
15498 ssep = "vmul";
15499 break;
15501 case DIV:
15502 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15503 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15504 p = "fidiv";
15505 else
15506 p = "fdiv";
15507 ssep = "vdiv";
15508 break;
15510 default:
15511 gcc_unreachable ();
15514 if (is_sse)
15516 if (TARGET_AVX)
15518 strcpy (buf, ssep);
15519 if (GET_MODE (operands[0]) == SFmode)
15520 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
15521 else
15522 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
15524 else
15526 strcpy (buf, ssep + 1);
15527 if (GET_MODE (operands[0]) == SFmode)
15528 strcat (buf, "ss\t{%2, %0|%0, %2}");
15529 else
15530 strcat (buf, "sd\t{%2, %0|%0, %2}");
15532 return buf;
15534 strcpy (buf, p);
15536 switch (GET_CODE (operands[3]))
15538 case MULT:
15539 case PLUS:
15540 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
15542 rtx temp = operands[2];
15543 operands[2] = operands[1];
15544 operands[1] = temp;
15547 /* We know operands[0] == operands[1]. */
15549 if (MEM_P (operands[2]))
15551 p = "%Z2\t%2";
15552 break;
15555 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15557 if (STACK_TOP_P (operands[0]))
15558 /* How is it that we are storing to a dead operand[2]?
15559 Well, presumably operands[1] is dead too. We can't
15560 store the result to st(0) as st(0) gets popped on this
15561 instruction. Instead store to operands[2] (which I
15562 think has to be st(1)). st(1) will be popped later.
15563 gcc <= 2.8.1 didn't have this check and generated
15564 assembly code that the Unixware assembler rejected. */
15565 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15566 else
15567 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15568 break;
15571 if (STACK_TOP_P (operands[0]))
15572 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15573 else
15574 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15575 break;
15577 case MINUS:
15578 case DIV:
15579 if (MEM_P (operands[1]))
15581 p = "r%Z1\t%1";
15582 break;
15585 if (MEM_P (operands[2]))
15587 p = "%Z2\t%2";
15588 break;
15591 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15593 #if SYSV386_COMPAT
15594 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15595 derived assemblers, confusingly reverse the direction of
15596 the operation for fsub{r} and fdiv{r} when the
15597 destination register is not st(0). The Intel assembler
15598 doesn't have this brain damage. Read !SYSV386_COMPAT to
15599 figure out what the hardware really does. */
15600 if (STACK_TOP_P (operands[0]))
15601 p = "{p\t%0, %2|rp\t%2, %0}";
15602 else
15603 p = "{rp\t%2, %0|p\t%0, %2}";
15604 #else
15605 if (STACK_TOP_P (operands[0]))
15606 /* As above for fmul/fadd, we can't store to st(0). */
15607 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15608 else
15609 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15610 #endif
15611 break;
15614 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15616 #if SYSV386_COMPAT
15617 if (STACK_TOP_P (operands[0]))
15618 p = "{rp\t%0, %1|p\t%1, %0}";
15619 else
15620 p = "{p\t%1, %0|rp\t%0, %1}";
15621 #else
15622 if (STACK_TOP_P (operands[0]))
15623 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
15624 else
15625 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
15626 #endif
15627 break;
15630 if (STACK_TOP_P (operands[0]))
15632 if (STACK_TOP_P (operands[1]))
15633 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15634 else
15635 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15636 break;
15638 else if (STACK_TOP_P (operands[1]))
15640 #if SYSV386_COMPAT
15641 p = "{\t%1, %0|r\t%0, %1}";
15642 #else
15643 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15644 #endif
15646 else
15648 #if SYSV386_COMPAT
15649 p = "{r\t%2, %0|\t%0, %2}";
15650 #else
15651 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15652 #endif
15654 break;
15656 default:
15657 gcc_unreachable ();
15660 strcat (buf, p);
15661 return buf;
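/* Examples of the returned templates (illustrative): an SFmode PLUS
   yields "vaddss\t{%2, %1, %0|%0, %1, %2}" for AVX,
   "addss\t{%2, %0|%0, %2}" for plain SSE, and "fadd" followed by one of
   the suffix strings chosen above for the 387 path.  */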
15664 /* Check if a 256bit AVX register is referenced inside of EXP. */
15666 static int
15667 ix86_check_avx256_register (rtx *pexp, void *data ATTRIBUTE_UNUSED)
15669 rtx exp = *pexp;
15671 if (GET_CODE (exp) == SUBREG)
15672 exp = SUBREG_REG (exp);
15674 if (REG_P (exp)
15675 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)))
15676 return 1;
15678 return 0;
15681 /* Return needed mode for entity in optimize_mode_switching pass. */
15683 static int
15684 ix86_avx_u128_mode_needed (rtx insn)
15686 if (CALL_P (insn))
15688 rtx link;
15690 /* Needed mode is set to AVX_U128_CLEAN if there are
15691 no 256bit modes used in function arguments. */
15692 for (link = CALL_INSN_FUNCTION_USAGE (insn);
15693 link;
15694 link = XEXP (link, 1))
15696 if (GET_CODE (XEXP (link, 0)) == USE)
15698 rtx arg = XEXP (XEXP (link, 0), 0);
15700 if (ix86_check_avx256_register (&arg, NULL))
15701 return AVX_U128_ANY;
15705 return AVX_U128_CLEAN;
15708 /* Require DIRTY mode if a 256bit AVX register is referenced. Hardware
15709 changes state only when a 256bit register is written to, but we need
15710 to prevent the compiler from moving the optimal insertion point above
15711 an eventual read from a 256bit register. */
15712 if (for_each_rtx (&PATTERN (insn), ix86_check_avx256_register, NULL))
15713 return AVX_U128_DIRTY;
15715 return AVX_U128_ANY;
15718 /* Return mode that i387 must be switched into
15719 prior to the execution of insn. */
15721 static int
15722 ix86_i387_mode_needed (int entity, rtx insn)
15724 enum attr_i387_cw mode;
15726 /* The mode UNINITIALIZED is used to store the control word after a
15727 function call or ASM pattern. The mode ANY specifies that the function
15728 has no requirements on the control word and makes no changes to the
15729 bits we are interested in. */
15731 if (CALL_P (insn)
15732 || (NONJUMP_INSN_P (insn)
15733 && (asm_noperands (PATTERN (insn)) >= 0
15734 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15735 return I387_CW_UNINITIALIZED;
15737 if (recog_memoized (insn) < 0)
15738 return I387_CW_ANY;
15740 mode = get_attr_i387_cw (insn);
15742 switch (entity)
15744 case I387_TRUNC:
15745 if (mode == I387_CW_TRUNC)
15746 return mode;
15747 break;
15749 case I387_FLOOR:
15750 if (mode == I387_CW_FLOOR)
15751 return mode;
15752 break;
15754 case I387_CEIL:
15755 if (mode == I387_CW_CEIL)
15756 return mode;
15757 break;
15759 case I387_MASK_PM:
15760 if (mode == I387_CW_MASK_PM)
15761 return mode;
15762 break;
15764 default:
15765 gcc_unreachable ();
15768 return I387_CW_ANY;
15771 /* Return mode that entity must be switched into
15772 prior to the execution of insn. */
15775 ix86_mode_needed (int entity, rtx insn)
15777 switch (entity)
15779 case AVX_U128:
15780 return ix86_avx_u128_mode_needed (insn);
15781 case I387_TRUNC:
15782 case I387_FLOOR:
15783 case I387_CEIL:
15784 case I387_MASK_PM:
15785 return ix86_i387_mode_needed (entity, insn);
15786 default:
15787 gcc_unreachable ();
15789 return 0;
15792 /* Check if a 256bit AVX register is referenced in stores. */
15794 static void
15795 ix86_check_avx256_stores (rtx dest, const_rtx set ATTRIBUTE_UNUSED, void *data)
15797 if (ix86_check_avx256_register (&dest, NULL))
15799 bool *used = (bool *) data;
15800 *used = true;
15804 /* Calculate mode of upper 128bit AVX registers after the insn. */
15806 static int
15807 ix86_avx_u128_mode_after (int mode, rtx insn)
15809 rtx pat = PATTERN (insn);
15811 if (vzeroupper_operation (pat, VOIDmode)
15812 || vzeroall_operation (pat, VOIDmode))
15813 return AVX_U128_CLEAN;
15815 /* We know that the state is clean after a CALL insn if no 256bit
15816 register is used for the function return value. */
15817 if (CALL_P (insn))
15819 bool avx_reg256_found = false;
15820 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
15821 if (!avx_reg256_found)
15822 return AVX_U128_CLEAN;
15825 /* Otherwise, return current mode. Remember that if insn
15826 references AVX 256bit registers, the mode was already changed
15827 to DIRTY from MODE_NEEDED. */
15828 return mode;
15831 /* Return the mode that an insn results in. */
15834 ix86_mode_after (int entity, int mode, rtx insn)
15836 switch (entity)
15838 case AVX_U128:
15839 return ix86_avx_u128_mode_after (mode, insn);
15840 case I387_TRUNC:
15841 case I387_FLOOR:
15842 case I387_CEIL:
15843 case I387_MASK_PM:
15844 return mode;
15845 default:
15846 gcc_unreachable ();
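/* Compute the AVX upper-128bit state assumed on entry to the current
   function (helper for ix86_mode_entry below).  */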
15850 static int
15851 ix86_avx_u128_mode_entry (void)
15853 tree arg;
15855 /* Entry mode is set to AVX_U128_DIRTY if there are
15856 256bit modes used in function arguments. */
15857 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
15858 arg = TREE_CHAIN (arg))
15860 rtx incoming = DECL_INCOMING_RTL (arg);
15862 if (incoming && ix86_check_avx256_register (&incoming, NULL))
15863 return AVX_U128_DIRTY;
15866 return AVX_U128_CLEAN;
15869 /* Return a mode that ENTITY is assumed to be
15870 switched to at function entry. */
15873 ix86_mode_entry (int entity)
15875 switch (entity)
15877 case AVX_U128:
15878 return ix86_avx_u128_mode_entry ();
15879 case I387_TRUNC:
15880 case I387_FLOOR:
15881 case I387_CEIL:
15882 case I387_MASK_PM:
15883 return I387_CW_ANY;
15884 default:
15885 gcc_unreachable ();
15889 static int
15890 ix86_avx_u128_mode_exit (void)
15892 rtx reg = crtl->return_rtx;
15894 /* Exit mode is set to AVX_U128_DIRTY if a 256bit mode
15895 is used in the function return register. */
15896 if (reg && ix86_check_avx256_register (&reg, NULL))
15897 return AVX_U128_DIRTY;
15899 return AVX_U128_CLEAN;
15902 /* Return a mode that ENTITY is assumed to be
15903 switched to at function exit. */
15906 ix86_mode_exit (int entity)
15908 switch (entity)
15910 case AVX_U128:
15911 return ix86_avx_u128_mode_exit ();
15912 case I387_TRUNC:
15913 case I387_FLOOR:
15914 case I387_CEIL:
15915 case I387_MASK_PM:
15916 return I387_CW_ANY;
15917 default:
15918 gcc_unreachable ();
15922 /* Output code to initialize control word copies used by trunc?f?i and
15923 rounding patterns. CURRENT_MODE is set to current control word,
15924 while NEW_MODE is set to new control word. */
15926 static void
15927 emit_i387_cw_initialization (int mode)
15929 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15930 rtx new_mode;
15932 enum ix86_stack_slot slot;
15934 rtx reg = gen_reg_rtx (HImode);
15936 emit_insn (gen_x86_fnstcw_1 (stored_mode));
15937 emit_move_insn (reg, copy_rtx (stored_mode));
15939 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15940 || optimize_insn_for_size_p ())
15942 switch (mode)
15944 case I387_CW_TRUNC:
15945 /* round toward zero (truncate) */
15946 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15947 slot = SLOT_CW_TRUNC;
15948 break;
15950 case I387_CW_FLOOR:
15951 /* round down toward -oo */
15952 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15953 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15954 slot = SLOT_CW_FLOOR;
15955 break;
15957 case I387_CW_CEIL:
15958 /* round up toward +oo */
15959 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15960 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15961 slot = SLOT_CW_CEIL;
15962 break;
15964 case I387_CW_MASK_PM:
15965 /* mask precision exception for nearbyint() */
15966 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15967 slot = SLOT_CW_MASK_PM;
15968 break;
15970 default:
15971 gcc_unreachable ();
15974 else
15976 switch (mode)
15978 case I387_CW_TRUNC:
15979 /* round toward zero (truncate) */
15980 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15981 slot = SLOT_CW_TRUNC;
15982 break;
15984 case I387_CW_FLOOR:
15985 /* round down toward -oo */
15986 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15987 slot = SLOT_CW_FLOOR;
15988 break;
15990 case I387_CW_CEIL:
15991 /* round up toward +oo */
15992 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15993 slot = SLOT_CW_CEIL;
15994 break;
15996 case I387_CW_MASK_PM:
15997 /* mask precision exception for nearbyint() */
15998 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15999 slot = SLOT_CW_MASK_PM;
16000 break;
16002 default:
16003 gcc_unreachable ();
16007 gcc_assert (slot < MAX_386_STACK_LOCALS);
16009 new_mode = assign_386_stack_local (HImode, slot);
16010 emit_move_insn (new_mode, reg);
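/* Background note (standard x87 control word layout, for clarity): bits
   10-11 form the rounding-control field, so the masks used above mean
   0x0c00 = truncate, 0x0400 = round down, 0x0800 = round up, and bit 5
   (0x0020) masks the precision exception (used for nearbyint()).  */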
16013 /* Emit vzeroupper. */
16015 void
16016 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
16018 int i;
16020 /* Cancel automatic vzeroupper insertion if there are
16021 live call-saved SSE registers at the insertion point. */
16023 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
16024 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
16025 return;
16027 if (TARGET_64BIT)
16028 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
16029 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
16030 return;
16032 emit_insn (gen_avx_vzeroupper ());
16035 /* Generate one or more insns to set ENTITY to MODE. */
16037 void
16038 ix86_emit_mode_set (int entity, int mode, HARD_REG_SET regs_live)
16040 switch (entity)
16042 case AVX_U128:
16043 if (mode == AVX_U128_CLEAN)
16044 ix86_avx_emit_vzeroupper (regs_live);
16045 break;
16046 case I387_TRUNC:
16047 case I387_FLOOR:
16048 case I387_CEIL:
16049 case I387_MASK_PM:
16050 if (mode != I387_CW_ANY
16051 && mode != I387_CW_UNINITIALIZED)
16052 emit_i387_cw_initialization (mode);
16053 break;
16054 default:
16055 gcc_unreachable ();
16059 /* Output code for INSN to convert a float to a signed int. OPERANDS
16060 are the insn operands. The output may be [HSD]Imode and the input
16061 operand may be [SDX]Fmode. */
16063 const char *
16064 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
16066 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
16067 int dimode_p = GET_MODE (operands[0]) == DImode;
16068 int round_mode = get_attr_i387_cw (insn);
16070 /* Jump through a hoop or two for DImode, since the hardware has no
16071 non-popping instruction. We used to do this a different way, but
16072 that was somewhat fragile and broke with post-reload splitters. */
16073 if ((dimode_p || fisttp) && !stack_top_dies)
16074 output_asm_insn ("fld\t%y1", operands);
16076 gcc_assert (STACK_TOP_P (operands[1]));
16077 gcc_assert (MEM_P (operands[0]));
16078 gcc_assert (GET_MODE (operands[1]) != TFmode);
16080 if (fisttp)
16081 output_asm_insn ("fisttp%Z0\t%0", operands);
16082 else
16084 if (round_mode != I387_CW_ANY)
16085 output_asm_insn ("fldcw\t%3", operands);
16086 if (stack_top_dies || dimode_p)
16087 output_asm_insn ("fistp%Z0\t%0", operands);
16088 else
16089 output_asm_insn ("fist%Z0\t%0", operands);
16090 if (round_mode != I387_CW_ANY)
16091 output_asm_insn ("fldcw\t%2", operands);
16094 return "";
16097 /* Output code for x87 ffreep insn. The OPNO argument, which may only
16098 have the values zero or one, indicates the ffreep insn's operand
16099 from the OPERANDS array. */
16101 static const char *
16102 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
16104 if (TARGET_USE_FFREEP)
16105 #ifdef HAVE_AS_IX86_FFREEP
16106 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
16107 #else
16109 static char retval[32];
16110 int regno = REGNO (operands[opno]);
16112 gcc_assert (STACK_REGNO_P (regno));
16114 regno -= FIRST_STACK_REG;
16116 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
16117 return retval;
16119 #endif
16121 return opno ? "fstp\t%y1" : "fstp\t%y0";
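/* Note (illustrative): the ASM_SHORT 0xc0df .. 0xc7df words emitted in
   the fallback above are the little-endian encodings of the two-byte
   "ffreep %st(i)" opcode DF C0+i, for assemblers lacking the mnemonic.  */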
16125 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
16126 should be used. UNORDERED_P is true when fucom should be used. */
16128 const char *
16129 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
16131 int stack_top_dies;
16132 rtx cmp_op0, cmp_op1;
16133 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
16135 if (eflags_p)
16137 cmp_op0 = operands[0];
16138 cmp_op1 = operands[1];
16140 else
16142 cmp_op0 = operands[1];
16143 cmp_op1 = operands[2];
16146 if (is_sse)
16148 if (GET_MODE (operands[0]) == SFmode)
16149 if (unordered_p)
16150 return "%vucomiss\t{%1, %0|%0, %1}";
16151 else
16152 return "%vcomiss\t{%1, %0|%0, %1}";
16153 else
16154 if (unordered_p)
16155 return "%vucomisd\t{%1, %0|%0, %1}";
16156 else
16157 return "%vcomisd\t{%1, %0|%0, %1}";
16160 gcc_assert (STACK_TOP_P (cmp_op0));
16162 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
16164 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
16166 if (stack_top_dies)
16168 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
16169 return output_387_ffreep (operands, 1);
16171 else
16172 return "ftst\n\tfnstsw\t%0";
16175 if (STACK_REG_P (cmp_op1)
16176 && stack_top_dies
16177 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
16178 && REGNO (cmp_op1) != FIRST_STACK_REG)
16180 /* If both the top of the 387 stack and the other operand, itself
16181 a stack register, die in this insn, then this must be a
16182 `fcompp' float compare. */
16184 if (eflags_p)
16186 /* There is no double popping fcomi variant. Fortunately,
16187 eflags is immune from the fstp's cc clobbering. */
16188 if (unordered_p)
16189 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
16190 else
16191 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
16192 return output_387_ffreep (operands, 0);
16194 else
16196 if (unordered_p)
16197 return "fucompp\n\tfnstsw\t%0";
16198 else
16199 return "fcompp\n\tfnstsw\t%0";
16202 else
16204 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
16206 static const char * const alt[16] =
16208 "fcom%Z2\t%y2\n\tfnstsw\t%0",
16209 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
16210 "fucom%Z2\t%y2\n\tfnstsw\t%0",
16211 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
16213 "ficom%Z2\t%y2\n\tfnstsw\t%0",
16214 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
16215 NULL,
16216 NULL,
16218 "fcomi\t{%y1, %0|%0, %y1}",
16219 "fcomip\t{%y1, %0|%0, %y1}",
16220 "fucomi\t{%y1, %0|%0, %y1}",
16221 "fucomip\t{%y1, %0|%0, %y1}",
16223 NULL,
16224 NULL,
16225 NULL,
16226 NULL
16229 int mask;
16230 const char *ret;
16232 mask = eflags_p << 3;
16233 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
16234 mask |= unordered_p << 1;
16235 mask |= stack_top_dies;
16237 gcc_assert (mask < 16);
16238 ret = alt[mask];
16239 gcc_assert (ret);
16241 return ret;
16245 void
16246 ix86_output_addr_vec_elt (FILE *file, int value)
16248 const char *directive = ASM_LONG;
16250 #ifdef ASM_QUAD
16251 if (TARGET_LP64)
16252 directive = ASM_QUAD;
16253 #else
16254 gcc_assert (!TARGET_64BIT);
16255 #endif
16257 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
16260 void
16261 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
16263 const char *directive = ASM_LONG;
16265 #ifdef ASM_QUAD
16266 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
16267 directive = ASM_QUAD;
16268 #else
16269 gcc_assert (!TARGET_64BIT);
16270 #endif
16271 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
16272 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
16273 fprintf (file, "%s%s%d-%s%d\n",
16274 directive, LPREFIX, value, LPREFIX, rel);
16275 else if (HAVE_AS_GOTOFF_IN_DATA)
16276 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
16277 #if TARGET_MACHO
16278 else if (TARGET_MACHO)
16280 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
16281 machopic_output_function_base_name (file);
16282 putc ('\n', file);
16284 #endif
16285 else
16286 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
16287 GOT_SYMBOL_NAME, LPREFIX, value);
16290 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
16291 for the target. */
16293 void
16294 ix86_expand_clear (rtx dest)
16296 rtx tmp;
16298 /* We play register width games, which are only valid after reload. */
16299 gcc_assert (reload_completed);
16301 /* Avoid HImode and its attendant prefix byte. */
16302 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
16303 dest = gen_rtx_REG (SImode, REGNO (dest));
16304 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
16306 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
16307 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
16309 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16310 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
16313 emit_insn (tmp);
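/* Illustrative result: clearing %eax expands to "xorl %eax, %eax" with
   an explicit FLAGS_REG clobber; the plain "movl $0, %eax" form is used
   only when TARGET_USE_MOV0 is set and the insn is not being optimized
   for speed.  */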
16316 /* X is an unchanging MEM. If it is a constant pool reference, return
16317 the constant pool rtx, else NULL. */
16320 maybe_get_pool_constant (rtx x)
16322 x = ix86_delegitimize_address (XEXP (x, 0));
16324 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
16325 return get_pool_constant (x);
16327 return NULL_RTX;
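/* Expand a scalar move of mode MODE from operands[1] to operands[0],
   legitimizing TLS, PIC and PE-COFF symbol references and forcing
   awkward constants into registers or the constant pool as needed.  */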
16330 void
16331 ix86_expand_move (enum machine_mode mode, rtx operands[])
16333 rtx op0, op1;
16334 enum tls_model model;
16336 op0 = operands[0];
16337 op1 = operands[1];
16339 if (GET_CODE (op1) == SYMBOL_REF)
16341 rtx tmp;
16343 model = SYMBOL_REF_TLS_MODEL (op1);
16344 if (model)
16346 op1 = legitimize_tls_address (op1, model, true);
16347 op1 = force_operand (op1, op0);
16348 if (op1 == op0)
16349 return;
16350 op1 = convert_to_mode (mode, op1, 1);
16352 else if ((tmp = legitimize_pe_coff_symbol (op1, false)) != NULL_RTX)
16353 op1 = tmp;
16355 else if (GET_CODE (op1) == CONST
16356 && GET_CODE (XEXP (op1, 0)) == PLUS
16357 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
16359 rtx addend = XEXP (XEXP (op1, 0), 1);
16360 rtx symbol = XEXP (XEXP (op1, 0), 0);
16361 rtx tmp;
16363 model = SYMBOL_REF_TLS_MODEL (symbol);
16364 if (model)
16365 tmp = legitimize_tls_address (symbol, model, true);
16366 else
16367 tmp = legitimize_pe_coff_symbol (symbol, true);
16369 if (tmp)
16371 tmp = force_operand (tmp, NULL);
16372 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
16373 op0, 1, OPTAB_DIRECT);
16374 if (tmp == op0)
16375 return;
16376 op1 = convert_to_mode (mode, tmp, 1);
16380 if ((flag_pic || MACHOPIC_INDIRECT)
16381 && symbolic_operand (op1, mode))
16383 if (TARGET_MACHO && !TARGET_64BIT)
16385 #if TARGET_MACHO
16386 /* dynamic-no-pic */
16387 if (MACHOPIC_INDIRECT)
16389 rtx temp = ((reload_in_progress
16390 || ((op0 && REG_P (op0))
16391 && mode == Pmode))
16392 ? op0 : gen_reg_rtx (Pmode));
16393 op1 = machopic_indirect_data_reference (op1, temp);
16394 if (MACHOPIC_PURE)
16395 op1 = machopic_legitimize_pic_address (op1, mode,
16396 temp == op1 ? 0 : temp);
16398 if (op0 != op1 && GET_CODE (op0) != MEM)
16400 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
16401 emit_insn (insn);
16402 return;
16404 if (GET_CODE (op0) == MEM)
16405 op1 = force_reg (Pmode, op1);
16406 else
16408 rtx temp = op0;
16409 if (GET_CODE (temp) != REG)
16410 temp = gen_reg_rtx (Pmode);
16411 temp = legitimize_pic_address (op1, temp);
16412 if (temp == op0)
16413 return;
16414 op1 = temp;
16416 /* dynamic-no-pic */
16417 #endif
16419 else
16421 if (MEM_P (op0))
16422 op1 = force_reg (mode, op1);
16423 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
16425 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
16426 op1 = legitimize_pic_address (op1, reg);
16427 if (op0 == op1)
16428 return;
16429 op1 = convert_to_mode (mode, op1, 1);
16433 else
16435 if (MEM_P (op0)
16436 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
16437 || !push_operand (op0, mode))
16438 && MEM_P (op1))
16439 op1 = force_reg (mode, op1);
16441 if (push_operand (op0, mode)
16442 && ! general_no_elim_operand (op1, mode))
16443 op1 = copy_to_mode_reg (mode, op1);
16445 /* Force large constants in 64bit compilation into register
16446 to get them CSEed. */
16447 if (can_create_pseudo_p ()
16448 && (mode == DImode) && TARGET_64BIT
16449 && immediate_operand (op1, mode)
16450 && !x86_64_zext_immediate_operand (op1, VOIDmode)
16451 && !register_operand (op0, mode)
16452 && optimize)
16453 op1 = copy_to_mode_reg (mode, op1);
16455 if (can_create_pseudo_p ()
16456 && FLOAT_MODE_P (mode)
16457 && GET_CODE (op1) == CONST_DOUBLE)
16459 /* If we are loading a floating point constant to a register,
16460 force the value to memory now, since we'll get better code
16461 out the back end. */
16463 op1 = validize_mem (force_const_mem (mode, op1));
16464 if (!register_operand (op0, mode))
16466 rtx temp = gen_reg_rtx (mode);
16467 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
16468 emit_move_insn (op0, temp);
16469 return;
16474 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
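/* Expand a vector move in MODE, forcing non-trivial constants into the
   constant pool and fixing up under-aligned SSE memory operands before
   emitting the final SET.  */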
16477 void
16478 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
16480 rtx op0 = operands[0], op1 = operands[1];
16481 unsigned int align = GET_MODE_ALIGNMENT (mode);
16483 /* Force constants other than zero into memory. We do not know how
16484 the instructions used to build constants modify the upper 64 bits
16485 of the register; once we have that information we may be able
16486 to handle some of them more efficiently. */
16487 if (can_create_pseudo_p ()
16488 && register_operand (op0, mode)
16489 && (CONSTANT_P (op1)
16490 || (GET_CODE (op1) == SUBREG
16491 && CONSTANT_P (SUBREG_REG (op1))))
16492 && !standard_sse_constant_p (op1))
16493 op1 = validize_mem (force_const_mem (mode, op1));
16495 /* We need to check memory alignment for SSE mode since attribute
16496 can make operands unaligned. */
16497 if (can_create_pseudo_p ()
16498 && SSE_REG_MODE_P (mode)
16499 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
16500 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
16502 rtx tmp[2];
16504 /* ix86_expand_vector_move_misalign() does not like constants ... */
16505 if (CONSTANT_P (op1)
16506 || (GET_CODE (op1) == SUBREG
16507 && CONSTANT_P (SUBREG_REG (op1))))
16508 op1 = validize_mem (force_const_mem (mode, op1));
16510 /* ... nor both arguments in memory. */
16511 if (!register_operand (op0, mode)
16512 && !register_operand (op1, mode))
16513 op1 = force_reg (mode, op1);
16515 tmp[0] = op0; tmp[1] = op1;
16516 ix86_expand_vector_move_misalign (mode, tmp);
16517 return;
16520 /* Make operand1 a register if it isn't already. */
16521 if (can_create_pseudo_p ()
16522 && !register_operand (op0, mode)
16523 && !register_operand (op1, mode))
16525 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
16526 return;
16529 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16532 /* Split 32-byte AVX unaligned load and store if needed. */
16534 static void
16535 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
16537 rtx m;
16538 rtx (*extract) (rtx, rtx, rtx);
16539 rtx (*load_unaligned) (rtx, rtx);
16540 rtx (*store_unaligned) (rtx, rtx);
16541 enum machine_mode mode;
16543 switch (GET_MODE (op0))
16545 default:
16546 gcc_unreachable ();
16547 case V32QImode:
16548 extract = gen_avx_vextractf128v32qi;
16549 load_unaligned = gen_avx_loaddquv32qi;
16550 store_unaligned = gen_avx_storedquv32qi;
16551 mode = V16QImode;
16552 break;
16553 case V8SFmode:
16554 extract = gen_avx_vextractf128v8sf;
16555 load_unaligned = gen_avx_loadups256;
16556 store_unaligned = gen_avx_storeups256;
16557 mode = V4SFmode;
16558 break;
16559 case V4DFmode:
16560 extract = gen_avx_vextractf128v4df;
16561 load_unaligned = gen_avx_loadupd256;
16562 store_unaligned = gen_avx_storeupd256;
16563 mode = V2DFmode;
16564 break;
16567 if (MEM_P (op1))
16569 if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
16571 rtx r = gen_reg_rtx (mode);
16572 m = adjust_address (op1, mode, 0);
16573 emit_move_insn (r, m);
16574 m = adjust_address (op1, mode, 16);
16575 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
16576 emit_move_insn (op0, r);
16578 /* Normal *mov<mode>_internal pattern will handle
16579 unaligned loads just fine if misaligned_operand
16580 is true, and without the UNSPEC it can be combined
16581 with arithmetic instructions. */
16582 else if (misaligned_operand (op1, GET_MODE (op1)))
16583 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16584 else
16585 emit_insn (load_unaligned (op0, op1));
16587 else if (MEM_P (op0))
16589 if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
16591 m = adjust_address (op0, mode, 0);
16592 emit_insn (extract (m, op1, const0_rtx));
16593 m = adjust_address (op0, mode, 16);
16594 emit_insn (extract (m, op1, const1_rtx));
16596 else
16597 emit_insn (store_unaligned (op0, op1));
16599 else
16600 gcc_unreachable ();
16603 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
16604 straight to ix86_expand_vector_move. */
16605 /* Code generation for scalar reg-reg moves of single and double precision data:
16606 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
16607 movaps reg, reg
16608 else
16609 movss reg, reg
16610 if (x86_sse_partial_reg_dependency == true)
16611 movapd reg, reg
16612 else
16613 movsd reg, reg
16615 Code generation for scalar loads of double precision data:
16616 if (x86_sse_split_regs == true)
16617 movlpd mem, reg (gas syntax)
16618 else
16619 movsd mem, reg
16621 Code generation for unaligned packed loads of single precision data
16622 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
16623 if (x86_sse_unaligned_move_optimal)
16624 movups mem, reg
16626 if (x86_sse_partial_reg_dependency == true)
16628 xorps reg, reg
16629 movlps mem, reg
16630 movhps mem+8, reg
16632 else
16634 movlps mem, reg
16635 movhps mem+8, reg
16638 Code generation for unaligned packed loads of double precision data
16639 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
16640 if (x86_sse_unaligned_move_optimal)
16641 movupd mem, reg
16643 if (x86_sse_split_regs == true)
16645 movlpd mem, reg
16646 movhpd mem+8, reg
16648 else
16650 movsd mem, reg
16651 movhpd mem+8, reg
16655 void
16656 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
16658 rtx op0, op1, orig_op0 = NULL_RTX, m;
16659 rtx (*load_unaligned) (rtx, rtx);
16660 rtx (*store_unaligned) (rtx, rtx);
16662 op0 = operands[0];
16663 op1 = operands[1];
16665 if (GET_MODE_SIZE (mode) == 64)
16667 switch (GET_MODE_CLASS (mode))
16669 case MODE_VECTOR_INT:
16670 case MODE_INT:
16671 if (GET_MODE (op0) != V16SImode)
16673 if (!MEM_P (op0))
16675 orig_op0 = op0;
16676 op0 = gen_reg_rtx (V16SImode);
16678 else
16679 op0 = gen_lowpart (V16SImode, op0);
16681 op1 = gen_lowpart (V16SImode, op1);
16682 /* FALLTHRU */
16684 case MODE_VECTOR_FLOAT:
16685 switch (GET_MODE (op0))
16687 default:
16688 gcc_unreachable ();
16689 case V16SImode:
16690 load_unaligned = gen_avx512f_loaddquv16si;
16691 store_unaligned = gen_avx512f_storedquv16si;
16692 break;
16693 case V16SFmode:
16694 load_unaligned = gen_avx512f_loadups512;
16695 store_unaligned = gen_avx512f_storeups512;
16696 break;
16697 case V8DFmode:
16698 load_unaligned = gen_avx512f_loadupd512;
16699 store_unaligned = gen_avx512f_storeupd512;
16700 break;
16703 if (MEM_P (op1))
16704 emit_insn (load_unaligned (op0, op1));
16705 else if (MEM_P (op0))
16706 emit_insn (store_unaligned (op0, op1));
16707 else
16708 gcc_unreachable ();
16709 if (orig_op0)
16710 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
16711 break;
16713 default:
16714 gcc_unreachable ();
16717 return;
16720 if (TARGET_AVX
16721 && GET_MODE_SIZE (mode) == 32)
16723 switch (GET_MODE_CLASS (mode))
16725 case MODE_VECTOR_INT:
16726 case MODE_INT:
16727 if (GET_MODE (op0) != V32QImode)
16729 if (!MEM_P (op0))
16731 orig_op0 = op0;
16732 op0 = gen_reg_rtx (V32QImode);
16734 else
16735 op0 = gen_lowpart (V32QImode, op0);
16737 op1 = gen_lowpart (V32QImode, op1);
16738 /* FALLTHRU */
16740 case MODE_VECTOR_FLOAT:
16741 ix86_avx256_split_vector_move_misalign (op0, op1);
16742 if (orig_op0)
16743 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
16744 break;
16746 default:
16747 gcc_unreachable ();
16750 return;
16753 if (MEM_P (op1))
16755 /* Normal *mov<mode>_internal pattern will handle
16756 unaligned loads just fine if misaligned_operand
16757 is true, and without the UNSPEC it can be combined
16758 with arithmetic instructions. */
16759 if (TARGET_AVX
16760 && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
16761 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
16762 && misaligned_operand (op1, GET_MODE (op1)))
16763 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16764 /* ??? If we have typed data, then it would appear that using
16765 movdqu is the only way to get unaligned data loaded with
16766 integer type. */
16767 else if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16769 if (GET_MODE (op0) != V16QImode)
16771 orig_op0 = op0;
16772 op0 = gen_reg_rtx (V16QImode);
16774 op1 = gen_lowpart (V16QImode, op1);
16775 /* We will eventually emit movups based on insn attributes. */
16776 emit_insn (gen_sse2_loaddquv16qi (op0, op1));
16777 if (orig_op0)
16778 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
16780 else if (TARGET_SSE2 && mode == V2DFmode)
16782 rtx zero;
16784 if (TARGET_AVX
16785 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16786 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16787 || optimize_insn_for_size_p ())
16789 /* We will eventually emit movups based on insn attributes. */
16790 emit_insn (gen_sse2_loadupd (op0, op1));
16791 return;
16794 /* When SSE registers are split into halves, we can avoid
16795 writing to the top half twice. */
16796 if (TARGET_SSE_SPLIT_REGS)
16798 emit_clobber (op0);
16799 zero = op0;
16801 else
16803 /* ??? Not sure about the best option for the Intel chips.
16804 The following would seem to satisfy; the register is
16805 entirely cleared, breaking the dependency chain. We
16806 then store to the upper half, with a dependency depth
16807 of one. A rumor has it that Intel recommends two movsd
16808 followed by an unpacklpd, but this is unconfirmed. And
16809 given that the dependency depth of the unpacklpd would
16810 still be one, I'm not sure why this would be better. */
16811 zero = CONST0_RTX (V2DFmode);
16814 m = adjust_address (op1, DFmode, 0);
16815 emit_insn (gen_sse2_loadlpd (op0, zero, m));
16816 m = adjust_address (op1, DFmode, 8);
16817 emit_insn (gen_sse2_loadhpd (op0, op0, m));
16819 else
16821 rtx t;
16823 if (TARGET_AVX
16824 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16825 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16826 || optimize_insn_for_size_p ())
16828 if (GET_MODE (op0) != V4SFmode)
16830 orig_op0 = op0;
16831 op0 = gen_reg_rtx (V4SFmode);
16833 op1 = gen_lowpart (V4SFmode, op1);
16834 emit_insn (gen_sse_loadups (op0, op1));
16835 if (orig_op0)
16836 emit_move_insn (orig_op0,
16837 gen_lowpart (GET_MODE (orig_op0), op0));
16838 return;
16841 if (mode != V4SFmode)
16842 t = gen_reg_rtx (V4SFmode);
16843 else
16844 t = op0;
16846 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
16847 emit_move_insn (t, CONST0_RTX (V4SFmode));
16848 else
16849 emit_clobber (t);
16851 m = adjust_address (op1, V2SFmode, 0);
16852 emit_insn (gen_sse_loadlps (t, t, m));
16853 m = adjust_address (op1, V2SFmode, 8);
16854 emit_insn (gen_sse_loadhps (t, t, m));
16855 if (mode != V4SFmode)
16856 emit_move_insn (op0, gen_lowpart (mode, t));
16859 else if (MEM_P (op0))
16861 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16863 op0 = gen_lowpart (V16QImode, op0);
16864 op1 = gen_lowpart (V16QImode, op1);
16865 /* We will eventually emit movups based on insn attributes. */
16866 emit_insn (gen_sse2_storedquv16qi (op0, op1));
16868 else if (TARGET_SSE2 && mode == V2DFmode)
16870 if (TARGET_AVX
16871 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16872 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16873 || optimize_insn_for_size_p ())
16874 /* We will eventually emit movups based on insn attributes. */
16875 emit_insn (gen_sse2_storeupd (op0, op1));
16876 else
16878 m = adjust_address (op0, DFmode, 0);
16879 emit_insn (gen_sse2_storelpd (m, op1));
16880 m = adjust_address (op0, DFmode, 8);
16881 emit_insn (gen_sse2_storehpd (m, op1));
16884 else
16886 if (mode != V4SFmode)
16887 op1 = gen_lowpart (V4SFmode, op1);
16889 if (TARGET_AVX
16890 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16891 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16892 || optimize_insn_for_size_p ())
16894 op0 = gen_lowpart (V4SFmode, op0);
16895 emit_insn (gen_sse_storeups (op0, op1));
16897 else
16899 m = adjust_address (op0, V2SFmode, 0);
16900 emit_insn (gen_sse_storelps (m, op1));
16901 m = adjust_address (op0, V2SFmode, 8);
16902 emit_insn (gen_sse_storehps (m, op1));
16906 else
16907 gcc_unreachable ();
16910 /* Expand a push in MODE. This is some mode for which we do not support
16911 proper push instructions, at least from the registers that we expect
16912 the value to live in. */
16914 void
16915 ix86_expand_push (enum machine_mode mode, rtx x)
16917 rtx tmp;
16919 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
16920 GEN_INT (-GET_MODE_SIZE (mode)),
16921 stack_pointer_rtx, 1, OPTAB_DIRECT);
16922 if (tmp != stack_pointer_rtx)
16923 emit_move_insn (stack_pointer_rtx, tmp);
16925 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
16927 /* When we push an operand onto the stack, it has to be aligned at least
16928 at the function argument boundary. However, since we don't have
16929 the argument type, we can't determine the actual argument
16930 boundary. */
16931 emit_move_insn (tmp, x);
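/* For illustration only (the exact instructions depend on MODE and the
   target): pushing a 16-byte vector value this way expands to roughly

	sub	$16, %esp
	movups	%xmm0, (%esp)

   i.e. an explicit stack-pointer adjustment followed by an ordinary
   store, since there is no real push instruction for such operands.  */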
16934 /* Helper function of ix86_fixup_binary_operands to canonicalize
16935 operand order. Returns true if the operands should be swapped. */
16937 static bool
16938 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16939 rtx operands[])
16941 rtx dst = operands[0];
16942 rtx src1 = operands[1];
16943 rtx src2 = operands[2];
16945 /* If the operation is not commutative, we can't do anything. */
16946 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16947 return false;
16949 /* Highest priority is that src1 should match dst. */
16950 if (rtx_equal_p (dst, src1))
16951 return false;
16952 if (rtx_equal_p (dst, src2))
16953 return true;
16955 /* Next highest priority is that immediate constants come second. */
16956 if (immediate_operand (src2, mode))
16957 return false;
16958 if (immediate_operand (src1, mode))
16959 return true;
16961 /* Lowest priority is that memory references should come second. */
16962 if (MEM_P (src2))
16963 return false;
16964 if (MEM_P (src1))
16965 return true;
16967 return false;
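/* Two illustrative cases, assuming a commutative CODE such as PLUS:
   for dst = a, src1 = b, src2 = a we return true so that the caller
   swaps the sources and dst ends up matching src1; for dst = a,
   src1 = 5, src2 = b we also return true so that the immediate
   constant ends up as the second operand.  */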
16971 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
16972 destination to use for the operation. If different from the true
16973 destination in operands[0], a copy operation will be required. */
16976 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16977 rtx operands[])
16979 rtx dst = operands[0];
16980 rtx src1 = operands[1];
16981 rtx src2 = operands[2];
16983 /* Canonicalize operand order. */
16984 if (ix86_swap_binary_operands_p (code, mode, operands))
16986 rtx temp;
16988 /* It is invalid to swap operands of different modes. */
16989 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16991 temp = src1;
16992 src1 = src2;
16993 src2 = temp;
16996 /* Both source operands cannot be in memory. */
16997 if (MEM_P (src1) && MEM_P (src2))
16999 /* Optimization: Only read from memory once. */
17000 if (rtx_equal_p (src1, src2))
17002 src2 = force_reg (mode, src2);
17003 src1 = src2;
17005 else if (rtx_equal_p (dst, src1))
17006 src2 = force_reg (mode, src2);
17007 else
17008 src1 = force_reg (mode, src1);
17011 /* If the destination is memory, and we do not have matching source
17012 operands, do things in registers. */
17013 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
17014 dst = gen_reg_rtx (mode);
17016 /* Source 1 cannot be a constant. */
17017 if (CONSTANT_P (src1))
17018 src1 = force_reg (mode, src1);
17020 /* Source 1 cannot be a non-matching memory. */
17021 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
17022 src1 = force_reg (mode, src1);
17024 /* Improve address combine. */
17025 if (code == PLUS
17026 && GET_MODE_CLASS (mode) == MODE_INT
17027 && MEM_P (src2))
17028 src2 = force_reg (mode, src2);
17030 operands[1] = src1;
17031 operands[2] = src2;
17032 return dst;
17035 /* Similarly, but assume that the destination has already been
17036 set up properly. */
17038 void
17039 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
17040 enum machine_mode mode, rtx operands[])
17042 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
17043 gcc_assert (dst == operands[0]);
17046 /* Attempt to expand a binary operator. Make the expansion closer to the
17047 actual machine, rather than just general_operand, which would allow 3 separate
17048 memory references (one output, two input) in a single insn. */
17050 void
17051 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
17052 rtx operands[])
17054 rtx src1, src2, dst, op, clob;
17056 dst = ix86_fixup_binary_operands (code, mode, operands);
17057 src1 = operands[1];
17058 src2 = operands[2];
17060 /* Emit the instruction. */
17062 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
17063 if (reload_in_progress)
17065 /* Reload doesn't know about the flags register, and doesn't know that
17066 it doesn't want to clobber it. We can only do this with PLUS. */
17067 gcc_assert (code == PLUS);
17068 emit_insn (op);
17070 else if (reload_completed
17071 && code == PLUS
17072 && !rtx_equal_p (dst, src1))
17074 /* This is going to be an LEA; avoid splitting it later. */
17075 emit_insn (op);
17077 else
17079 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17080 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17083 /* Fix up the destination if needed. */
17084 if (dst != operands[0])
17085 emit_move_insn (operands[0], dst);
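/* A rough sketch of the usual result (illustrative): for an SImode
   addition this emits a single

	(parallel [(set (reg:SI dst) (plus:SI (reg:SI src1) (reg:SI src2)))
		   (clobber (reg:CC flags))])

   except in the reload/LEA special cases above, where the set is
   emitted without the flags clobber so that it can become an lea.  */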
17088 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
17089 the given OPERANDS. */
17091 void
17092 ix86_expand_vector_logical_operator (enum rtx_code code, enum machine_mode mode,
17093 rtx operands[])
17095 rtx op1 = NULL_RTX, op2 = NULL_RTX;
17096 if (GET_CODE (operands[1]) == SUBREG)
17098 op1 = operands[1];
17099 op2 = operands[2];
17101 else if (GET_CODE (operands[2]) == SUBREG)
17103 op1 = operands[2];
17104 op2 = operands[1];
17106 /* Optimize (__m128i) d | (__m128i) e and similar code
17107 when d and e are float vectors into float vector logical
17108 insn. In C/C++ without using intrinsics there is no other way
17109 to express vector logical operation on float vectors than
17110 to cast them temporarily to integer vectors. */
17111 if (op1
17112 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
17113 && ((GET_CODE (op2) == SUBREG || GET_CODE (op2) == CONST_VECTOR))
17114 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
17115 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
17116 && SUBREG_BYTE (op1) == 0
17117 && (GET_CODE (op2) == CONST_VECTOR
17118 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
17119 && SUBREG_BYTE (op2) == 0))
17120 && can_create_pseudo_p ())
17122 rtx dst;
17123 switch (GET_MODE (SUBREG_REG (op1)))
17125 case V4SFmode:
17126 case V8SFmode:
17127 case V2DFmode:
17128 case V4DFmode:
17129 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
17130 if (GET_CODE (op2) == CONST_VECTOR)
17132 op2 = gen_lowpart (GET_MODE (dst), op2);
17133 op2 = force_reg (GET_MODE (dst), op2);
17135 else
17137 op1 = operands[1];
17138 op2 = SUBREG_REG (operands[2]);
17139 if (!nonimmediate_operand (op2, GET_MODE (dst)))
17140 op2 = force_reg (GET_MODE (dst), op2);
17142 op1 = SUBREG_REG (op1);
17143 if (!nonimmediate_operand (op1, GET_MODE (dst)))
17144 op1 = force_reg (GET_MODE (dst), op1);
17145 emit_insn (gen_rtx_SET (VOIDmode, dst,
17146 gen_rtx_fmt_ee (code, GET_MODE (dst),
17147 op1, op2)));
17148 emit_move_insn (operands[0], gen_lowpart (mode, dst));
17149 return;
17150 default:
17151 break;
17154 if (!nonimmediate_operand (operands[1], mode))
17155 operands[1] = force_reg (mode, operands[1]);
17156 if (!nonimmediate_operand (operands[2], mode))
17157 operands[2] = force_reg (mode, operands[2]);
17158 ix86_fixup_binary_operands_no_copy (code, mode, operands);
17159 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
17160 gen_rtx_fmt_ee (code, mode, operands[1],
17161 operands[2])));
17164 /* Return TRUE or FALSE depending on whether the binary operator meets the
17165 appropriate constraints. */
17167 bool
17168 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
17169 rtx operands[3])
17171 rtx dst = operands[0];
17172 rtx src1 = operands[1];
17173 rtx src2 = operands[2];
17175 /* Both source operands cannot be in memory. */
17176 if (MEM_P (src1) && MEM_P (src2))
17177 return false;
17179 /* Canonicalize operand order for commutative operators. */
17180 if (ix86_swap_binary_operands_p (code, mode, operands))
17182 rtx temp = src1;
17183 src1 = src2;
17184 src2 = temp;
17187 /* If the destination is memory, we must have a matching source operand. */
17188 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
17189 return false;
17191 /* Source 1 cannot be a constant. */
17192 if (CONSTANT_P (src1))
17193 return false;
17195 /* Source 1 cannot be a non-matching memory. */
17196 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
17197 /* Support "andhi/andsi/anddi" as a zero-extending move. */
17198 return (code == AND
17199 && (mode == HImode
17200 || mode == SImode
17201 || (TARGET_64BIT && mode == DImode))
17202 && satisfies_constraint_L (src2));
17204 return true;
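/* An example of the zero-extend special case above (illustrative):
   an insn of the form reg:SI = (mem:SI addr) & 0xffff is accepted even
   though src1 is a non-matching memory, because it can be implemented
   as a zero-extending load (e.g. movzwl) rather than a read-modify-write
   and the constant satisfies constraint L.  */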
17207 /* Attempt to expand a unary operator. Make the expansion closer to the
17208 actual machine, rather than just general_operand, which would allow 2 separate
17209 memory references (one output, one input) in a single insn. */
17211 void
17212 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
17213 rtx operands[])
17215 int matching_memory;
17216 rtx src, dst, op, clob;
17218 dst = operands[0];
17219 src = operands[1];
17221 /* If the destination is memory, and we do not have matching source
17222 operands, do things in registers. */
17223 matching_memory = 0;
17224 if (MEM_P (dst))
17226 if (rtx_equal_p (dst, src))
17227 matching_memory = 1;
17228 else
17229 dst = gen_reg_rtx (mode);
17232 /* When source operand is memory, destination must match. */
17233 if (MEM_P (src) && !matching_memory)
17234 src = force_reg (mode, src);
17236 /* Emit the instruction. */
17238 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
17239 if (reload_in_progress || code == NOT)
17241 /* Reload doesn't know about the flags register, and doesn't know that
17242 it doesn't want to clobber it. */
17243 gcc_assert (code == NOT);
17244 emit_insn (op);
17246 else
17248 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17249 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17252 /* Fix up the destination if needed. */
17253 if (dst != operands[0])
17254 emit_move_insn (operands[0], dst);
17257 /* Split 32-bit/64-bit divmod with 8-bit unsigned divmod if dividend and
17258 divisor are within the range [0, 255]. */
17260 void
17261 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
17262 bool signed_p)
17264 rtx end_label, qimode_label;
17265 rtx insn, div, mod;
17266 rtx scratch, tmp0, tmp1, tmp2;
17267 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
17268 rtx (*gen_zero_extend) (rtx, rtx);
17269 rtx (*gen_test_ccno_1) (rtx, rtx);
17271 switch (mode)
17273 case SImode:
17274 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
17275 gen_test_ccno_1 = gen_testsi_ccno_1;
17276 gen_zero_extend = gen_zero_extendqisi2;
17277 break;
17278 case DImode:
17279 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
17280 gen_test_ccno_1 = gen_testdi_ccno_1;
17281 gen_zero_extend = gen_zero_extendqidi2;
17282 break;
17283 default:
17284 gcc_unreachable ();
17287 end_label = gen_label_rtx ();
17288 qimode_label = gen_label_rtx ();
17290 scratch = gen_reg_rtx (mode);
17292 /* Use 8-bit unsigned divmod if dividend and divisor are within
17293 the range [0, 255]. */
17294 emit_move_insn (scratch, operands[2]);
17295 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
17296 scratch, 1, OPTAB_DIRECT);
17297 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
17298 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
17299 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
17300 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
17301 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
17302 pc_rtx);
17303 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
17304 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17305 JUMP_LABEL (insn) = qimode_label;
17307 /* Generate the original signed/unsigned divmod. */
17308 div = gen_divmod4_1 (operands[0], operands[1],
17309 operands[2], operands[3]);
17310 emit_insn (div);
17312 /* Branch to the end. */
17313 emit_jump_insn (gen_jump (end_label));
17314 emit_barrier ();
17316 /* Generate 8bit unsigned divide. */
17317 emit_label (qimode_label);
17318 /* Don't use operands[0] for result of 8bit divide since not all
17319 registers support QImode ZERO_EXTRACT. */
17320 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
17321 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
17322 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
17323 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
17325 if (signed_p)
17327 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
17328 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
17330 else
17332 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
17333 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
17336 /* Extract remainder from AH. */
17337 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
17338 if (REG_P (operands[1]))
17339 insn = emit_move_insn (operands[1], tmp1);
17340 else
17342 /* Need a new scratch register since the old one has the result
17343 of the 8-bit divide. */
17344 scratch = gen_reg_rtx (mode);
17345 emit_move_insn (scratch, tmp1);
17346 insn = emit_move_insn (operands[1], scratch);
17348 set_unique_reg_note (insn, REG_EQUAL, mod);
17350 /* Zero extend quotient from AL. */
17351 tmp1 = gen_lowpart (QImode, tmp0);
17352 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
17353 set_unique_reg_note (insn, REG_EQUAL, div);
17355 emit_label (end_label);
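/* The generated control flow looks roughly like this (illustrative
   sketch for the SImode unsigned case):

	mov	dividend, %tmp
	or	divisor, %tmp
	test	$-0x100, %tmp
	je	.Lqimode
	div	divisor			; full 32-bit divide
	jmp	.Lend
   .Lqimode:
	divb	divisor			; 8-bit divide, AL = quot, AH = rem
   .Lend:

   so the cheap 8-bit divide is used whenever both operands fit in
   [0, 255].  */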
17358 #define LEA_MAX_STALL (3)
17359 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
17361 /* Increase given DISTANCE in half-cycles according to
17362 dependencies between PREV and NEXT instructions.
17363 Add 1 half-cycle if there is no dependency and
17364 go to the next cycle if there is some dependency. */
17366 static unsigned int
17367 increase_distance (rtx prev, rtx next, unsigned int distance)
17369 df_ref *use_rec;
17370 df_ref *def_rec;
17372 if (!prev || !next)
17373 return distance + (distance & 1) + 2;
17375 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
17376 return distance + 1;
17378 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
17379 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
17380 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
17381 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
17382 return distance + (distance & 1) + 2;
17384 return distance + 1;
17387 /* Function checks if instruction INSN defines register number
17388 REGNO1 or REGNO2. */
17390 static bool
17391 insn_defines_reg (unsigned int regno1, unsigned int regno2,
17392 rtx insn)
17394 df_ref *def_rec;
17396 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
17397 if (DF_REF_REG_DEF_P (*def_rec)
17398 && !DF_REF_IS_ARTIFICIAL (*def_rec)
17399 && (regno1 == DF_REF_REGNO (*def_rec)
17400 || regno2 == DF_REF_REGNO (*def_rec)))
17402 return true;
17405 return false;
17408 /* Function checks if instruction INSN uses register number
17409 REGNO as a part of address expression. */
17411 static bool
17412 insn_uses_reg_mem (unsigned int regno, rtx insn)
17414 df_ref *use_rec;
17416 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
17417 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
17418 return true;
17420 return false;
17423 /* Search backward for non-agu definition of register number REGNO1
17424 or register number REGNO2 in basic block starting from instruction
17425 START up to head of basic block or instruction INSN.
17427 Function puts true value into *FOUND var if definition was found
17428 and false otherwise.
17430 Distance in half-cycles between START and found instruction or head
17431 of BB is added to DISTANCE and returned. */
17433 static int
17434 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
17435 rtx insn, int distance,
17436 rtx start, bool *found)
17438 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
17439 rtx prev = start;
17440 rtx next = NULL;
17442 *found = false;
17444 while (prev
17445 && prev != insn
17446 && distance < LEA_SEARCH_THRESHOLD)
17448 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
17450 distance = increase_distance (prev, next, distance);
17451 if (insn_defines_reg (regno1, regno2, prev))
17453 if (recog_memoized (prev) < 0
17454 || get_attr_type (prev) != TYPE_LEA)
17456 *found = true;
17457 return distance;
17461 next = prev;
17463 if (prev == BB_HEAD (bb))
17464 break;
17466 prev = PREV_INSN (prev);
17469 return distance;
17472 /* Search backward for non-agu definition of register number REGNO1
17473 or register number REGNO2 in INSN's basic block until
17474 1. Pass LEA_SEARCH_THRESHOLD instructions, or
17475 2. Reach a neighbouring BB's boundary, or
17476 3. Reach agu definition.
17477 Returns the distance between the non-agu definition point and INSN.
17478 If no definition point, returns -1. */
17480 static int
17481 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
17482 rtx insn)
17484 basic_block bb = BLOCK_FOR_INSN (insn);
17485 int distance = 0;
17486 bool found = false;
17488 if (insn != BB_HEAD (bb))
17489 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
17490 distance, PREV_INSN (insn),
17491 &found);
17493 if (!found && distance < LEA_SEARCH_THRESHOLD)
17495 edge e;
17496 edge_iterator ei;
17497 bool simple_loop = false;
17499 FOR_EACH_EDGE (e, ei, bb->preds)
17500 if (e->src == bb)
17502 simple_loop = true;
17503 break;
17506 if (simple_loop)
17507 distance = distance_non_agu_define_in_bb (regno1, regno2,
17508 insn, distance,
17509 BB_END (bb), &found);
17510 else
17512 int shortest_dist = -1;
17513 bool found_in_bb = false;
17515 FOR_EACH_EDGE (e, ei, bb->preds)
17517 int bb_dist
17518 = distance_non_agu_define_in_bb (regno1, regno2,
17519 insn, distance,
17520 BB_END (e->src),
17521 &found_in_bb);
17522 if (found_in_bb)
17524 if (shortest_dist < 0)
17525 shortest_dist = bb_dist;
17526 else if (bb_dist > 0)
17527 shortest_dist = MIN (bb_dist, shortest_dist);
17529 found = true;
17533 distance = shortest_dist;
17537 /* get_attr_type may modify recog data. We want to make sure
17538 that recog data is valid for instruction INSN, on which
17539 distance_non_agu_define is called. INSN is unchanged here. */
17540 extract_insn_cached (insn);
17542 if (!found)
17543 return -1;
17545 return distance >> 1;
17548 /* Return the distance in half-cycles between INSN and the next
17549 insn that uses register number REGNO in a memory address, added
17550 to DISTANCE. Return -1 if REGNO is set.
17552 Put true value into *FOUND if register usage was found and
17553 false otherwise.
17554 Put true value into *REDEFINED if register redefinition was
17555 found and false otherwise. */
17557 static int
17558 distance_agu_use_in_bb (unsigned int regno,
17559 rtx insn, int distance, rtx start,
17560 bool *found, bool *redefined)
17562 basic_block bb = NULL;
17563 rtx next = start;
17564 rtx prev = NULL;
17566 *found = false;
17567 *redefined = false;
17569 if (start != NULL_RTX)
17571 bb = BLOCK_FOR_INSN (start);
17572 if (start != BB_HEAD (bb))
17573 /* If insn and start belong to the same bb, set prev to insn,
17574 so the call to increase_distance will increase the distance
17575 between insns by 1. */
17576 prev = insn;
17579 while (next
17580 && next != insn
17581 && distance < LEA_SEARCH_THRESHOLD)
17583 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
17585 distance = increase_distance(prev, next, distance);
17586 if (insn_uses_reg_mem (regno, next))
17588 /* Return DISTANCE if OP0 is used in memory
17589 address in NEXT. */
17590 *found = true;
17591 return distance;
17594 if (insn_defines_reg (regno, INVALID_REGNUM, next))
17596 /* Return -1 if OP0 is set in NEXT. */
17597 *redefined = true;
17598 return -1;
17601 prev = next;
17604 if (next == BB_END (bb))
17605 break;
17607 next = NEXT_INSN (next);
17610 return distance;
17613 /* Return the distance between INSN and the next insn that uses
17614 register number REGNO0 in a memory address. Return -1 if no such
17615 use is found within LEA_SEARCH_THRESHOLD or if REGNO0 is set. */
17617 static int
17618 distance_agu_use (unsigned int regno0, rtx insn)
17620 basic_block bb = BLOCK_FOR_INSN (insn);
17621 int distance = 0;
17622 bool found = false;
17623 bool redefined = false;
17625 if (insn != BB_END (bb))
17626 distance = distance_agu_use_in_bb (regno0, insn, distance,
17627 NEXT_INSN (insn),
17628 &found, &redefined);
17630 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
17632 edge e;
17633 edge_iterator ei;
17634 bool simple_loop = false;
17636 FOR_EACH_EDGE (e, ei, bb->succs)
17637 if (e->dest == bb)
17639 simple_loop = true;
17640 break;
17643 if (simple_loop)
17644 distance = distance_agu_use_in_bb (regno0, insn,
17645 distance, BB_HEAD (bb),
17646 &found, &redefined);
17647 else
17649 int shortest_dist = -1;
17650 bool found_in_bb = false;
17651 bool redefined_in_bb = false;
17653 FOR_EACH_EDGE (e, ei, bb->succs)
17655 int bb_dist
17656 = distance_agu_use_in_bb (regno0, insn,
17657 distance, BB_HEAD (e->dest),
17658 &found_in_bb, &redefined_in_bb);
17659 if (found_in_bb)
17661 if (shortest_dist < 0)
17662 shortest_dist = bb_dist;
17663 else if (bb_dist > 0)
17664 shortest_dist = MIN (bb_dist, shortest_dist);
17666 found = true;
17670 distance = shortest_dist;
17674 if (!found || redefined)
17675 return -1;
17677 return distance >> 1;
17680 /* Define this macro to tune LEA priority vs ADD; it takes effect when
17681 there is a choice between LEA and ADD.
17682 Negative value: ADD is preferred over LEA
17683 Zero: Neutral
17684 Positive value: LEA is preferred over ADD. */
17685 #define IX86_LEA_PRIORITY 0
17687 /* Return true if usage of lea INSN has performance advantage
17688 over a sequence of instructions. The instruction sequence has
17689 SPLIT_COST cycles higher latency than the lea latency. */
17691 static bool
17692 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
17693 unsigned int regno2, int split_cost, bool has_scale)
17695 int dist_define, dist_use;
17697 /* For Silvermont, if a 2-source or 3-source LEA is used for a
17698 non-destructive destination, or because the SCALE factor is
17699 needed, the use of LEA is justified. */
17700 if (ix86_tune == PROCESSOR_SLM)
17702 if (has_scale)
17703 return true;
17704 if (split_cost < 1)
17705 return false;
17706 if (regno0 == regno1 || regno0 == regno2)
17707 return false;
17708 return true;
17711 dist_define = distance_non_agu_define (regno1, regno2, insn);
17712 dist_use = distance_agu_use (regno0, insn);
17714 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
17716 /* If there is no non-AGU operand definition, no AGU
17717 operand usage and the split cost is 0, then both the lea
17718 and non-lea variants have the same priority. Currently
17719 we prefer lea for 64-bit code and non-lea for 32-bit
17720 code. */
17721 if (dist_use < 0 && split_cost == 0)
17722 return TARGET_64BIT || IX86_LEA_PRIORITY;
17723 else
17724 return true;
17727 /* The longer the distance to the defining insn, the more preferable
17728 lea is. Here we adjust it to take the splitting cost and
17729 lea priority into account. */
17730 dist_define += split_cost + IX86_LEA_PRIORITY;
17732 /* If there is no use in a memory address then we just check
17733 that the split cost exceeds the AGU stall. */
17734 if (dist_use < 0)
17735 return dist_define > LEA_MAX_STALL;
17737 /* If this insn has both backward non-agu dependence and forward
17738 agu dependence, the one with the shorter distance takes effect. */
17739 return dist_define >= dist_use;
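/* A worked example of the heuristic above (illustrative): suppose the
   lea's sources were last set by a non-AGU instruction so that
   dist_define == 2, and the lea's result is next used in a memory
   address so that dist_use == 4.  With split_cost == 1 and
   IX86_LEA_PRIORITY == 0 we compare dist_define + split_cost
   + IX86_LEA_PRIORITY == 3 against dist_use == 4; since 3 < 4 the
   function returns false and callers such as ix86_avoid_lea_for_addr
   will split the lea into plain ALU instructions.  */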
17742 /* Return true if it is legal to clobber flags by INSN and
17743 false otherwise. */
17745 static bool
17746 ix86_ok_to_clobber_flags (rtx insn)
17748 basic_block bb = BLOCK_FOR_INSN (insn);
17749 df_ref *use;
17750 bitmap live;
17752 while (insn)
17754 if (NONDEBUG_INSN_P (insn))
17756 for (use = DF_INSN_USES (insn); *use; use++)
17757 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
17758 return false;
17760 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
17761 return true;
17764 if (insn == BB_END (bb))
17765 break;
17767 insn = NEXT_INSN (insn);
17770 live = df_get_live_out(bb);
17771 return !REGNO_REG_SET_P (live, FLAGS_REG);
17774 /* Return true if we need to split op0 = op1 + op2 into a sequence of
17775 move and add to avoid AGU stalls. */
17777 bool
17778 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
17780 unsigned int regno0, regno1, regno2;
17782 /* Check if we need to optimize. */
17783 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17784 return false;
17786 /* Check it is correct to split here. */
17787 if (!ix86_ok_to_clobber_flags(insn))
17788 return false;
17790 regno0 = true_regnum (operands[0]);
17791 regno1 = true_regnum (operands[1]);
17792 regno2 = true_regnum (operands[2]);
17794 /* We need to split only adds with a non-destructive
17795 destination operand. */
17796 if (regno0 == regno1 || regno0 == regno2)
17797 return false;
17798 else
17799 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
17802 /* Return true if we should emit lea instruction instead of mov
17803 instruction. */
17805 bool
17806 ix86_use_lea_for_mov (rtx insn, rtx operands[])
17808 unsigned int regno0, regno1;
17810 /* Check if we need to optimize. */
17811 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17812 return false;
17814 /* Use lea for reg to reg moves only. */
17815 if (!REG_P (operands[0]) || !REG_P (operands[1]))
17816 return false;
17818 regno0 = true_regnum (operands[0]);
17819 regno1 = true_regnum (operands[1]);
17821 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
17824 /* Return true if we need to split lea into a sequence of
17825 instructions to avoid AGU stalls. */
17827 bool
17828 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
17830 unsigned int regno0, regno1, regno2;
17831 int split_cost;
17832 struct ix86_address parts;
17833 int ok;
17835 /* Check we need to optimize. */
17836 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17837 return false;
17839 /* Check it is correct to split here. */
17840 if (!ix86_ok_to_clobber_flags(insn))
17841 return false;
17843 ok = ix86_decompose_address (operands[1], &parts);
17844 gcc_assert (ok);
17846 /* There should be at least two components in the address. */
17847 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
17848 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
17849 return false;
17851 /* We should not split into an add if a non-legitimate PIC
17852 operand is used as the displacement. */
17853 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
17854 return false;
17856 regno0 = true_regnum (operands[0]) ;
17857 regno1 = INVALID_REGNUM;
17858 regno2 = INVALID_REGNUM;
17860 if (parts.base)
17861 regno1 = true_regnum (parts.base);
17862 if (parts.index)
17863 regno2 = true_regnum (parts.index);
17865 split_cost = 0;
17867 /* Compute how many cycles we will add to execution time
17868 if we split the lea into a sequence of instructions. */
17869 if (parts.base || parts.index)
17871 /* Have to use a mov instruction if the non-destructive
17872 destination form is used. */
17873 if (regno1 != regno0 && regno2 != regno0)
17874 split_cost += 1;
17876 /* Have to add index to base if both exist. */
17877 if (parts.base && parts.index)
17878 split_cost += 1;
17880 /* Have to use shift and adds if scale is 2 or greater. */
17881 if (parts.scale > 1)
17883 if (regno0 != regno1)
17884 split_cost += 1;
17885 else if (regno2 == regno0)
17886 split_cost += 4;
17887 else
17888 split_cost += parts.scale;
17891 /* Have to use an add instruction with an immediate if
17892 disp is non-zero. */
17893 if (parts.disp && parts.disp != const0_rtx)
17894 split_cost += 1;
17896 /* Subtract the price of lea. */
17897 split_cost -= 1;
17900 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
17901 parts.scale > 1);
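/* A worked example of the split cost above (illustrative): for

	lea	0x4(%rbx,%rcx,2), %rax

   we count 1 for the initial mov (the destination matches neither
   source), 1 for adding the index to the base, 1 for the shift implied
   by scale 2, and 1 for adding the displacement, minus 1 for the lea
   that is no longer emitted, giving split_cost == 3.  That cost is then
   weighed by ix86_lea_outperforms against the expected AGU stall.  */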
17904 /* Emit the x86 binary operator CODE in mode MODE, where the first operand
17905 matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
17907 static void
17908 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
17909 rtx dst, rtx src)
17911 rtx op, clob;
17913 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
17914 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17916 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17919 /* Return true if the definition of REGNO1 is closer to INSN than that of REGNO2. */
17921 static bool
17922 find_nearest_reg_def (rtx insn, int regno1, int regno2)
17924 rtx prev = insn;
17925 rtx start = BB_HEAD (BLOCK_FOR_INSN (insn));
17927 if (insn == start)
17928 return false;
17929 while (prev && prev != start)
17931 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
17933 prev = PREV_INSN (prev);
17934 continue;
17936 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
17937 return true;
17938 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
17939 return false;
17940 prev = PREV_INSN (prev);
17943 /* None of the regs is defined in the bb. */
17944 return false;
17947 /* Split lea instructions into a sequence of instructions
17948 which are executed on the ALU to avoid AGU stalls.
17949 It is assumed that it is allowed to clobber the flags register
17950 at the lea position. */
17952 void
17953 ix86_split_lea_for_addr (rtx insn, rtx operands[], enum machine_mode mode)
17955 unsigned int regno0, regno1, regno2;
17956 struct ix86_address parts;
17957 rtx target, tmp;
17958 int ok, adds;
17960 ok = ix86_decompose_address (operands[1], &parts);
17961 gcc_assert (ok);
17963 target = gen_lowpart (mode, operands[0]);
17965 regno0 = true_regnum (target);
17966 regno1 = INVALID_REGNUM;
17967 regno2 = INVALID_REGNUM;
17969 if (parts.base)
17971 parts.base = gen_lowpart (mode, parts.base);
17972 regno1 = true_regnum (parts.base);
17975 if (parts.index)
17977 parts.index = gen_lowpart (mode, parts.index);
17978 regno2 = true_regnum (parts.index);
17981 if (parts.disp)
17982 parts.disp = gen_lowpart (mode, parts.disp);
17984 if (parts.scale > 1)
17986 /* Case r1 = r1 + ... */
17987 if (regno1 == regno0)
17989 /* If we have a case r1 = r1 + C * r1 then we
17990 should use multiplication which is very
17991 expensive. Assume the cost model is wrong if we
17992 have such a case here. */
17993 gcc_assert (regno2 != regno0);
17995 for (adds = parts.scale; adds > 0; adds--)
17996 ix86_emit_binop (PLUS, mode, target, parts.index);
17998 else
18000 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
18001 if (regno0 != regno2)
18002 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
18004 /* Use shift for scaling. */
18005 ix86_emit_binop (ASHIFT, mode, target,
18006 GEN_INT (exact_log2 (parts.scale)));
18008 if (parts.base)
18009 ix86_emit_binop (PLUS, mode, target, parts.base);
18011 if (parts.disp && parts.disp != const0_rtx)
18012 ix86_emit_binop (PLUS, mode, target, parts.disp);
18015 else if (!parts.base && !parts.index)
18017 gcc_assert(parts.disp);
18018 emit_insn (gen_rtx_SET (VOIDmode, target, parts.disp));
18020 else
18022 if (!parts.base)
18024 if (regno0 != regno2)
18025 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
18027 else if (!parts.index)
18029 if (regno0 != regno1)
18030 emit_insn (gen_rtx_SET (VOIDmode, target, parts.base));
18032 else
18034 if (regno0 == regno1)
18035 tmp = parts.index;
18036 else if (regno0 == regno2)
18037 tmp = parts.base;
18038 else
18040 rtx tmp1;
18042 /* Find the better operand for the SET instruction, depending
18043 on which definition is farther from the insn. */
18044 if (find_nearest_reg_def (insn, regno1, regno2))
18045 tmp = parts.index, tmp1 = parts.base;
18046 else
18047 tmp = parts.base, tmp1 = parts.index;
18049 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
18051 if (parts.disp && parts.disp != const0_rtx)
18052 ix86_emit_binop (PLUS, mode, target, parts.disp);
18054 ix86_emit_binop (PLUS, mode, target, tmp1);
18055 return;
18058 ix86_emit_binop (PLUS, mode, target, tmp);
18061 if (parts.disp && parts.disp != const0_rtx)
18062 ix86_emit_binop (PLUS, mode, target, parts.disp);
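/* An illustrative expansion (the exact register choices depend on which
   operands match the destination): splitting

	lea	0x4(%rbx,%rcx,2), %rax

   yields roughly

	mov	%rcx, %rax
	shl	$1, %rax
	add	%rbx, %rax
	add	$0x4, %rax

   i.e. the scaled index is materialized first with a mov and a shift,
   then the base and the displacement are added with plain ALU
   instructions.  */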
18066 /* Return true if it is ok to optimize an ADD operation to LEA
18067 operation to avoid flag register consumption. For most processors,
18068 ADD is faster than LEA. For processors like Atom, if the
18069 destination register of the LEA holds an actual address which will be
18070 used soon, LEA is better; otherwise ADD is better. */
18072 bool
18073 ix86_lea_for_add_ok (rtx insn, rtx operands[])
18075 unsigned int regno0 = true_regnum (operands[0]);
18076 unsigned int regno1 = true_regnum (operands[1]);
18077 unsigned int regno2 = true_regnum (operands[2]);
18079 /* If a = b + c, (a!=b && a!=c), must use lea form. */
18080 if (regno0 != regno1 && regno0 != regno2)
18081 return true;
18083 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
18084 return false;
18086 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
18089 /* Return true if destination reg of SET_BODY is shift count of
18090 USE_BODY. */
18092 static bool
18093 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
18095 rtx set_dest;
18096 rtx shift_rtx;
18097 int i;
18099 /* Retrieve destination of SET_BODY. */
18100 switch (GET_CODE (set_body))
18102 case SET:
18103 set_dest = SET_DEST (set_body);
18104 if (!set_dest || !REG_P (set_dest))
18105 return false;
18106 break;
18107 case PARALLEL:
18108 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
18109 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
18110 use_body))
18111 return true;
18112 default:
18113 return false;
18114 break;
18117 /* Retrieve shift count of USE_BODY. */
18118 switch (GET_CODE (use_body))
18120 case SET:
18121 shift_rtx = XEXP (use_body, 1);
18122 break;
18123 case PARALLEL:
18124 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
18125 if (ix86_dep_by_shift_count_body (set_body,
18126 XVECEXP (use_body, 0, i)))
18127 return true;
18128 default:
18129 return false;
18130 break;
18133 if (shift_rtx
18134 && (GET_CODE (shift_rtx) == ASHIFT
18135 || GET_CODE (shift_rtx) == LSHIFTRT
18136 || GET_CODE (shift_rtx) == ASHIFTRT
18137 || GET_CODE (shift_rtx) == ROTATE
18138 || GET_CODE (shift_rtx) == ROTATERT))
18140 rtx shift_count = XEXP (shift_rtx, 1);
18142 /* Return true if shift count is dest of SET_BODY. */
18143 if (REG_P (shift_count))
18145 /* Add a check since this can be invoked before register
18146 allocation by the pre-reload scheduler. */
18147 if (reload_completed
18148 && true_regnum (set_dest) == true_regnum (shift_count))
18149 return true;
18150 else if (REGNO(set_dest) == REGNO(shift_count))
18151 return true;
18155 return false;
18158 /* Return true if destination reg of SET_INSN is shift count of
18159 USE_INSN. */
18161 bool
18162 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
18164 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
18165 PATTERN (use_insn));
18168 /* Return TRUE or FALSE depending on whether the unary operator meets the
18169 appropriate constraints. */
18171 bool
18172 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
18173 enum machine_mode mode ATTRIBUTE_UNUSED,
18174 rtx operands[2])
18176 /* If one of operands is memory, source and destination must match. */
18177 if ((MEM_P (operands[0])
18178 || MEM_P (operands[1]))
18179 && ! rtx_equal_p (operands[0], operands[1]))
18180 return false;
18181 return true;
18184 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
18185 are ok, keeping in mind the possible movddup alternative. */
18187 bool
18188 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
18190 if (MEM_P (operands[0]))
18191 return rtx_equal_p (operands[0], operands[1 + high]);
18192 if (MEM_P (operands[1]) && MEM_P (operands[2]))
18193 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
18194 return true;
18197 /* Post-reload splitter for converting an SF or DFmode value in an
18198 SSE register into an unsigned SImode. */
18200 void
18201 ix86_split_convert_uns_si_sse (rtx operands[])
18203 enum machine_mode vecmode;
18204 rtx value, large, zero_or_two31, input, two31, x;
18206 large = operands[1];
18207 zero_or_two31 = operands[2];
18208 input = operands[3];
18209 two31 = operands[4];
18210 vecmode = GET_MODE (large);
18211 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
18213 /* Load up the value into the low element. We must ensure that the other
18214 elements are valid floats -- zero is the easiest such value. */
18215 if (MEM_P (input))
18217 if (vecmode == V4SFmode)
18218 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
18219 else
18220 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
18222 else
18224 input = gen_rtx_REG (vecmode, REGNO (input));
18225 emit_move_insn (value, CONST0_RTX (vecmode));
18226 if (vecmode == V4SFmode)
18227 emit_insn (gen_sse_movss (value, value, input));
18228 else
18229 emit_insn (gen_sse2_movsd (value, value, input));
18232 emit_move_insn (large, two31);
18233 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
18235 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
18236 emit_insn (gen_rtx_SET (VOIDmode, large, x));
18238 x = gen_rtx_AND (vecmode, zero_or_two31, large);
18239 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
18241 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
18242 emit_insn (gen_rtx_SET (VOIDmode, value, x));
18244 large = gen_rtx_REG (V4SImode, REGNO (large));
18245 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
18247 x = gen_rtx_REG (V4SImode, REGNO (value));
18248 if (vecmode == V4SFmode)
18249 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
18250 else
18251 emit_insn (gen_sse2_cvttpd2dq (x, value));
18252 value = x;
18254 emit_insn (gen_xorv4si3 (value, value, large));
18257 /* Convert an unsigned DImode value into a DFmode, using only SSE.
18258 Expects the 64-bit DImode to be supplied in a pair of integral
18259 registers. Requires SSE2; will use SSE3 if available. For x86_32,
18260 -mfpmath=sse, !optimize_size only. */
18262 void
18263 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
18265 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
18266 rtx int_xmm, fp_xmm;
18267 rtx biases, exponents;
18268 rtx x;
18270 int_xmm = gen_reg_rtx (V4SImode);
18271 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
18272 emit_insn (gen_movdi_to_sse (int_xmm, input));
18273 else if (TARGET_SSE_SPLIT_REGS)
18275 emit_clobber (int_xmm);
18276 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
18278 else
18280 x = gen_reg_rtx (V2DImode);
18281 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
18282 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
18285 x = gen_rtx_CONST_VECTOR (V4SImode,
18286 gen_rtvec (4, GEN_INT (0x43300000UL),
18287 GEN_INT (0x45300000UL),
18288 const0_rtx, const0_rtx));
18289 exponents = validize_mem (force_const_mem (V4SImode, x));
18291 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
18292 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
18294 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
18295 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
18296 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
18297 (0x1.0p84 + double(fp_value_hi_xmm)).
18298 Note these exponents differ by 32. */
18300 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
18302 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
18303 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
18304 real_ldexp (&bias_lo_rvt, &dconst1, 52);
18305 real_ldexp (&bias_hi_rvt, &dconst1, 84);
18306 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
18307 x = const_double_from_real_value (bias_hi_rvt, DFmode);
18308 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
18309 biases = validize_mem (force_const_mem (V2DFmode, biases));
18310 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
18312 /* Add the upper and lower DFmode values together. */
18313 if (TARGET_SSE3)
18314 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
18315 else
18317 x = copy_to_mode_reg (V2DFmode, fp_xmm);
18318 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
18319 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
18322 ix86_expand_vector_extract (false, target, fp_xmm, 0);
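/* A worked example of the exponent-splicing trick above (illustrative):
   for the input 2^33 + 5 the low word is 5 and the high word is 2.
   Splicing the exponent words on gives the doubles 0x1.0p52 + 5 and
   0x1.0p84 + 2*2^32; subtracting the 0x1.0p52 and 0x1.0p84 biases
   leaves 5.0 and 8589934592.0, and the final add produces 8589934597.0,
   i.e. exactly (double) (2^33 + 5).  */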
18325 /* Not used, but eases macroization of patterns. */
18326 void
18327 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
18328 rtx input ATTRIBUTE_UNUSED)
18330 gcc_unreachable ();
18333 /* Convert an unsigned SImode value into a DFmode. Only currently used
18334 for SSE, but applicable anywhere. */
18336 void
18337 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
18339 REAL_VALUE_TYPE TWO31r;
18340 rtx x, fp;
18342 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
18343 NULL, 1, OPTAB_DIRECT);
18345 fp = gen_reg_rtx (DFmode);
18346 emit_insn (gen_floatsidf2 (fp, x));
18348 real_ldexp (&TWO31r, &dconst1, 31);
18349 x = const_double_from_real_value (TWO31r, DFmode);
18351 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
18352 if (x != target)
18353 emit_move_insn (target, x);
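/* A worked example (illustrative): for the input 0x80000005
   (2147483653 when read as unsigned) the PLUS of -2147483648 wraps to
   the signed value 5; the signed int->double conversion gives 5.0, and
   adding 0x1.0p31 back yields 2147483653.0.  Inputs below 2^31 work the
   same way: 7 becomes -2147483641, converts to -2147483641.0, and
   adding 2^31 restores 7.0.  */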
18356 /* Convert a signed DImode value into a DFmode. Only used for SSE in
18357 32-bit mode; otherwise we have a direct convert instruction. */
18359 void
18360 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
18362 REAL_VALUE_TYPE TWO32r;
18363 rtx fp_lo, fp_hi, x;
18365 fp_lo = gen_reg_rtx (DFmode);
18366 fp_hi = gen_reg_rtx (DFmode);
18368 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
18370 real_ldexp (&TWO32r, &dconst1, 32);
18371 x = const_double_from_real_value (TWO32r, DFmode);
18372 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
18374 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
18376 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
18377 0, OPTAB_DIRECT);
18378 if (x != target)
18379 emit_move_insn (target, x);
18382 /* Convert an unsigned SImode value into a SFmode, using only SSE.
18383 For x86_32, -mfpmath=sse, !optimize_size only. */
18384 void
18385 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
18387 REAL_VALUE_TYPE ONE16r;
18388 rtx fp_hi, fp_lo, int_hi, int_lo, x;
18390 real_ldexp (&ONE16r, &dconst1, 16);
18391 x = const_double_from_real_value (ONE16r, SFmode);
18392 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
18393 NULL, 0, OPTAB_DIRECT);
18394 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
18395 NULL, 0, OPTAB_DIRECT);
18396 fp_hi = gen_reg_rtx (SFmode);
18397 fp_lo = gen_reg_rtx (SFmode);
18398 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
18399 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
18400 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
18401 0, OPTAB_DIRECT);
18402 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
18403 0, OPTAB_DIRECT);
18404 if (!rtx_equal_p (target, fp_hi))
18405 emit_move_insn (target, fp_hi);
18408 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
18409 a vector of unsigned ints VAL to a vector of floats TARGET. */
18411 void
18412 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
18414 rtx tmp[8];
18415 REAL_VALUE_TYPE TWO16r;
18416 enum machine_mode intmode = GET_MODE (val);
18417 enum machine_mode fltmode = GET_MODE (target);
18418 rtx (*cvt) (rtx, rtx);
18420 if (intmode == V4SImode)
18421 cvt = gen_floatv4siv4sf2;
18422 else
18423 cvt = gen_floatv8siv8sf2;
18424 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
18425 tmp[0] = force_reg (intmode, tmp[0]);
18426 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
18427 OPTAB_DIRECT);
18428 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
18429 NULL_RTX, 1, OPTAB_DIRECT);
18430 tmp[3] = gen_reg_rtx (fltmode);
18431 emit_insn (cvt (tmp[3], tmp[1]));
18432 tmp[4] = gen_reg_rtx (fltmode);
18433 emit_insn (cvt (tmp[4], tmp[2]));
18434 real_ldexp (&TWO16r, &dconst1, 16);
18435 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
18436 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
18437 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
18438 OPTAB_DIRECT);
18439 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
18440 OPTAB_DIRECT);
18441 if (tmp[7] != target)
18442 emit_move_insn (target, tmp[7]);
18445 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
18446 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
18447 This is done by doing just signed conversion if < 0x1p31, and otherwise by
18448 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
18451 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
18453 REAL_VALUE_TYPE TWO31r;
18454 rtx two31r, tmp[4];
18455 enum machine_mode mode = GET_MODE (val);
18456 enum machine_mode scalarmode = GET_MODE_INNER (mode);
18457 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
18458 rtx (*cmp) (rtx, rtx, rtx, rtx);
18459 int i;
18461 for (i = 0; i < 3; i++)
18462 tmp[i] = gen_reg_rtx (mode);
18463 real_ldexp (&TWO31r, &dconst1, 31);
18464 two31r = const_double_from_real_value (TWO31r, scalarmode);
18465 two31r = ix86_build_const_vector (mode, 1, two31r);
18466 two31r = force_reg (mode, two31r);
18467 switch (mode)
18469 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
18470 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
18471 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
18472 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
18473 default: gcc_unreachable ();
18475 tmp[3] = gen_rtx_LE (mode, two31r, val);
18476 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
18477 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
18478 0, OPTAB_DIRECT);
18479 if (intmode == V4SImode || TARGET_AVX2)
18480 *xorp = expand_simple_binop (intmode, ASHIFT,
18481 gen_lowpart (intmode, tmp[0]),
18482 GEN_INT (31), NULL_RTX, 0,
18483 OPTAB_DIRECT);
18484 else
18486 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
18487 two31 = ix86_build_const_vector (intmode, 1, two31);
18488 *xorp = expand_simple_binop (intmode, AND,
18489 gen_lowpart (intmode, tmp[0]),
18490 two31, NULL_RTX, 0,
18491 OPTAB_DIRECT);
18493 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
18494 0, OPTAB_DIRECT);
18497 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
18498 then replicate the value for all elements of the vector
18499 register. */
18502 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
18504 int i, n_elt;
18505 rtvec v;
18506 enum machine_mode scalar_mode;
18508 switch (mode)
18510 case V32QImode:
18511 case V16QImode:
18512 case V16HImode:
18513 case V8HImode:
18514 case V8SImode:
18515 case V4SImode:
18516 case V4DImode:
18517 case V2DImode:
18518 gcc_assert (vect);
18519 case V8SFmode:
18520 case V4SFmode:
18521 case V4DFmode:
18522 case V2DFmode:
18523 n_elt = GET_MODE_NUNITS (mode);
18524 v = rtvec_alloc (n_elt);
18525 scalar_mode = GET_MODE_INNER (mode);
18527 RTVEC_ELT (v, 0) = value;
18529 for (i = 1; i < n_elt; ++i)
18530 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
18532 return gen_rtx_CONST_VECTOR (mode, v);
18534 default:
18535 gcc_unreachable ();
18539 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
18540 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
18541 for an SSE register. If VECT is true, then replicate the mask for
18542 all elements of the vector register. If INVERT is true, then create
18543 a mask excluding the sign bit. */
18546 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
18548 enum machine_mode vec_mode, imode;
18549 HOST_WIDE_INT hi, lo;
18550 int shift = 63;
18551 rtx v;
18552 rtx mask;
18554 /* Find the sign bit, sign extended to 2*HWI. */
18555 switch (mode)
18557 case V8SImode:
18558 case V4SImode:
18559 case V8SFmode:
18560 case V4SFmode:
18561 vec_mode = mode;
18562 mode = GET_MODE_INNER (mode);
18563 imode = SImode;
18564 lo = 0x80000000, hi = lo < 0;
18565 break;
18567 case V4DImode:
18568 case V2DImode:
18569 case V4DFmode:
18570 case V2DFmode:
18571 vec_mode = mode;
18572 mode = GET_MODE_INNER (mode);
18573 imode = DImode;
18574 if (HOST_BITS_PER_WIDE_INT >= 64)
18575 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
18576 else
18577 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18578 break;
18580 case TImode:
18581 case TFmode:
18582 vec_mode = VOIDmode;
18583 if (HOST_BITS_PER_WIDE_INT >= 64)
18585 imode = TImode;
18586 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
18588 else
18590 rtvec vec;
18592 imode = DImode;
18593 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18595 if (invert)
18597 lo = ~lo, hi = ~hi;
18598 v = constm1_rtx;
18600 else
18601 v = const0_rtx;
18603 mask = immed_double_const (lo, hi, imode);
18605 vec = gen_rtvec (2, v, mask);
18606 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
18607 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
18609 return v;
18611 break;
18613 default:
18614 gcc_unreachable ();
18617 if (invert)
18618 lo = ~lo, hi = ~hi;
18620 /* Force this value into the low part of a fp vector constant. */
18621 mask = immed_double_const (lo, hi, imode);
18622 mask = gen_lowpart (mode, mask);
18624 if (vec_mode == VOIDmode)
18625 return force_reg (mode, mask);
18627 v = ix86_build_const_vector (vec_mode, vect, mask);
18628 return force_reg (vec_mode, v);
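/* Illustrative examples of the masks built above: for V4SFmode with
   VECT true and INVERT false the result is the vector constant
   { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }, i.e. the sign bit
   of every SFmode element; with INVERT true each element is 0x7fffffff
   instead, i.e. everything except the sign bit.  */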
18631 /* Generate code for floating point ABS or NEG. */
18633 void
18634 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
18635 rtx operands[])
18637 rtx mask, set, dst, src;
18638 bool use_sse = false;
18639 bool vector_mode = VECTOR_MODE_P (mode);
18640 enum machine_mode vmode = mode;
18642 if (vector_mode)
18643 use_sse = true;
18644 else if (mode == TFmode)
18645 use_sse = true;
18646 else if (TARGET_SSE_MATH)
18648 use_sse = SSE_FLOAT_MODE_P (mode);
18649 if (mode == SFmode)
18650 vmode = V4SFmode;
18651 else if (mode == DFmode)
18652 vmode = V2DFmode;
18655 /* NEG and ABS performed with SSE use bitwise mask operations.
18656 Create the appropriate mask now. */
18657 if (use_sse)
18658 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
18659 else
18660 mask = NULL_RTX;
18662 dst = operands[0];
18663 src = operands[1];
18665 set = gen_rtx_fmt_e (code, mode, src);
18666 set = gen_rtx_SET (VOIDmode, dst, set);
18668 if (mask)
18670 rtx use, clob;
18671 rtvec par;
18673 use = gen_rtx_USE (VOIDmode, mask);
18674 if (vector_mode)
18675 par = gen_rtvec (2, set, use);
18676 else
18678 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
18679 par = gen_rtvec (3, set, use, clob);
18681 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
18683 else
18684 emit_insn (set);
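/* As a rough sketch of the final output (illustrative): after the
   splitters run, an SSE DFmode negation typically becomes an xorpd with
   the sign-bit mask built above, and an absolute value becomes an andpd
   with the inverted mask, instead of the x87 fchs/fabs instructions.  */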
18687 /* Expand a copysign operation. Special case operand 0 being a constant. */
18689 void
18690 ix86_expand_copysign (rtx operands[])
18692 enum machine_mode mode, vmode;
18693 rtx dest, op0, op1, mask, nmask;
18695 dest = operands[0];
18696 op0 = operands[1];
18697 op1 = operands[2];
18699 mode = GET_MODE (dest);
18701 if (mode == SFmode)
18702 vmode = V4SFmode;
18703 else if (mode == DFmode)
18704 vmode = V2DFmode;
18705 else
18706 vmode = mode;
18708 if (GET_CODE (op0) == CONST_DOUBLE)
18710 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
18712 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
18713 op0 = simplify_unary_operation (ABS, mode, op0, mode);
18715 if (mode == SFmode || mode == DFmode)
18717 if (op0 == CONST0_RTX (mode))
18718 op0 = CONST0_RTX (vmode);
18719 else
18721 rtx v = ix86_build_const_vector (vmode, false, op0);
18723 op0 = force_reg (vmode, v);
18726 else if (op0 != CONST0_RTX (mode))
18727 op0 = force_reg (mode, op0);
18729 mask = ix86_build_signbit_mask (vmode, 0, 0);
18731 if (mode == SFmode)
18732 copysign_insn = gen_copysignsf3_const;
18733 else if (mode == DFmode)
18734 copysign_insn = gen_copysigndf3_const;
18735 else
18736 copysign_insn = gen_copysigntf3_const;
18738 emit_insn (copysign_insn (dest, op0, op1, mask));
18740 else
18742 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
18744 nmask = ix86_build_signbit_mask (vmode, 0, 1);
18745 mask = ix86_build_signbit_mask (vmode, 0, 0);
18747 if (mode == SFmode)
18748 copysign_insn = gen_copysignsf3_var;
18749 else if (mode == DFmode)
18750 copysign_insn = gen_copysigndf3_var;
18751 else
18752 copysign_insn = gen_copysigntf3_var;
18754 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
18758 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
18759 be a constant, and so has already been expanded into a vector constant. */
18761 void
18762 ix86_split_copysign_const (rtx operands[])
18764 enum machine_mode mode, vmode;
18765 rtx dest, op0, mask, x;
18767 dest = operands[0];
18768 op0 = operands[1];
18769 mask = operands[3];
18771 mode = GET_MODE (dest);
18772 vmode = GET_MODE (mask);
18774 dest = simplify_gen_subreg (vmode, dest, mode, 0);
18775 x = gen_rtx_AND (vmode, dest, mask);
18776 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18778 if (op0 != CONST0_RTX (vmode))
18780 x = gen_rtx_IOR (vmode, dest, op0);
18781 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18785 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
18786 so we have to do two masks. */
18788 void
18789 ix86_split_copysign_var (rtx operands[])
18791 enum machine_mode mode, vmode;
18792 rtx dest, scratch, op0, op1, mask, nmask, x;
18794 dest = operands[0];
18795 scratch = operands[1];
18796 op0 = operands[2];
18797 op1 = operands[3];
18798 nmask = operands[4];
18799 mask = operands[5];
18801 mode = GET_MODE (dest);
18802 vmode = GET_MODE (mask);
18804 if (rtx_equal_p (op0, op1))
18806 /* Shouldn't happen often (it's useless, obviously), but when it does
18807 we'd generate incorrect code if we continue below. */
18808 emit_move_insn (dest, op0);
18809 return;
18812 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
18814 gcc_assert (REGNO (op1) == REGNO (scratch));
18816 x = gen_rtx_AND (vmode, scratch, mask);
18817 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18819 dest = mask;
18820 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18821 x = gen_rtx_NOT (vmode, dest);
18822 x = gen_rtx_AND (vmode, x, op0);
18823 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18825 else
18827 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
18829 x = gen_rtx_AND (vmode, scratch, mask);
18831 else /* alternative 2,4 */
18833 gcc_assert (REGNO (mask) == REGNO (scratch));
18834 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
18835 x = gen_rtx_AND (vmode, scratch, op1);
18837 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18839 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
18841 dest = simplify_gen_subreg (vmode, op0, mode, 0);
18842 x = gen_rtx_AND (vmode, dest, nmask);
18844 else /* alternative 3,4 */
18846 gcc_assert (REGNO (nmask) == REGNO (dest));
18847 dest = nmask;
18848 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18849 x = gen_rtx_AND (vmode, dest, op0);
18851 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18854 x = gen_rtx_IOR (vmode, dest, scratch);
18855 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
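   /* Whichever alternative is taken, the emitted RTL boils down to
      DEST = (OP0 & NMASK) | (OP1 & MASK), i.e. the magnitude bits of OP0
      combined with the sign bit of OP1; the alternatives only differ in
      which register ends up holding each intermediate value.  */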
18858 /* Return TRUE or FALSE depending on whether the first SET in INSN
 18859    has source and destination with matching CC modes and whether the
 18860    CC mode is at least as constrained as REQ_MODE.  */
18862 bool
18863 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
18865 rtx set;
18866 enum machine_mode set_mode;
18868 set = PATTERN (insn);
18869 if (GET_CODE (set) == PARALLEL)
18870 set = XVECEXP (set, 0, 0);
18871 gcc_assert (GET_CODE (set) == SET);
18872 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
18874 set_mode = GET_MODE (SET_DEST (set));
18875 switch (set_mode)
18877 case CCNOmode:
18878 if (req_mode != CCNOmode
18879 && (req_mode != CCmode
18880 || XEXP (SET_SRC (set), 1) != const0_rtx))
18881 return false;
18882 break;
18883 case CCmode:
18884 if (req_mode == CCGCmode)
18885 return false;
18886 /* FALLTHRU */
18887 case CCGCmode:
18888 if (req_mode == CCGOCmode || req_mode == CCNOmode)
18889 return false;
18890 /* FALLTHRU */
18891 case CCGOCmode:
18892 if (req_mode == CCZmode)
18893 return false;
18894 /* FALLTHRU */
18895 case CCZmode:
18896 break;
18898 case CCAmode:
18899 case CCCmode:
18900 case CCOmode:
18901 case CCSmode:
18902 if (set_mode != req_mode)
18903 return false;
18904 break;
18906 default:
18907 gcc_unreachable ();
18910 return GET_MODE (SET_SRC (set)) == set_mode;
18913 /* Generate insn patterns to do an integer compare of OPERANDS. */
18915 static rtx
18916 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
18918 enum machine_mode cmpmode;
18919 rtx tmp, flags;
18921 cmpmode = SELECT_CC_MODE (code, op0, op1);
18922 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
18924 /* This is very simple, but making the interface the same as in the
18925 FP case makes the rest of the code easier. */
18926 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
18927 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
18929 /* Return the test that should be put into the flags user, i.e.
18930 the bcc, scc, or cmov instruction. */
18931 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
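   /* For example, for (code, op0, op1) = (EQ, reg, const0_rtx) this emits a
      compare that sets the flags register in CCZmode and returns roughly
      (eq (reg:CCZ flags) (const_int 0)) for the flags consumer to use.  */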
18934 /* Figure out whether to use ordered or unordered fp comparisons.
18935 Return the appropriate mode to use. */
18937 enum machine_mode
18938 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
18940 /* ??? In order to make all comparisons reversible, we do all comparisons
18941 non-trapping when compiling for IEEE. Once gcc is able to distinguish
 18942    between all forms of trapping and nontrapping comparisons, we can make inequality
18943 comparisons trapping again, since it results in better code when using
18944 FCOM based compares. */
18945 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
18948 enum machine_mode
18949 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
18951 enum machine_mode mode = GET_MODE (op0);
18953 if (SCALAR_FLOAT_MODE_P (mode))
18955 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18956 return ix86_fp_compare_mode (code);
18959 switch (code)
18961 /* Only zero flag is needed. */
18962 case EQ: /* ZF=0 */
18963 case NE: /* ZF!=0 */
18964 return CCZmode;
18965 /* Codes needing carry flag. */
18966 case GEU: /* CF=0 */
18967 case LTU: /* CF=1 */
18968 /* Detect overflow checks. They need just the carry flag. */
18969 if (GET_CODE (op0) == PLUS
18970 && rtx_equal_p (op1, XEXP (op0, 0)))
18971 return CCCmode;
18972 else
18973 return CCmode;
18974 case GTU: /* CF=0 & ZF=0 */
18975 case LEU: /* CF=1 | ZF=1 */
18976 return CCmode;
18977 /* Codes possibly doable only with sign flag when
18978 comparing against zero. */
18979 case GE: /* SF=OF or SF=0 */
18980 case LT: /* SF<>OF or SF=1 */
18981 if (op1 == const0_rtx)
18982 return CCGOCmode;
18983 else
18984 /* For other cases Carry flag is not required. */
18985 return CCGCmode;
18986 /* Codes doable only with sign flag when comparing
 18987       against zero, but for which we lack a jump instruction,
 18988       so we need to use relational tests against overflow,
 18989       which thus needs to be zero.  */
18990 case GT: /* ZF=0 & SF=OF */
18991 case LE: /* ZF=1 | SF<>OF */
18992 if (op1 == const0_rtx)
18993 return CCNOmode;
18994 else
18995 return CCGCmode;
 18996       /* The strcmp pattern does (use flags), and combine may ask us for the
 18997 	 proper mode.  */
18998 case USE:
18999 return CCmode;
19000 default:
19001 gcc_unreachable ();
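 /* A few concrete mappings implied by the switch above: EQ/NE always get
    CCZmode, GE/LT against zero get CCGOCmode, GT/LE against zero get
    CCNOmode, and an unsigned GEU/LTU whose first operand is a PLUS whose
    first addend equals the other comparison operand is recognized as an
    overflow check and gets CCCmode.  */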
19005 /* Return the fixed registers used for condition codes. */
19007 static bool
19008 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
19010 *p1 = FLAGS_REG;
19011 *p2 = FPSR_REG;
19012 return true;
19015 /* If two condition code modes are compatible, return a condition code
19016 mode which is compatible with both. Otherwise, return
19017 VOIDmode. */
19019 static enum machine_mode
19020 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
19022 if (m1 == m2)
19023 return m1;
19025 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
19026 return VOIDmode;
19028 if ((m1 == CCGCmode && m2 == CCGOCmode)
19029 || (m1 == CCGOCmode && m2 == CCGCmode))
19030 return CCGCmode;
19032 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
19033 return m2;
19034 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
19035 return m1;
19037 switch (m1)
19039 default:
19040 gcc_unreachable ();
19042 case CCmode:
19043 case CCGCmode:
19044 case CCGOCmode:
19045 case CCNOmode:
19046 case CCAmode:
19047 case CCCmode:
19048 case CCOmode:
19049 case CCSmode:
19050 case CCZmode:
19051 switch (m2)
19053 default:
19054 return VOIDmode;
19056 case CCmode:
19057 case CCGCmode:
19058 case CCGOCmode:
19059 case CCNOmode:
19060 case CCAmode:
19061 case CCCmode:
19062 case CCOmode:
19063 case CCSmode:
19064 case CCZmode:
19065 return CCmode;
19068 case CCFPmode:
19069 case CCFPUmode:
19070 /* These are only compatible with themselves, which we already
19071 checked above. */
19072 return VOIDmode;
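 /* For instance, CCZmode combined with CCGCmode yields CCGCmode, CCGCmode
    and CCGOCmode combine to CCGCmode, any other pair of distinct integer
    CC modes falls back to plain CCmode, and the FP modes are compatible
    only with themselves.  */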
19077 /* Return a comparison we can do and that it is equivalent to
19078 swap_condition (code) apart possibly from orderedness.
19079 But, never change orderedness if TARGET_IEEE_FP, returning
19080 UNKNOWN in that case if necessary. */
19082 static enum rtx_code
19083 ix86_fp_swap_condition (enum rtx_code code)
19085 switch (code)
19087 case GT: /* GTU - CF=0 & ZF=0 */
19088 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
19089 case GE: /* GEU - CF=0 */
19090 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
19091 case UNLT: /* LTU - CF=1 */
19092 return TARGET_IEEE_FP ? UNKNOWN : GT;
19093 case UNLE: /* LEU - CF=1 | ZF=1 */
19094 return TARGET_IEEE_FP ? UNKNOWN : GE;
19095 default:
19096 return swap_condition (code);
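 /* E.g. ix86_fp_swap_condition (GT) yields UNLT when !TARGET_IEEE_FP: it
    differs from the exact swapped condition only in how unordered operands
    are treated, which is the "apart possibly from orderedness" caveat above;
    with TARGET_IEEE_FP that difference is not acceptable, so UNKNOWN is
    returned instead.  */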
19100 /* Return cost of comparison CODE using the best strategy for performance.
 19101    All of the following functions use the number of instructions as a cost metric.
 19102    In the future this should be tweaked to compute bytes for optimize_size and
19103 take into account performance of various instructions on various CPUs. */
19105 static int
19106 ix86_fp_comparison_cost (enum rtx_code code)
19108 int arith_cost;
19110 /* The cost of code using bit-twiddling on %ah. */
19111 switch (code)
19113 case UNLE:
19114 case UNLT:
19115 case LTGT:
19116 case GT:
19117 case GE:
19118 case UNORDERED:
19119 case ORDERED:
19120 case UNEQ:
19121 arith_cost = 4;
19122 break;
19123 case LT:
19124 case NE:
19125 case EQ:
19126 case UNGE:
19127 arith_cost = TARGET_IEEE_FP ? 5 : 4;
19128 break;
19129 case LE:
19130 case UNGT:
19131 arith_cost = TARGET_IEEE_FP ? 6 : 4;
19132 break;
19133 default:
19134 gcc_unreachable ();
19137 switch (ix86_fp_comparison_strategy (code))
19139 case IX86_FPCMP_COMI:
19140 return arith_cost > 4 ? 3 : 2;
19141 case IX86_FPCMP_SAHF:
19142 return arith_cost > 4 ? 4 : 3;
19143 default:
19144 return arith_cost;
 19148 /* Return the strategy to use for floating-point comparisons.  We assume that fcomi
 19149    is always preferable where available, since that is also true when looking at size
19150 (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
19152 enum ix86_fpcmp_strategy
19153 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
19155 /* Do fcomi/sahf based test when profitable. */
19157 if (TARGET_CMOVE)
19158 return IX86_FPCMP_COMI;
19160 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
19161 return IX86_FPCMP_SAHF;
19163 return IX86_FPCMP_ARITH;
19166 /* Swap, force into registers, or otherwise massage the two operands
 19167    to an fp comparison.  The operands are updated in place; the new
19168 comparison code is returned. */
19170 static enum rtx_code
19171 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
19173 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
19174 rtx op0 = *pop0, op1 = *pop1;
19175 enum machine_mode op_mode = GET_MODE (op0);
19176 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
19178 /* All of the unordered compare instructions only work on registers.
19179 The same is true of the fcomi compare instructions. The XFmode
19180 compare instructions require registers except when comparing
19181 against zero or when converting operand 1 from fixed point to
19182 floating point. */
19184 if (!is_sse
19185 && (fpcmp_mode == CCFPUmode
19186 || (op_mode == XFmode
19187 && ! (standard_80387_constant_p (op0) == 1
19188 || standard_80387_constant_p (op1) == 1)
19189 && GET_CODE (op1) != FLOAT)
19190 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
19192 op0 = force_reg (op_mode, op0);
19193 op1 = force_reg (op_mode, op1);
19195 else
19197 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
19198 things around if they appear profitable, otherwise force op0
19199 into a register. */
19201 if (standard_80387_constant_p (op0) == 0
19202 || (MEM_P (op0)
19203 && ! (standard_80387_constant_p (op1) == 0
19204 || MEM_P (op1))))
19206 enum rtx_code new_code = ix86_fp_swap_condition (code);
19207 if (new_code != UNKNOWN)
19209 rtx tmp;
19210 tmp = op0, op0 = op1, op1 = tmp;
19211 code = new_code;
19215 if (!REG_P (op0))
19216 op0 = force_reg (op_mode, op0);
19218 if (CONSTANT_P (op1))
19220 int tmp = standard_80387_constant_p (op1);
19221 if (tmp == 0)
19222 op1 = validize_mem (force_const_mem (op_mode, op1));
19223 else if (tmp == 1)
19225 if (TARGET_CMOVE)
19226 op1 = force_reg (op_mode, op1);
19228 else
19229 op1 = force_reg (op_mode, op1);
19233 /* Try to rearrange the comparison to make it cheaper. */
19234 if (ix86_fp_comparison_cost (code)
19235 > ix86_fp_comparison_cost (swap_condition (code))
19236 && (REG_P (op1) || can_create_pseudo_p ()))
19238 rtx tmp;
19239 tmp = op0, op0 = op1, op1 = tmp;
19240 code = swap_condition (code);
19241 if (!REG_P (op0))
19242 op0 = force_reg (op_mode, op0);
19245 *pop0 = op0;
19246 *pop1 = op1;
19247 return code;
19250 /* Convert comparison codes we use to represent FP comparison to integer
19251 code that will result in proper branch. Return UNKNOWN if no such code
19252 is available. */
19254 enum rtx_code
19255 ix86_fp_compare_code_to_integer (enum rtx_code code)
19257 switch (code)
19259 case GT:
19260 return GTU;
19261 case GE:
19262 return GEU;
19263 case ORDERED:
19264 case UNORDERED:
19265 return code;
19266 break;
19267 case UNEQ:
19268 return EQ;
19269 break;
19270 case UNLT:
19271 return LTU;
19272 break;
19273 case UNLE:
19274 return LEU;
19275 break;
19276 case LTGT:
19277 return NE;
19278 break;
19279 default:
19280 return UNKNOWN;
19284 /* Generate insn patterns to do a floating point compare of OPERANDS. */
19286 static rtx
19287 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
19289 enum machine_mode fpcmp_mode, intcmp_mode;
19290 rtx tmp, tmp2;
19292 fpcmp_mode = ix86_fp_compare_mode (code);
19293 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
19295 /* Do fcomi/sahf based test when profitable. */
19296 switch (ix86_fp_comparison_strategy (code))
19298 case IX86_FPCMP_COMI:
19299 intcmp_mode = fpcmp_mode;
19300 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19301 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
19302 tmp);
19303 emit_insn (tmp);
19304 break;
19306 case IX86_FPCMP_SAHF:
19307 intcmp_mode = fpcmp_mode;
19308 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19309 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
19310 tmp);
19312 if (!scratch)
19313 scratch = gen_reg_rtx (HImode);
19314 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
19315 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
19316 break;
19318 case IX86_FPCMP_ARITH:
19319 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
19320 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19321 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
19322 if (!scratch)
19323 scratch = gen_reg_rtx (HImode);
19324 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
19326 /* In the unordered case, we have to check C2 for NaN's, which
19327 doesn't happen to work out to anything nice combination-wise.
19328 So do some bit twiddling on the value we've got in AH to come
19329 up with an appropriate set of condition codes. */
19331 intcmp_mode = CCNOmode;
19332 switch (code)
19334 case GT:
19335 case UNGT:
19336 if (code == GT || !TARGET_IEEE_FP)
19338 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
19339 code = EQ;
19341 else
19343 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19344 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
19345 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
19346 intcmp_mode = CCmode;
19347 code = GEU;
19349 break;
19350 case LT:
19351 case UNLT:
19352 if (code == LT && TARGET_IEEE_FP)
19354 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19355 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
19356 intcmp_mode = CCmode;
19357 code = EQ;
19359 else
19361 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
19362 code = NE;
19364 break;
19365 case GE:
19366 case UNGE:
19367 if (code == GE || !TARGET_IEEE_FP)
19369 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
19370 code = EQ;
19372 else
19374 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19375 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
19376 code = NE;
19378 break;
19379 case LE:
19380 case UNLE:
19381 if (code == LE && TARGET_IEEE_FP)
19383 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19384 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
19385 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
19386 intcmp_mode = CCmode;
19387 code = LTU;
19389 else
19391 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
19392 code = NE;
19394 break;
19395 case EQ:
19396 case UNEQ:
19397 if (code == EQ && TARGET_IEEE_FP)
19399 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19400 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
19401 intcmp_mode = CCmode;
19402 code = EQ;
19404 else
19406 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
19407 code = NE;
19409 break;
19410 case NE:
19411 case LTGT:
19412 if (code == NE && TARGET_IEEE_FP)
19414 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19415 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
19416 GEN_INT (0x40)));
19417 code = NE;
19419 else
19421 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
19422 code = EQ;
19424 break;
19426 case UNORDERED:
19427 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
19428 code = NE;
19429 break;
19430 case ORDERED:
19431 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
19432 code = EQ;
19433 break;
19435 default:
19436 gcc_unreachable ();
19438 break;
19440 default:
19441 gcc_unreachable();
19444 /* Return the test that should be put into the flags user, i.e.
19445 the bcc, scc, or cmov instruction. */
19446 return gen_rtx_fmt_ee (code, VOIDmode,
19447 gen_rtx_REG (intcmp_mode, FLAGS_REG),
19448 const0_rtx);
19451 static rtx
19452 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
19454 rtx ret;
19456 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
19457 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
19459 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
19461 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
19462 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19464 else
19465 ret = ix86_expand_int_compare (code, op0, op1);
19467 return ret;
19470 void
19471 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
19473 enum machine_mode mode = GET_MODE (op0);
19474 rtx tmp;
19476 switch (mode)
19478 case SFmode:
19479 case DFmode:
19480 case XFmode:
19481 case QImode:
19482 case HImode:
19483 case SImode:
19484 simple:
19485 tmp = ix86_expand_compare (code, op0, op1);
19486 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
19487 gen_rtx_LABEL_REF (VOIDmode, label),
19488 pc_rtx);
19489 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
19490 return;
19492 case DImode:
19493 if (TARGET_64BIT)
19494 goto simple;
19495 case TImode:
 19496       /* Expand double-word branch into multiple compare+branch.  */
19498 rtx lo[2], hi[2], label2;
19499 enum rtx_code code1, code2, code3;
19500 enum machine_mode submode;
19502 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
19504 tmp = op0, op0 = op1, op1 = tmp;
19505 code = swap_condition (code);
19508 split_double_mode (mode, &op0, 1, lo+0, hi+0);
19509 split_double_mode (mode, &op1, 1, lo+1, hi+1);
19511 submode = mode == DImode ? SImode : DImode;
19513 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
19514 avoid two branches. This costs one extra insn, so disable when
19515 optimizing for size. */
19517 if ((code == EQ || code == NE)
19518 && (!optimize_insn_for_size_p ()
19519 || hi[1] == const0_rtx || lo[1] == const0_rtx))
19521 rtx xor0, xor1;
19523 xor1 = hi[0];
19524 if (hi[1] != const0_rtx)
19525 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
19526 NULL_RTX, 0, OPTAB_WIDEN);
19528 xor0 = lo[0];
19529 if (lo[1] != const0_rtx)
19530 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
19531 NULL_RTX, 0, OPTAB_WIDEN);
19533 tmp = expand_binop (submode, ior_optab, xor1, xor0,
19534 NULL_RTX, 0, OPTAB_WIDEN);
19536 ix86_expand_branch (code, tmp, const0_rtx, label);
19537 return;
19540 /* Otherwise, if we are doing less-than or greater-or-equal-than,
19541 op1 is a constant and the low word is zero, then we can just
19542 examine the high word. Similarly for low word -1 and
19543 less-or-equal-than or greater-than. */
19545 if (CONST_INT_P (hi[1]))
19546 switch (code)
19548 case LT: case LTU: case GE: case GEU:
19549 if (lo[1] == const0_rtx)
19551 ix86_expand_branch (code, hi[0], hi[1], label);
19552 return;
19554 break;
19555 case LE: case LEU: case GT: case GTU:
19556 if (lo[1] == constm1_rtx)
19558 ix86_expand_branch (code, hi[0], hi[1], label);
19559 return;
19561 break;
19562 default:
19563 break;
19566 /* Otherwise, we need two or three jumps. */
19568 label2 = gen_label_rtx ();
19570 code1 = code;
19571 code2 = swap_condition (code);
19572 code3 = unsigned_condition (code);
19574 switch (code)
19576 case LT: case GT: case LTU: case GTU:
19577 break;
19579 case LE: code1 = LT; code2 = GT; break;
19580 case GE: code1 = GT; code2 = LT; break;
19581 case LEU: code1 = LTU; code2 = GTU; break;
19582 case GEU: code1 = GTU; code2 = LTU; break;
19584 case EQ: code1 = UNKNOWN; code2 = NE; break;
19585 case NE: code2 = UNKNOWN; break;
19587 default:
19588 gcc_unreachable ();
19592 * a < b =>
19593 * if (hi(a) < hi(b)) goto true;
19594 * if (hi(a) > hi(b)) goto false;
19595 * if (lo(a) < lo(b)) goto true;
19596 * false:
19599 if (code1 != UNKNOWN)
19600 ix86_expand_branch (code1, hi[0], hi[1], label);
19601 if (code2 != UNKNOWN)
19602 ix86_expand_branch (code2, hi[0], hi[1], label2);
19604 ix86_expand_branch (code3, lo[0], lo[1], label);
19606 if (code2 != UNKNOWN)
19607 emit_label (label2);
19608 return;
19611 default:
19612 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
19613 goto simple;
19617 /* Split branch based on floating point condition. */
19618 void
19619 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
19620 rtx target1, rtx target2, rtx tmp, rtx pushed)
19622 rtx condition;
19623 rtx i;
19625 if (target2 != pc_rtx)
19627 rtx tmp = target2;
19628 code = reverse_condition_maybe_unordered (code);
19629 target2 = target1;
19630 target1 = tmp;
19633 condition = ix86_expand_fp_compare (code, op1, op2,
19634 tmp);
19636 /* Remove pushed operand from stack. */
19637 if (pushed)
19638 ix86_free_from_memory (GET_MODE (pushed));
19640 i = emit_jump_insn (gen_rtx_SET
19641 (VOIDmode, pc_rtx,
19642 gen_rtx_IF_THEN_ELSE (VOIDmode,
19643 condition, target1, target2)));
19644 if (split_branch_probability >= 0)
19645 add_int_reg_note (i, REG_BR_PROB, split_branch_probability);
19648 void
19649 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
19651 rtx ret;
19653 gcc_assert (GET_MODE (dest) == QImode);
19655 ret = ix86_expand_compare (code, op0, op1);
19656 PUT_MODE (ret, QImode);
19657 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
19660 /* Expand comparison setting or clearing carry flag. Return true when
19661 successful and set pop for the operation. */
19662 static bool
19663 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
19665 enum machine_mode mode =
19666 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
 19668   /* Do not handle double-mode compares that go through the special path.  */
19669 if (mode == (TARGET_64BIT ? TImode : DImode))
19670 return false;
19672 if (SCALAR_FLOAT_MODE_P (mode))
19674 rtx compare_op, compare_seq;
19676 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
 19678       /* Shortcut:  the following common codes never translate
19679 into carry flag compares. */
19680 if (code == EQ || code == NE || code == UNEQ || code == LTGT
19681 || code == ORDERED || code == UNORDERED)
19682 return false;
 19684       /* These comparisons require the zero flag; swap operands so they won't.  */
19685 if ((code == GT || code == UNLE || code == LE || code == UNGT)
19686 && !TARGET_IEEE_FP)
19688 rtx tmp = op0;
19689 op0 = op1;
19690 op1 = tmp;
19691 code = swap_condition (code);
19694 /* Try to expand the comparison and verify that we end up with
 19695 	 a carry flag based comparison.  This fails to be true only when
 19696 	 we decide to expand the comparison using arithmetic, which is not
 19697 	 a common scenario.  */
19698 start_sequence ();
19699 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19700 compare_seq = get_insns ();
19701 end_sequence ();
19703 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
19704 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
19705 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
19706 else
19707 code = GET_CODE (compare_op);
19709 if (code != LTU && code != GEU)
19710 return false;
19712 emit_insn (compare_seq);
19713 *pop = compare_op;
19714 return true;
19717 if (!INTEGRAL_MODE_P (mode))
19718 return false;
19720 switch (code)
19722 case LTU:
19723 case GEU:
19724 break;
19726 /* Convert a==0 into (unsigned)a<1. */
19727 case EQ:
19728 case NE:
19729 if (op1 != const0_rtx)
19730 return false;
19731 op1 = const1_rtx;
19732 code = (code == EQ ? LTU : GEU);
19733 break;
 19735     /* Convert a>b into b<a or a>=b+1.  */
19736 case GTU:
19737 case LEU:
19738 if (CONST_INT_P (op1))
19740 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
19741 /* Bail out on overflow. We still can swap operands but that
 19742 	     would force loading of the constant into a register.  */
19743 if (op1 == const0_rtx
19744 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
19745 return false;
19746 code = (code == GTU ? GEU : LTU);
19748 else
19750 rtx tmp = op1;
19751 op1 = op0;
19752 op0 = tmp;
19753 code = (code == GTU ? LTU : GEU);
19755 break;
19757 /* Convert a>=0 into (unsigned)a<0x80000000. */
19758 case LT:
19759 case GE:
19760 if (mode == DImode || op1 != const0_rtx)
19761 return false;
19762 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19763 code = (code == LT ? GEU : LTU);
19764 break;
19765 case LE:
19766 case GT:
19767 if (mode == DImode || op1 != constm1_rtx)
19768 return false;
19769 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19770 code = (code == LE ? GEU : LTU);
19771 break;
19773 default:
19774 return false;
 19776   /* Swapping operands may cause a constant to appear as the first operand.  */
19777 if (!nonimmediate_operand (op0, VOIDmode))
19779 if (!can_create_pseudo_p ())
19780 return false;
19781 op0 = force_reg (mode, op0);
19783 *pop = ix86_expand_compare (code, op0, op1);
19784 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
19785 return true;
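 /* On success *POP is an LTU or GEU test of the flags register (the assert
    above guarantees this); the caller, ix86_expand_int_movcc below, feeds it
    to the sbb-based sequences such as x86_movsicc_0_m1 that materialize
    -1/0 from the carry flag.  */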
19788 bool
19789 ix86_expand_int_movcc (rtx operands[])
19791 enum rtx_code code = GET_CODE (operands[1]), compare_code;
19792 rtx compare_seq, compare_op;
19793 enum machine_mode mode = GET_MODE (operands[0]);
19794 bool sign_bit_compare_p = false;
19795 rtx op0 = XEXP (operands[1], 0);
19796 rtx op1 = XEXP (operands[1], 1);
19798 if (GET_MODE (op0) == TImode
19799 || (GET_MODE (op0) == DImode
19800 && !TARGET_64BIT))
19801 return false;
19803 start_sequence ();
19804 compare_op = ix86_expand_compare (code, op0, op1);
19805 compare_seq = get_insns ();
19806 end_sequence ();
19808 compare_code = GET_CODE (compare_op);
19810 if ((op1 == const0_rtx && (code == GE || code == LT))
19811 || (op1 == constm1_rtx && (code == GT || code == LE)))
19812 sign_bit_compare_p = true;
19814 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
19815 HImode insns, we'd be swallowed in word prefix ops. */
19817 if ((mode != HImode || TARGET_FAST_PREFIX)
19818 && (mode != (TARGET_64BIT ? TImode : DImode))
19819 && CONST_INT_P (operands[2])
19820 && CONST_INT_P (operands[3]))
19822 rtx out = operands[0];
19823 HOST_WIDE_INT ct = INTVAL (operands[2]);
19824 HOST_WIDE_INT cf = INTVAL (operands[3]);
19825 HOST_WIDE_INT diff;
19827 diff = ct - cf;
 19828       /* Sign bit compares are better done using shifts than by using
19829 sbb. */
19830 if (sign_bit_compare_p
19831 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
19833 /* Detect overlap between destination and compare sources. */
19834 rtx tmp = out;
19836 if (!sign_bit_compare_p)
19838 rtx flags;
19839 bool fpcmp = false;
19841 compare_code = GET_CODE (compare_op);
19843 flags = XEXP (compare_op, 0);
19845 if (GET_MODE (flags) == CCFPmode
19846 || GET_MODE (flags) == CCFPUmode)
19848 fpcmp = true;
19849 compare_code
19850 = ix86_fp_compare_code_to_integer (compare_code);
 19853 	      /* To simplify the rest of the code, restrict to the GEU case.  */
19854 if (compare_code == LTU)
19856 HOST_WIDE_INT tmp = ct;
19857 ct = cf;
19858 cf = tmp;
19859 compare_code = reverse_condition (compare_code);
19860 code = reverse_condition (code);
19862 else
19864 if (fpcmp)
19865 PUT_CODE (compare_op,
19866 reverse_condition_maybe_unordered
19867 (GET_CODE (compare_op)));
19868 else
19869 PUT_CODE (compare_op,
19870 reverse_condition (GET_CODE (compare_op)));
19872 diff = ct - cf;
19874 if (reg_overlap_mentioned_p (out, op0)
19875 || reg_overlap_mentioned_p (out, op1))
19876 tmp = gen_reg_rtx (mode);
19878 if (mode == DImode)
19879 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
19880 else
19881 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
19882 flags, compare_op));
19884 else
19886 if (code == GT || code == GE)
19887 code = reverse_condition (code);
19888 else
19890 HOST_WIDE_INT tmp = ct;
19891 ct = cf;
19892 cf = tmp;
19893 diff = ct - cf;
19895 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
19898 if (diff == 1)
19901 * cmpl op0,op1
19902 * sbbl dest,dest
19903 * [addl dest, ct]
19905 * Size 5 - 8.
19907 if (ct)
19908 tmp = expand_simple_binop (mode, PLUS,
19909 tmp, GEN_INT (ct),
19910 copy_rtx (tmp), 1, OPTAB_DIRECT);
19912 else if (cf == -1)
19915 * cmpl op0,op1
19916 * sbbl dest,dest
19917 * orl $ct, dest
19919 * Size 8.
19921 tmp = expand_simple_binop (mode, IOR,
19922 tmp, GEN_INT (ct),
19923 copy_rtx (tmp), 1, OPTAB_DIRECT);
19925 else if (diff == -1 && ct)
19928 * cmpl op0,op1
19929 * sbbl dest,dest
19930 * notl dest
19931 * [addl dest, cf]
19933 * Size 8 - 11.
19935 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19936 if (cf)
19937 tmp = expand_simple_binop (mode, PLUS,
19938 copy_rtx (tmp), GEN_INT (cf),
19939 copy_rtx (tmp), 1, OPTAB_DIRECT);
19941 else
19944 * cmpl op0,op1
19945 * sbbl dest,dest
19946 * [notl dest]
19947 * andl cf - ct, dest
19948 * [addl dest, ct]
19950 * Size 8 - 11.
19953 if (cf == 0)
19955 cf = ct;
19956 ct = 0;
19957 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19960 tmp = expand_simple_binop (mode, AND,
19961 copy_rtx (tmp),
19962 gen_int_mode (cf - ct, mode),
19963 copy_rtx (tmp), 1, OPTAB_DIRECT);
19964 if (ct)
19965 tmp = expand_simple_binop (mode, PLUS,
19966 copy_rtx (tmp), GEN_INT (ct),
19967 copy_rtx (tmp), 1, OPTAB_DIRECT);
19970 if (!rtx_equal_p (tmp, out))
19971 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
19973 return true;
19976 if (diff < 0)
19978 enum machine_mode cmp_mode = GET_MODE (op0);
19980 HOST_WIDE_INT tmp;
19981 tmp = ct, ct = cf, cf = tmp;
19982 diff = -diff;
19984 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19986 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19988 /* We may be reversing unordered compare to normal compare, that
19989 is not valid in general (we may convert non-trapping condition
19990 to trapping one), however on i386 we currently emit all
19991 comparisons unordered. */
19992 compare_code = reverse_condition_maybe_unordered (compare_code);
19993 code = reverse_condition_maybe_unordered (code);
19995 else
19997 compare_code = reverse_condition (compare_code);
19998 code = reverse_condition (code);
20002 compare_code = UNKNOWN;
20003 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
20004 && CONST_INT_P (op1))
20006 if (op1 == const0_rtx
20007 && (code == LT || code == GE))
20008 compare_code = code;
20009 else if (op1 == constm1_rtx)
20011 if (code == LE)
20012 compare_code = LT;
20013 else if (code == GT)
20014 compare_code = GE;
20018 /* Optimize dest = (op0 < 0) ? -1 : cf. */
20019 if (compare_code != UNKNOWN
20020 && GET_MODE (op0) == GET_MODE (out)
20021 && (cf == -1 || ct == -1))
20023 /* If lea code below could be used, only optimize
20024 if it results in a 2 insn sequence. */
20026 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
20027 || diff == 3 || diff == 5 || diff == 9)
20028 || (compare_code == LT && ct == -1)
20029 || (compare_code == GE && cf == -1))
20032 * notl op1 (if necessary)
20033 * sarl $31, op1
20034 * orl cf, op1
20036 if (ct != -1)
20038 cf = ct;
20039 ct = -1;
20040 code = reverse_condition (code);
20043 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
20045 out = expand_simple_binop (mode, IOR,
20046 out, GEN_INT (cf),
20047 out, 1, OPTAB_DIRECT);
20048 if (out != operands[0])
20049 emit_move_insn (operands[0], out);
20051 return true;
20056 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
20057 || diff == 3 || diff == 5 || diff == 9)
20058 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
20059 && (mode != DImode
20060 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
20063 * xorl dest,dest
20064 * cmpl op1,op2
20065 * setcc dest
20066 * lea cf(dest*(ct-cf)),dest
20068 * Size 14.
20070 * This also catches the degenerate setcc-only case.
20073 rtx tmp;
20074 int nops;
20076 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
20078 nops = 0;
20079 /* On x86_64 the lea instruction operates on Pmode, so we need
 20080 	     to get the arithmetic done in the proper mode to match.  */
20081 if (diff == 1)
20082 tmp = copy_rtx (out);
20083 else
20085 rtx out1;
20086 out1 = copy_rtx (out);
20087 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
20088 nops++;
20089 if (diff & 1)
20091 tmp = gen_rtx_PLUS (mode, tmp, out1);
20092 nops++;
20095 if (cf != 0)
20097 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
20098 nops++;
20100 if (!rtx_equal_p (tmp, out))
20102 if (nops == 1)
20103 out = force_operand (tmp, copy_rtx (out));
20104 else
20105 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
20107 if (!rtx_equal_p (out, operands[0]))
20108 emit_move_insn (operands[0], copy_rtx (out));
20110 return true;
20114 * General case: Jumpful:
20115 * xorl dest,dest cmpl op1, op2
20116 * cmpl op1, op2 movl ct, dest
20117 * setcc dest jcc 1f
20118 * decl dest movl cf, dest
20119 * andl (cf-ct),dest 1:
20120 * addl ct,dest
20122 * Size 20. Size 14.
20124 * This is reasonably steep, but branch mispredict costs are
20125 * high on modern cpus, so consider failing only if optimizing
20126 * for space.
20129 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
20130 && BRANCH_COST (optimize_insn_for_speed_p (),
20131 false) >= 2)
20133 if (cf == 0)
20135 enum machine_mode cmp_mode = GET_MODE (op0);
20137 cf = ct;
20138 ct = 0;
20140 if (SCALAR_FLOAT_MODE_P (cmp_mode))
20142 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
20144 /* We may be reversing unordered compare to normal compare,
20145 that is not valid in general (we may convert non-trapping
20146 condition to trapping one), however on i386 we currently
20147 emit all comparisons unordered. */
20148 code = reverse_condition_maybe_unordered (code);
20150 else
20152 code = reverse_condition (code);
20153 if (compare_code != UNKNOWN)
20154 compare_code = reverse_condition (compare_code);
20158 if (compare_code != UNKNOWN)
20160 /* notl op1 (if needed)
20161 sarl $31, op1
20162 andl (cf-ct), op1
20163 addl ct, op1
20165 For x < 0 (resp. x <= -1) there will be no notl,
20166 so if possible swap the constants to get rid of the
20167 complement.
20168 True/false will be -1/0 while code below (store flag
20169 followed by decrement) is 0/-1, so the constants need
20170 to be exchanged once more. */
20172 if (compare_code == GE || !cf)
20174 code = reverse_condition (code);
20175 compare_code = LT;
20177 else
20179 HOST_WIDE_INT tmp = cf;
20180 cf = ct;
20181 ct = tmp;
20184 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
20186 else
20188 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
20190 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
20191 constm1_rtx,
20192 copy_rtx (out), 1, OPTAB_DIRECT);
20195 out = expand_simple_binop (mode, AND, copy_rtx (out),
20196 gen_int_mode (cf - ct, mode),
20197 copy_rtx (out), 1, OPTAB_DIRECT);
20198 if (ct)
20199 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
20200 copy_rtx (out), 1, OPTAB_DIRECT);
20201 if (!rtx_equal_p (out, operands[0]))
20202 emit_move_insn (operands[0], copy_rtx (out));
20204 return true;
20208 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
20210 /* Try a few things more with specific constants and a variable. */
20212 optab op;
20213 rtx var, orig_out, out, tmp;
20215 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
20216 return false;
20218 /* If one of the two operands is an interesting constant, load a
20219 constant with the above and mask it in with a logical operation. */
20221 if (CONST_INT_P (operands[2]))
20223 var = operands[3];
20224 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
20225 operands[3] = constm1_rtx, op = and_optab;
20226 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
20227 operands[3] = const0_rtx, op = ior_optab;
20228 else
20229 return false;
20231 else if (CONST_INT_P (operands[3]))
20233 var = operands[2];
20234 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
20235 operands[2] = constm1_rtx, op = and_optab;
 20236 	  else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
20237 operands[2] = const0_rtx, op = ior_optab;
20238 else
20239 return false;
20241 else
20242 return false;
20244 orig_out = operands[0];
20245 tmp = gen_reg_rtx (mode);
20246 operands[0] = tmp;
20248 /* Recurse to get the constant loaded. */
20249 if (ix86_expand_int_movcc (operands) == 0)
20250 return false;
20252 /* Mask in the interesting variable. */
20253 out = expand_binop (mode, op, var, tmp, orig_out, 0,
20254 OPTAB_WIDEN);
20255 if (!rtx_equal_p (out, orig_out))
20256 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
20258 return true;
20262 * For comparison with above,
20264 * movl cf,dest
20265 * movl ct,tmp
20266 * cmpl op1,op2
20267 * cmovcc tmp,dest
20269 * Size 15.
20272 if (! nonimmediate_operand (operands[2], mode))
20273 operands[2] = force_reg (mode, operands[2]);
20274 if (! nonimmediate_operand (operands[3], mode))
20275 operands[3] = force_reg (mode, operands[3]);
20277 if (! register_operand (operands[2], VOIDmode)
20278 && (mode == QImode
20279 || ! register_operand (operands[3], VOIDmode)))
20280 operands[2] = force_reg (mode, operands[2]);
20282 if (mode == QImode
20283 && ! register_operand (operands[3], VOIDmode))
20284 operands[3] = force_reg (mode, operands[3]);
20286 emit_insn (compare_seq);
20287 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20288 gen_rtx_IF_THEN_ELSE (mode,
20289 compare_op, operands[2],
20290 operands[3])));
20291 return true;
20294 /* Swap, force into registers, or otherwise massage the two operands
20295 to an sse comparison with a mask result. Thus we differ a bit from
20296 ix86_prepare_fp_compare_args which expects to produce a flags result.
20298 The DEST operand exists to help determine whether to commute commutative
20299 operators. The POP0/POP1 operands are updated in place. The new
20300 comparison code is returned, or UNKNOWN if not implementable. */
20302 static enum rtx_code
20303 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
20304 rtx *pop0, rtx *pop1)
20306 rtx tmp;
20308 switch (code)
20310 case LTGT:
20311 case UNEQ:
20312 /* AVX supports all the needed comparisons. */
20313 if (TARGET_AVX)
20314 break;
20315 /* We have no LTGT as an operator. We could implement it with
20316 NE & ORDERED, but this requires an extra temporary. It's
20317 not clear that it's worth it. */
20318 return UNKNOWN;
20320 case LT:
20321 case LE:
20322 case UNGT:
20323 case UNGE:
20324 /* These are supported directly. */
20325 break;
20327 case EQ:
20328 case NE:
20329 case UNORDERED:
20330 case ORDERED:
20331 /* AVX has 3 operand comparisons, no need to swap anything. */
20332 if (TARGET_AVX)
20333 break;
20334 /* For commutative operators, try to canonicalize the destination
20335 operand to be first in the comparison - this helps reload to
20336 avoid extra moves. */
20337 if (!dest || !rtx_equal_p (dest, *pop1))
20338 break;
20339 /* FALLTHRU */
20341 case GE:
20342 case GT:
20343 case UNLE:
20344 case UNLT:
20345 /* These are not supported directly before AVX, and furthermore
20346 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
20347 comparison operands to transform into something that is
20348 supported. */
20349 tmp = *pop0;
20350 *pop0 = *pop1;
20351 *pop1 = tmp;
20352 code = swap_condition (code);
20353 break;
20355 default:
20356 gcc_unreachable ();
20359 return code;
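 /* For example, before AVX a GT comparison is rewritten here as LT with the
    operands swapped, since the SSE compare instructions provide LT/LE and
    UNGT/UNGE (nle/nlt) predicates directly but not GT/GE; assemblers
    typically synthesize the latter by swapping operands as well.  */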
20362 /* Detect conditional moves that exactly match min/max operational
20363 semantics. Note that this is IEEE safe, as long as we don't
20364 interchange the operands.
20366 Returns FALSE if this conditional move doesn't match a MIN/MAX,
20367 and TRUE if the operation is successful and instructions are emitted. */
20369 static bool
20370 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
20371 rtx cmp_op1, rtx if_true, rtx if_false)
20373 enum machine_mode mode;
20374 bool is_min;
20375 rtx tmp;
20377 if (code == LT)
20379 else if (code == UNGE)
20381 tmp = if_true;
20382 if_true = if_false;
20383 if_false = tmp;
20385 else
20386 return false;
20388 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
20389 is_min = true;
20390 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
20391 is_min = false;
20392 else
20393 return false;
20395 mode = GET_MODE (dest);
20397 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
20398 but MODE may be a vector mode and thus not appropriate. */
20399 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
20401 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
20402 rtvec v;
20404 if_true = force_reg (mode, if_true);
20405 v = gen_rtvec (2, if_true, if_false);
20406 tmp = gen_rtx_UNSPEC (mode, v, u);
20408 else
20410 code = is_min ? SMIN : SMAX;
20411 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
20414 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
20415 return true;
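 /* When finite-math-only and unsafe-math-optimizations are not both in
    effect, the UNSPEC_IEEE_MIN/MAX form is used; unlike a plain SMIN/SMAX
    rtx it is opaque to the optimizers and thus never commuted, preserving
    the operand order that the "IEEE safe" remark above depends on.  */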
20418 /* Expand an sse vector comparison. Return the register with the result. */
20420 static rtx
20421 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
20422 rtx op_true, rtx op_false)
20424 enum machine_mode mode = GET_MODE (dest);
20425 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
20426 rtx x;
20428 cmp_op0 = force_reg (cmp_mode, cmp_op0);
20429 if (!nonimmediate_operand (cmp_op1, cmp_mode))
20430 cmp_op1 = force_reg (cmp_mode, cmp_op1);
20432 if (optimize
20433 || reg_overlap_mentioned_p (dest, op_true)
20434 || reg_overlap_mentioned_p (dest, op_false))
20435 dest = gen_reg_rtx (mode);
20437 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
20438 if (cmp_mode != mode)
20440 x = force_reg (cmp_mode, x);
20441 convert_move (dest, x, false);
20443 else
20444 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20446 return dest;
20449 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
20450 operations. This is used for both scalar and vector conditional moves. */
20452 static void
20453 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
20455 enum machine_mode mode = GET_MODE (dest);
20456 rtx t2, t3, x;
20458 if (vector_all_ones_operand (op_true, mode)
20459 && rtx_equal_p (op_false, CONST0_RTX (mode)))
20461 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
20463 else if (op_false == CONST0_RTX (mode))
20465 op_true = force_reg (mode, op_true);
20466 x = gen_rtx_AND (mode, cmp, op_true);
20467 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20469 else if (op_true == CONST0_RTX (mode))
20471 op_false = force_reg (mode, op_false);
20472 x = gen_rtx_NOT (mode, cmp);
20473 x = gen_rtx_AND (mode, x, op_false);
20474 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20476 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
20478 op_false = force_reg (mode, op_false);
20479 x = gen_rtx_IOR (mode, cmp, op_false);
20480 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20482 else if (TARGET_XOP)
20484 op_true = force_reg (mode, op_true);
20486 if (!nonimmediate_operand (op_false, mode))
20487 op_false = force_reg (mode, op_false);
20489 emit_insn (gen_rtx_SET (mode, dest,
20490 gen_rtx_IF_THEN_ELSE (mode, cmp,
20491 op_true,
20492 op_false)));
20494 else
20496 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
20497 rtx d = dest;
20499 if (!nonimmediate_operand (op_true, mode))
20500 op_true = force_reg (mode, op_true);
20502 op_false = force_reg (mode, op_false);
20504 switch (mode)
20506 case V4SFmode:
20507 if (TARGET_SSE4_1)
20508 gen = gen_sse4_1_blendvps;
20509 break;
20510 case V2DFmode:
20511 if (TARGET_SSE4_1)
20512 gen = gen_sse4_1_blendvpd;
20513 break;
20514 case V16QImode:
20515 case V8HImode:
20516 case V4SImode:
20517 case V2DImode:
20518 if (TARGET_SSE4_1)
20520 gen = gen_sse4_1_pblendvb;
20521 if (mode != V16QImode)
20522 d = gen_reg_rtx (V16QImode);
20523 op_false = gen_lowpart (V16QImode, op_false);
20524 op_true = gen_lowpart (V16QImode, op_true);
20525 cmp = gen_lowpart (V16QImode, cmp);
20527 break;
20528 case V8SFmode:
20529 if (TARGET_AVX)
20530 gen = gen_avx_blendvps256;
20531 break;
20532 case V4DFmode:
20533 if (TARGET_AVX)
20534 gen = gen_avx_blendvpd256;
20535 break;
20536 case V32QImode:
20537 case V16HImode:
20538 case V8SImode:
20539 case V4DImode:
20540 if (TARGET_AVX2)
20542 gen = gen_avx2_pblendvb;
20543 if (mode != V32QImode)
20544 d = gen_reg_rtx (V32QImode);
20545 op_false = gen_lowpart (V32QImode, op_false);
20546 op_true = gen_lowpart (V32QImode, op_true);
20547 cmp = gen_lowpart (V32QImode, cmp);
20549 break;
20550 default:
20551 break;
20554 if (gen != NULL)
20556 emit_insn (gen (d, op_false, op_true, cmp));
20557 if (d != dest)
20558 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
20560 else
20562 op_true = force_reg (mode, op_true);
20564 t2 = gen_reg_rtx (mode);
20565 if (optimize)
20566 t3 = gen_reg_rtx (mode);
20567 else
20568 t3 = dest;
20570 x = gen_rtx_AND (mode, op_true, cmp);
20571 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
20573 x = gen_rtx_NOT (mode, cmp);
20574 x = gen_rtx_AND (mode, x, op_false);
20575 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
20577 x = gen_rtx_IOR (mode, t3, t2);
20578 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
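       /* This fallback is the classic mask/merge idiom,
	  t2 = op_true & cmp, t3 = op_false & ~cmp, dest = t2 | t3,
	  used when no blendv-style instruction is available for MODE.  */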
20583 /* Expand a floating-point conditional move. Return true if successful. */
20585 bool
20586 ix86_expand_fp_movcc (rtx operands[])
20588 enum machine_mode mode = GET_MODE (operands[0]);
20589 enum rtx_code code = GET_CODE (operands[1]);
20590 rtx tmp, compare_op;
20591 rtx op0 = XEXP (operands[1], 0);
20592 rtx op1 = XEXP (operands[1], 1);
20594 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
20596 enum machine_mode cmode;
20598 /* Since we've no cmove for sse registers, don't force bad register
20599 allocation just to gain access to it. Deny movcc when the
20600 comparison mode doesn't match the move mode. */
20601 cmode = GET_MODE (op0);
20602 if (cmode == VOIDmode)
20603 cmode = GET_MODE (op1);
20604 if (cmode != mode)
20605 return false;
20607 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
20608 if (code == UNKNOWN)
20609 return false;
20611 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
20612 operands[2], operands[3]))
20613 return true;
20615 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
20616 operands[2], operands[3]);
20617 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
20618 return true;
20621 if (GET_MODE (op0) == TImode
20622 || (GET_MODE (op0) == DImode
20623 && !TARGET_64BIT))
20624 return false;
20626 /* The floating point conditional move instructions don't directly
20627 support conditions resulting from a signed integer comparison. */
20629 compare_op = ix86_expand_compare (code, op0, op1);
20630 if (!fcmov_comparison_operator (compare_op, VOIDmode))
20632 tmp = gen_reg_rtx (QImode);
20633 ix86_expand_setcc (tmp, code, op0, op1);
20635 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
20638 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20639 gen_rtx_IF_THEN_ELSE (mode, compare_op,
20640 operands[2], operands[3])));
20642 return true;
20645 /* Expand a floating-point vector conditional move; a vcond operation
20646 rather than a movcc operation. */
20648 bool
20649 ix86_expand_fp_vcond (rtx operands[])
20651 enum rtx_code code = GET_CODE (operands[3]);
20652 rtx cmp;
20654 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
20655 &operands[4], &operands[5]);
20656 if (code == UNKNOWN)
20658 rtx temp;
20659 switch (GET_CODE (operands[3]))
20661 case LTGT:
20662 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
20663 operands[5], operands[0], operands[0]);
20664 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
20665 operands[5], operands[1], operands[2]);
20666 code = AND;
20667 break;
20668 case UNEQ:
20669 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
20670 operands[5], operands[0], operands[0]);
20671 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
20672 operands[5], operands[1], operands[2]);
20673 code = IOR;
20674 break;
20675 default:
20676 gcc_unreachable ();
20678 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
20679 OPTAB_DIRECT);
20680 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20681 return true;
20684 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
20685 operands[5], operands[1], operands[2]))
20686 return true;
20688 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
20689 operands[1], operands[2]);
20690 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20691 return true;
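 /* The UNKNOWN cases above emulate the missing comparisons: LTGT is computed
    as (ORDERED mask) & (NE mask) and UNEQ as (UNORDERED mask) | (EQ mask),
    with the two masks combined by the expand_simple_binop call before the
    final mask-based move.  */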
20694 /* Expand a signed/unsigned integral vector conditional move. */
20696 bool
20697 ix86_expand_int_vcond (rtx operands[])
20699 enum machine_mode data_mode = GET_MODE (operands[0]);
20700 enum machine_mode mode = GET_MODE (operands[4]);
20701 enum rtx_code code = GET_CODE (operands[3]);
20702 bool negate = false;
20703 rtx x, cop0, cop1;
20705 cop0 = operands[4];
20706 cop1 = operands[5];
20708 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
20709 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
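   /* For V4SImode, for instance, the former becomes a single psrad $31 and
      the latter a single psrld $31 (arithmetic vs. logical right shift).  */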
20710 if ((code == LT || code == GE)
20711 && data_mode == mode
20712 && cop1 == CONST0_RTX (mode)
20713 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
20714 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
20715 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
20716 && (GET_MODE_SIZE (data_mode) == 16
20717 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
20719 rtx negop = operands[2 - (code == LT)];
20720 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
20721 if (negop == CONST1_RTX (data_mode))
20723 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
20724 operands[0], 1, OPTAB_DIRECT);
20725 if (res != operands[0])
20726 emit_move_insn (operands[0], res);
20727 return true;
20729 else if (GET_MODE_INNER (data_mode) != DImode
20730 && vector_all_ones_operand (negop, data_mode))
20732 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
20733 operands[0], 0, OPTAB_DIRECT);
20734 if (res != operands[0])
20735 emit_move_insn (operands[0], res);
20736 return true;
20740 if (!nonimmediate_operand (cop1, mode))
20741 cop1 = force_reg (mode, cop1);
20742 if (!general_operand (operands[1], data_mode))
20743 operands[1] = force_reg (data_mode, operands[1]);
20744 if (!general_operand (operands[2], data_mode))
20745 operands[2] = force_reg (data_mode, operands[2]);
20747 /* XOP supports all of the comparisons on all 128-bit vector int types. */
20748 if (TARGET_XOP
20749 && (mode == V16QImode || mode == V8HImode
20750 || mode == V4SImode || mode == V2DImode))
20752 else
20754 /* Canonicalize the comparison to EQ, GT, GTU. */
20755 switch (code)
20757 case EQ:
20758 case GT:
20759 case GTU:
20760 break;
20762 case NE:
20763 case LE:
20764 case LEU:
20765 code = reverse_condition (code);
20766 negate = true;
20767 break;
20769 case GE:
20770 case GEU:
20771 code = reverse_condition (code);
20772 negate = true;
20773 /* FALLTHRU */
20775 case LT:
20776 case LTU:
20777 code = swap_condition (code);
20778 x = cop0, cop0 = cop1, cop1 = x;
20779 break;
20781 default:
20782 gcc_unreachable ();
20785 /* Only SSE4.1/SSE4.2 supports V2DImode. */
20786 if (mode == V2DImode)
20788 switch (code)
20790 case EQ:
20791 /* SSE4.1 supports EQ. */
20792 if (!TARGET_SSE4_1)
20793 return false;
20794 break;
20796 case GT:
20797 case GTU:
20798 /* SSE4.2 supports GT/GTU. */
20799 if (!TARGET_SSE4_2)
20800 return false;
20801 break;
20803 default:
20804 gcc_unreachable ();
20808 /* Unsigned parallel compare is not supported by the hardware.
20809 Play some tricks to turn this into a signed comparison
20810 against 0. */
20811 if (code == GTU)
20813 cop0 = force_reg (mode, cop0);
20815 switch (mode)
20817 case V8SImode:
20818 case V4DImode:
20819 case V4SImode:
20820 case V2DImode:
20822 rtx t1, t2, mask;
20823 rtx (*gen_sub3) (rtx, rtx, rtx);
20825 switch (mode)
20827 case V8SImode: gen_sub3 = gen_subv8si3; break;
20828 case V4DImode: gen_sub3 = gen_subv4di3; break;
20829 case V4SImode: gen_sub3 = gen_subv4si3; break;
20830 case V2DImode: gen_sub3 = gen_subv2di3; break;
20831 default:
20832 gcc_unreachable ();
20834 /* Subtract (-(INT MAX) - 1) from both operands to make
20835 them signed. */
20836 mask = ix86_build_signbit_mask (mode, true, false);
20837 t1 = gen_reg_rtx (mode);
20838 emit_insn (gen_sub3 (t1, cop0, mask));
20840 t2 = gen_reg_rtx (mode);
20841 emit_insn (gen_sub3 (t2, cop1, mask));
20843 cop0 = t1;
20844 cop1 = t2;
20845 code = GT;
20847 break;
20849 case V32QImode:
20850 case V16HImode:
20851 case V16QImode:
20852 case V8HImode:
20853 /* Perform a parallel unsigned saturating subtraction. */
20854 x = gen_reg_rtx (mode);
20855 emit_insn (gen_rtx_SET (VOIDmode, x,
20856 gen_rtx_US_MINUS (mode, cop0, cop1)));
20858 cop0 = x;
20859 cop1 = CONST0_RTX (mode);
20860 code = EQ;
20861 negate = !negate;
20862 break;
20864 default:
20865 gcc_unreachable ();
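       /* Summary of the GTU tricks above: for the QI/HI element modes,
	  a >u b is computed as (a -us b) != 0 via unsigned saturating
	  subtraction, with NEGATE flipped so the EQ-against-zero test gives
	  the right sense; for the SI/DI element modes, the sign-bit constant
	  is subtracted from both operands, turning the unsigned compare into
	  a signed GT.  */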
20870 /* Allow the comparison to be done in one mode, but the movcc to
20871 happen in another mode. */
20872 if (data_mode == mode)
20874 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
20875 operands[1+negate], operands[2-negate]);
20877 else
20879 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
20880 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
20881 operands[1+negate], operands[2-negate]);
20882 x = gen_lowpart (data_mode, x);
20885 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
20886 operands[2-negate]);
20887 return true;
20890 /* Expand a variable vector permutation. */
20892 void
20893 ix86_expand_vec_perm (rtx operands[])
20895 rtx target = operands[0];
20896 rtx op0 = operands[1];
20897 rtx op1 = operands[2];
20898 rtx mask = operands[3];
20899 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
20900 enum machine_mode mode = GET_MODE (op0);
20901 enum machine_mode maskmode = GET_MODE (mask);
20902 int w, e, i;
20903 bool one_operand_shuffle = rtx_equal_p (op0, op1);
20905 /* Number of elements in the vector. */
20906 w = GET_MODE_NUNITS (mode);
20907 e = GET_MODE_UNIT_SIZE (mode);
20908 gcc_assert (w <= 32);
20910 if (TARGET_AVX2)
20912 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
20914 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
 20915 	     a constant shuffle operand.  With a tiny bit of effort we can
20916 use VPERMD instead. A re-interpretation stall for V4DFmode is
20917 unfortunate but there's no avoiding it.
20918 Similarly for V16HImode we don't have instructions for variable
 20919 	     shuffling, while for V32QImode, after preparing suitable masks,
 20920 	     we can use vpshufb; vpshufb; vpermq; vpor.  */
20922 if (mode == V16HImode)
20924 maskmode = mode = V32QImode;
20925 w = 32;
20926 e = 1;
20928 else
20930 maskmode = mode = V8SImode;
20931 w = 8;
20932 e = 4;
20934 t1 = gen_reg_rtx (maskmode);
20936 /* Replicate the low bits of the V4DImode mask into V8SImode:
20937 mask = { A B C D }
20938 t1 = { A A B B C C D D }. */
20939 for (i = 0; i < w / 2; ++i)
20940 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
20941 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20942 vt = force_reg (maskmode, vt);
20943 mask = gen_lowpart (maskmode, mask);
20944 if (maskmode == V8SImode)
20945 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
20946 else
20947 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
 20949 	  /* Multiply the shuffle indices by two.  */
20950 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
20951 OPTAB_DIRECT);
 20953 	  /* Add one to the odd shuffle indices:
20954 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
20955 for (i = 0; i < w / 2; ++i)
20957 vec[i * 2] = const0_rtx;
20958 vec[i * 2 + 1] = const1_rtx;
20960 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20961 vt = validize_mem (force_const_mem (maskmode, vt));
20962 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
20963 OPTAB_DIRECT);
20965 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
20966 operands[3] = mask = t1;
20967 target = gen_reg_rtx (mode);
20968 op0 = gen_lowpart (mode, op0);
20969 op1 = gen_lowpart (mode, op1);
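 	  /* At this point each original wide-element shuffle index k has been
	     expanded into the index pair { 2*k, 2*k+1 } in the narrower mask
	     mode, so the narrower shuffles below move both halves of the wide
	     element together.  */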
20972 switch (mode)
20974 case V8SImode:
20975 /* The VPERMD and VPERMPS instructions already properly ignore
20976 the high bits of the shuffle elements. No need for us to
20977 perform an AND ourselves. */
20978 if (one_operand_shuffle)
20980 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
20981 if (target != operands[0])
20982 emit_move_insn (operands[0],
20983 gen_lowpart (GET_MODE (operands[0]), target));
20985 else
20987 t1 = gen_reg_rtx (V8SImode);
20988 t2 = gen_reg_rtx (V8SImode);
20989 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
20990 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
20991 goto merge_two;
20993 return;
20995 case V8SFmode:
20996 mask = gen_lowpart (V8SFmode, mask);
20997 if (one_operand_shuffle)
20998 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
20999 else
21001 t1 = gen_reg_rtx (V8SFmode);
21002 t2 = gen_reg_rtx (V8SFmode);
21003 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
21004 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
21005 goto merge_two;
21007 return;
21009 case V4SImode:
21010 /* By combining the two 128-bit input vectors into one 256-bit
21011 input vector, we can use VPERMD and VPERMPS for the full
21012 two-operand shuffle. */
21013 t1 = gen_reg_rtx (V8SImode);
21014 t2 = gen_reg_rtx (V8SImode);
21015 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
21016 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
21017 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
21018 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
21019 return;
21021 case V4SFmode:
21022 t1 = gen_reg_rtx (V8SFmode);
21023 t2 = gen_reg_rtx (V8SImode);
21024 mask = gen_lowpart (V4SImode, mask);
21025 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
21026 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
21027 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
21028 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
21029 return;
21031 case V32QImode:
21032 t1 = gen_reg_rtx (V32QImode);
21033 t2 = gen_reg_rtx (V32QImode);
21034 t3 = gen_reg_rtx (V32QImode);
21035 vt2 = GEN_INT (128);
21036 for (i = 0; i < 32; i++)
21037 vec[i] = vt2;
21038 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
21039 vt = force_reg (V32QImode, vt);
21040 for (i = 0; i < 32; i++)
21041 vec[i] = i < 16 ? vt2 : const0_rtx;
21042 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
21043 vt2 = force_reg (V32QImode, vt2);
21044 /* From mask create two adjusted masks, which contain the same
21045 bits as mask in the low 7 bits of each vector element.
21046 The first mask will have the most significant bit clear
21047 if it requests element from the same 128-bit lane
21048 and MSB set if it requests element from the other 128-bit lane.
21049 The second mask will have the opposite values of the MSB,
21050 and additionally will have its 128-bit lanes swapped.
21051 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
21052 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
21053 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
21054 stands for the other 12 bytes. */
21055 /* The bit that tells whether an element is from the same lane or the other
21056 lane is bit 4, so shift it up by 3 to the MSB position. */
21057 t5 = gen_reg_rtx (V4DImode);
21058 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
21059 GEN_INT (3)));
21060 /* Clear MSB bits from the mask just in case it had them set. */
21061 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
21062 /* After this t1 will have MSB set for elements from other lane. */
21063 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
21064 /* Clear bits other than MSB. */
21065 emit_insn (gen_andv32qi3 (t1, t1, vt));
21066 /* Or in the lower bits from mask into t3. */
21067 emit_insn (gen_iorv32qi3 (t3, t1, t2));
21068 /* And invert MSB bits in t1, so MSB is set for elements from the same
21069 lane. */
21070 emit_insn (gen_xorv32qi3 (t1, t1, vt));
21071 /* Swap 128-bit lanes in t3. */
21072 t6 = gen_reg_rtx (V4DImode);
21073 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
21074 const2_rtx, GEN_INT (3),
21075 const0_rtx, const1_rtx));
21076 /* And or in the lower bits from mask into t1. */
21077 emit_insn (gen_iorv32qi3 (t1, t1, t2));
21078 if (one_operand_shuffle)
21080 /* Each of these shuffles will put 0s in places where an
21081 element from the other 128-bit lane is needed; otherwise
21082 they will shuffle in the requested value. */
21083 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
21084 gen_lowpart (V32QImode, t6)));
21085 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
21086 /* For t3 the 128-bit lanes are swapped again. */
21087 t7 = gen_reg_rtx (V4DImode);
21088 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
21089 const2_rtx, GEN_INT (3),
21090 const0_rtx, const1_rtx));
21091 /* And oring both together leads to the result. */
21092 emit_insn (gen_iorv32qi3 (target, t1,
21093 gen_lowpart (V32QImode, t7)));
21094 if (target != operands[0])
21095 emit_move_insn (operands[0],
21096 gen_lowpart (GET_MODE (operands[0]), target));
21097 return;
21100 t4 = gen_reg_rtx (V32QImode);
21101 /* Similar to the one_operand_shuffle code above,
21102 just repeated twice, once for each operand. The merge_two:
21103 code will merge the two results together. */
21104 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
21105 gen_lowpart (V32QImode, t6)));
21106 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
21107 gen_lowpart (V32QImode, t6)));
21108 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
21109 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
21110 t7 = gen_reg_rtx (V4DImode);
21111 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
21112 const2_rtx, GEN_INT (3),
21113 const0_rtx, const1_rtx));
21114 t8 = gen_reg_rtx (V4DImode);
21115 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
21116 const2_rtx, GEN_INT (3),
21117 const0_rtx, const1_rtx));
21118 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
21119 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
21120 t1 = t4;
21121 t2 = t3;
21122 goto merge_two;
21124 default:
21125 gcc_assert (GET_MODE_SIZE (mode) <= 16);
21126 break;
21130 if (TARGET_XOP)
21132 /* The XOP VPPERM insn supports three inputs. By ignoring the
21133 one_operand_shuffle special case, we avoid creating another
21134 set of constant vectors in memory. */
21135 one_operand_shuffle = false;
21137 /* mask = mask & {2*w-1, ...} */
21138 vt = GEN_INT (2*w - 1);
21140 else
21142 /* mask = mask & {w-1, ...} */
21143 vt = GEN_INT (w - 1);
21146 for (i = 0; i < w; i++)
21147 vec[i] = vt;
21148 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
21149 mask = expand_simple_binop (maskmode, AND, mask, vt,
21150 NULL_RTX, 0, OPTAB_DIRECT);
21152 /* For non-QImode operations, convert the word permutation control
21153 into a byte permutation control. */
21154 if (mode != V16QImode)
21156 mask = expand_simple_binop (maskmode, ASHIFT, mask,
21157 GEN_INT (exact_log2 (e)),
21158 NULL_RTX, 0, OPTAB_DIRECT);
21160 /* Convert mask to vector of chars. */
21161 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
21163 /* Replicate each of the input bytes into byte positions:
21164 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
21165 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
21166 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
21167 for (i = 0; i < 16; ++i)
21168 vec[i] = GEN_INT (i/e * e);
21169 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
21170 vt = validize_mem (force_const_mem (V16QImode, vt));
21171 if (TARGET_XOP)
21172 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
21173 else
21174 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
21176 /* Convert it into the byte positions by doing
21177 mask = mask + {0,1,..,16/w-1, 0,1,..,16/w-1, ...} */
21178 for (i = 0; i < 16; ++i)
21179 vec[i] = GEN_INT (i % e);
21180 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
21181 vt = validize_mem (force_const_mem (V16QImode, vt));
21182 emit_insn (gen_addv16qi3 (mask, mask, vt));
21185 /* The actual shuffle operations all operate on V16QImode. */
21186 op0 = gen_lowpart (V16QImode, op0);
21187 op1 = gen_lowpart (V16QImode, op1);
21189 if (TARGET_XOP)
21191 if (GET_MODE (target) != V16QImode)
21192 target = gen_reg_rtx (V16QImode);
21193 emit_insn (gen_xop_pperm (target, op0, op1, mask));
21194 if (target != operands[0])
21195 emit_move_insn (operands[0],
21196 gen_lowpart (GET_MODE (operands[0]), target));
21198 else if (one_operand_shuffle)
21200 if (GET_MODE (target) != V16QImode)
21201 target = gen_reg_rtx (V16QImode);
21202 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
21203 if (target != operands[0])
21204 emit_move_insn (operands[0],
21205 gen_lowpart (GET_MODE (operands[0]), target));
21207 else
21209 rtx xops[6];
21210 bool ok;
21212 /* Shuffle the two input vectors independently. */
21213 t1 = gen_reg_rtx (V16QImode);
21214 t2 = gen_reg_rtx (V16QImode);
21215 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
21216 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
21218 merge_two:
21219 /* Then merge them together. The key is whether any given control
21220 element contained a bit set that indicates the second word. */
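/* Illustrative summary: element-wise the merge below computes
     dest[i] = (mask[i] & w) ? t2[i] : t1[i],
   t1 and t2 being op0 and op1 already permuted by the low bits of the
   mask.  */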
21221 mask = operands[3];
21222 vt = GEN_INT (w);
21223 if (maskmode == V2DImode && !TARGET_SSE4_1)
21225 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
21226 more shuffle to convert the V2DI input mask into a V4SI
21227 input mask. At which point the masking that expand_int_vcond
21228 does will work as desired. */
21229 rtx t3 = gen_reg_rtx (V4SImode);
21230 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
21231 const0_rtx, const0_rtx,
21232 const2_rtx, const2_rtx));
21233 mask = t3;
21234 maskmode = V4SImode;
21235 e = w = 4;
21238 for (i = 0; i < w; i++)
21239 vec[i] = vt;
21240 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
21241 vt = force_reg (maskmode, vt);
21242 mask = expand_simple_binop (maskmode, AND, mask, vt,
21243 NULL_RTX, 0, OPTAB_DIRECT);
21245 if (GET_MODE (target) != mode)
21246 target = gen_reg_rtx (mode);
21247 xops[0] = target;
21248 xops[1] = gen_lowpart (mode, t2);
21249 xops[2] = gen_lowpart (mode, t1);
21250 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
21251 xops[4] = mask;
21252 xops[5] = vt;
21253 ok = ix86_expand_int_vcond (xops);
21254 gcc_assert (ok);
21255 if (target != operands[0])
21256 emit_move_insn (operands[0],
21257 gen_lowpart (GET_MODE (operands[0]), target));
21261 /* Unpack SRC into DEST, the next wider integer vector type. UNSIGNED_P is
21262 true if we should do zero extension, else sign extension. HIGH_P is
21263 true if we want the N/2 high elements, else the low elements. */
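/* Purely illustrative example: with SRC a V8HImode vector { a0 ... a7 },
   UNSIGNED_P set and HIGH_P set, DEST (a V4SImode vector) receives
   { (uint32) a4, (uint32) a5, (uint32) a6, (uint32) a7 }.  */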
21265 void
21266 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
21268 enum machine_mode imode = GET_MODE (src);
21269 rtx tmp;
21271 if (TARGET_SSE4_1)
21273 rtx (*unpack)(rtx, rtx);
21274 rtx (*extract)(rtx, rtx) = NULL;
21275 enum machine_mode halfmode = BLKmode;
21277 switch (imode)
21279 case V32QImode:
21280 if (unsigned_p)
21281 unpack = gen_avx2_zero_extendv16qiv16hi2;
21282 else
21283 unpack = gen_avx2_sign_extendv16qiv16hi2;
21284 halfmode = V16QImode;
21285 extract
21286 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
21287 break;
21288 case V16HImode:
21289 if (unsigned_p)
21290 unpack = gen_avx2_zero_extendv8hiv8si2;
21291 else
21292 unpack = gen_avx2_sign_extendv8hiv8si2;
21293 halfmode = V8HImode;
21294 extract
21295 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
21296 break;
21297 case V8SImode:
21298 if (unsigned_p)
21299 unpack = gen_avx2_zero_extendv4siv4di2;
21300 else
21301 unpack = gen_avx2_sign_extendv4siv4di2;
21302 halfmode = V4SImode;
21303 extract
21304 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
21305 break;
21306 case V16QImode:
21307 if (unsigned_p)
21308 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
21309 else
21310 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
21311 break;
21312 case V8HImode:
21313 if (unsigned_p)
21314 unpack = gen_sse4_1_zero_extendv4hiv4si2;
21315 else
21316 unpack = gen_sse4_1_sign_extendv4hiv4si2;
21317 break;
21318 case V4SImode:
21319 if (unsigned_p)
21320 unpack = gen_sse4_1_zero_extendv2siv2di2;
21321 else
21322 unpack = gen_sse4_1_sign_extendv2siv2di2;
21323 break;
21324 default:
21325 gcc_unreachable ();
21328 if (GET_MODE_SIZE (imode) == 32)
21330 tmp = gen_reg_rtx (halfmode);
21331 emit_insn (extract (tmp, src));
21333 else if (high_p)
21335 /* Shift higher 8 bytes to lower 8 bytes. */
21336 tmp = gen_reg_rtx (V1TImode);
21337 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
21338 GEN_INT (64)));
21339 tmp = gen_lowpart (imode, tmp);
21341 else
21342 tmp = src;
21344 emit_insn (unpack (dest, tmp));
21346 else
21348 rtx (*unpack)(rtx, rtx, rtx);
21350 switch (imode)
21352 case V16QImode:
21353 if (high_p)
21354 unpack = gen_vec_interleave_highv16qi;
21355 else
21356 unpack = gen_vec_interleave_lowv16qi;
21357 break;
21358 case V8HImode:
21359 if (high_p)
21360 unpack = gen_vec_interleave_highv8hi;
21361 else
21362 unpack = gen_vec_interleave_lowv8hi;
21363 break;
21364 case V4SImode:
21365 if (high_p)
21366 unpack = gen_vec_interleave_highv4si;
21367 else
21368 unpack = gen_vec_interleave_lowv4si;
21369 break;
21370 default:
21371 gcc_unreachable ();
21374 if (unsigned_p)
21375 tmp = force_reg (imode, CONST0_RTX (imode));
21376 else
21377 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
21378 src, pc_rtx, pc_rtx);
21380 rtx tmp2 = gen_reg_rtx (imode);
21381 emit_insn (unpack (tmp2, src, tmp));
21382 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
21386 /* Expand conditional increment or decrement using adc/sbb instructions.
21387 The default case using setcc followed by the conditional move can be
21388 done by generic code. */
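/* Illustrative sketch, not the literal expansion: with unsigned operands,
   a statement such as

     x += (a < b);

   can be emitted as a compare followed by an add-with-carry, roughly
   "cmp a, b ; adc x, 0", because the compare leaves the carry flag equal
   to the truth value of the comparison.  */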
21389 bool
21390 ix86_expand_int_addcc (rtx operands[])
21392 enum rtx_code code = GET_CODE (operands[1]);
21393 rtx flags;
21394 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
21395 rtx compare_op;
21396 rtx val = const0_rtx;
21397 bool fpcmp = false;
21398 enum machine_mode mode;
21399 rtx op0 = XEXP (operands[1], 0);
21400 rtx op1 = XEXP (operands[1], 1);
21402 if (operands[3] != const1_rtx
21403 && operands[3] != constm1_rtx)
21404 return false;
21405 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
21406 return false;
21407 code = GET_CODE (compare_op);
21409 flags = XEXP (compare_op, 0);
21411 if (GET_MODE (flags) == CCFPmode
21412 || GET_MODE (flags) == CCFPUmode)
21414 fpcmp = true;
21415 code = ix86_fp_compare_code_to_integer (code);
21418 if (code != LTU)
21420 val = constm1_rtx;
21421 if (fpcmp)
21422 PUT_CODE (compare_op,
21423 reverse_condition_maybe_unordered
21424 (GET_CODE (compare_op)));
21425 else
21426 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
21429 mode = GET_MODE (operands[0]);
21431 /* Construct either adc or sbb insn. */
21432 if ((code == LTU) == (operands[3] == constm1_rtx))
21434 switch (mode)
21436 case QImode:
21437 insn = gen_subqi3_carry;
21438 break;
21439 case HImode:
21440 insn = gen_subhi3_carry;
21441 break;
21442 case SImode:
21443 insn = gen_subsi3_carry;
21444 break;
21445 case DImode:
21446 insn = gen_subdi3_carry;
21447 break;
21448 default:
21449 gcc_unreachable ();
21452 else
21454 switch (mode)
21456 case QImode:
21457 insn = gen_addqi3_carry;
21458 break;
21459 case HImode:
21460 insn = gen_addhi3_carry;
21461 break;
21462 case SImode:
21463 insn = gen_addsi3_carry;
21464 break;
21465 case DImode:
21466 insn = gen_adddi3_carry;
21467 break;
21468 default:
21469 gcc_unreachable ();
21472 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
21474 return true;
21478 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
21479 but works for floating point parameters and non-offsettable memories.
21480 For pushes, it returns just stack offsets; the values will be saved
21481 in the right order. At most four parts are generated. */
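/* For instance, on a 32-bit target a DFmode operand is split into two
   SImode parts, an XFmode operand into three and a TFmode operand into
   four.  */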
21483 static int
21484 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
21486 int size;
21488 if (!TARGET_64BIT)
21489 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
21490 else
21491 size = (GET_MODE_SIZE (mode) + 4) / 8;
21493 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
21494 gcc_assert (size >= 2 && size <= 4);
21496 /* Optimize constant pool reference to immediates. This is used by fp
21497 moves, that force all constants to memory to allow combining. */
21498 if (MEM_P (operand) && MEM_READONLY_P (operand))
21500 rtx tmp = maybe_get_pool_constant (operand);
21501 if (tmp)
21502 operand = tmp;
21505 if (MEM_P (operand) && !offsettable_memref_p (operand))
21507 /* The only non-offsettable memories we handle are pushes. */
21508 int ok = push_operand (operand, VOIDmode);
21510 gcc_assert (ok);
21512 operand = copy_rtx (operand);
21513 PUT_MODE (operand, word_mode);
21514 parts[0] = parts[1] = parts[2] = parts[3] = operand;
21515 return size;
21518 if (GET_CODE (operand) == CONST_VECTOR)
21520 enum machine_mode imode = int_mode_for_mode (mode);
21521 /* Caution: if we looked through a constant pool memory above,
21522 the operand may actually have a different mode now. That's
21523 ok, since we want to pun this all the way back to an integer. */
21524 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
21525 gcc_assert (operand != NULL);
21526 mode = imode;
21529 if (!TARGET_64BIT)
21531 if (mode == DImode)
21532 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21533 else
21535 int i;
21537 if (REG_P (operand))
21539 gcc_assert (reload_completed);
21540 for (i = 0; i < size; i++)
21541 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
21543 else if (offsettable_memref_p (operand))
21545 operand = adjust_address (operand, SImode, 0);
21546 parts[0] = operand;
21547 for (i = 1; i < size; i++)
21548 parts[i] = adjust_address (operand, SImode, 4 * i);
21550 else if (GET_CODE (operand) == CONST_DOUBLE)
21552 REAL_VALUE_TYPE r;
21553 long l[4];
21555 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21556 switch (mode)
21558 case TFmode:
21559 real_to_target (l, &r, mode);
21560 parts[3] = gen_int_mode (l[3], SImode);
21561 parts[2] = gen_int_mode (l[2], SImode);
21562 break;
21563 case XFmode:
21564 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
21565 long double may not be 80-bit. */
21566 real_to_target (l, &r, mode);
21567 parts[2] = gen_int_mode (l[2], SImode);
21568 break;
21569 case DFmode:
21570 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
21571 break;
21572 default:
21573 gcc_unreachable ();
21575 parts[1] = gen_int_mode (l[1], SImode);
21576 parts[0] = gen_int_mode (l[0], SImode);
21578 else
21579 gcc_unreachable ();
21582 else
21584 if (mode == TImode)
21585 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21586 if (mode == XFmode || mode == TFmode)
21588 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
21589 if (REG_P (operand))
21591 gcc_assert (reload_completed);
21592 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
21593 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
21595 else if (offsettable_memref_p (operand))
21597 operand = adjust_address (operand, DImode, 0);
21598 parts[0] = operand;
21599 parts[1] = adjust_address (operand, upper_mode, 8);
21601 else if (GET_CODE (operand) == CONST_DOUBLE)
21603 REAL_VALUE_TYPE r;
21604 long l[4];
21606 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21607 real_to_target (l, &r, mode);
21609 /* Do not use shift by 32 to avoid warning on 32bit systems. */
21610 if (HOST_BITS_PER_WIDE_INT >= 64)
21611 parts[0]
21612 = gen_int_mode
21613 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
21614 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
21615 DImode);
21616 else
21617 parts[0] = immed_double_const (l[0], l[1], DImode);
21619 if (upper_mode == SImode)
21620 parts[1] = gen_int_mode (l[2], SImode);
21621 else if (HOST_BITS_PER_WIDE_INT >= 64)
21622 parts[1]
21623 = gen_int_mode
21624 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
21625 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
21626 DImode);
21627 else
21628 parts[1] = immed_double_const (l[2], l[3], DImode);
21630 else
21631 gcc_unreachable ();
21635 return size;
21638 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
21639 The value is split into word-sized parts; operands 2-5 receive the
21640 destination parts and operands 6-9 the source parts, in an order that
21641 avoids overwriting a source part before it has been copied. */
21643 void
21644 ix86_split_long_move (rtx operands[])
21646 rtx part[2][4];
21647 int nparts, i, j;
21648 int push = 0;
21649 int collisions = 0;
21650 enum machine_mode mode = GET_MODE (operands[0]);
21651 bool collisionparts[4];
21653 /* The DFmode expanders may ask us to move a double.
21654 For a 64-bit target this is a single move. By hiding the fact
21655 here we simplify i386.md splitters. */
21656 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
21658 /* Optimize constant pool reference to immediates. This is used by
21659 fp moves, that force all constants to memory to allow combining. */
21661 if (MEM_P (operands[1])
21662 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
21663 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
21664 operands[1] = get_pool_constant (XEXP (operands[1], 0));
21665 if (push_operand (operands[0], VOIDmode))
21667 operands[0] = copy_rtx (operands[0]);
21668 PUT_MODE (operands[0], word_mode);
21670 else
21671 operands[0] = gen_lowpart (DImode, operands[0]);
21672 operands[1] = gen_lowpart (DImode, operands[1]);
21673 emit_move_insn (operands[0], operands[1]);
21674 return;
21677 /* The only non-offsettable memory we handle is push. */
21678 if (push_operand (operands[0], VOIDmode))
21679 push = 1;
21680 else
21681 gcc_assert (!MEM_P (operands[0])
21682 || offsettable_memref_p (operands[0]));
21684 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
21685 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
21687 /* When emitting push, take care for source operands on the stack. */
21688 if (push && MEM_P (operands[1])
21689 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
21691 rtx src_base = XEXP (part[1][nparts - 1], 0);
21693 /* Compensate for the stack decrement by 4. */
21694 if (!TARGET_64BIT && nparts == 3
21695 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
21696 src_base = plus_constant (Pmode, src_base, 4);
21698 /* src_base refers to the stack pointer and is
21699 automatically decreased by emitted push. */
21700 for (i = 0; i < nparts; i++)
21701 part[1][i] = change_address (part[1][i],
21702 GET_MODE (part[1][i]), src_base);
21705 /* We need to do the copy in the right order in case an address register
21706 of the source overlaps the destination. */
21707 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
21709 rtx tmp;
21711 for (i = 0; i < nparts; i++)
21713 collisionparts[i]
21714 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
21715 if (collisionparts[i])
21716 collisions++;
21719 /* Collision in the middle part can be handled by reordering. */
21720 if (collisions == 1 && nparts == 3 && collisionparts [1])
21722 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21723 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21725 else if (collisions == 1
21726 && nparts == 4
21727 && (collisionparts [1] || collisionparts [2]))
21729 if (collisionparts [1])
21731 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21732 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21734 else
21736 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
21737 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
21741 /* If there are more collisions, we can't handle it by reordering.
21742 Do an lea to the last part and use only one colliding move. */
21743 else if (collisions > 1)
21745 rtx base;
21747 collisions = 1;
21749 base = part[0][nparts - 1];
21751 /* Handle the case when the last part isn't valid for lea.
21752 Happens in 64-bit mode storing the 12-byte XFmode. */
21753 if (GET_MODE (base) != Pmode)
21754 base = gen_rtx_REG (Pmode, REGNO (base));
21756 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
21757 part[1][0] = replace_equiv_address (part[1][0], base);
21758 for (i = 1; i < nparts; i++)
21760 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
21761 part[1][i] = replace_equiv_address (part[1][i], tmp);
21766 if (push)
21768 if (!TARGET_64BIT)
21770 if (nparts == 3)
21772 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
21773 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
21774 stack_pointer_rtx, GEN_INT (-4)));
21775 emit_move_insn (part[0][2], part[1][2]);
21777 else if (nparts == 4)
21779 emit_move_insn (part[0][3], part[1][3]);
21780 emit_move_insn (part[0][2], part[1][2]);
21783 else
21785 /* In 64-bit mode we don't have a 32-bit push available. In case this is
21786 a register, that is OK - we will just use the larger counterpart. We also
21787 retype the memory - this comes from an attempt to avoid a REX prefix on
21788 moving the second half of a TFmode value. */
21789 if (GET_MODE (part[1][1]) == SImode)
21791 switch (GET_CODE (part[1][1]))
21793 case MEM:
21794 part[1][1] = adjust_address (part[1][1], DImode, 0);
21795 break;
21797 case REG:
21798 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
21799 break;
21801 default:
21802 gcc_unreachable ();
21805 if (GET_MODE (part[1][0]) == SImode)
21806 part[1][0] = part[1][1];
21809 emit_move_insn (part[0][1], part[1][1]);
21810 emit_move_insn (part[0][0], part[1][0]);
21811 return;
21814 /* Choose correct order to not overwrite the source before it is copied. */
21815 if ((REG_P (part[0][0])
21816 && REG_P (part[1][1])
21817 && (REGNO (part[0][0]) == REGNO (part[1][1])
21818 || (nparts == 3
21819 && REGNO (part[0][0]) == REGNO (part[1][2]))
21820 || (nparts == 4
21821 && REGNO (part[0][0]) == REGNO (part[1][3]))))
21822 || (collisions > 0
21823 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
21825 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
21827 operands[2 + i] = part[0][j];
21828 operands[6 + i] = part[1][j];
21831 else
21833 for (i = 0; i < nparts; i++)
21835 operands[2 + i] = part[0][i];
21836 operands[6 + i] = part[1][i];
21840 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
21841 if (optimize_insn_for_size_p ())
21843 for (j = 0; j < nparts - 1; j++)
21844 if (CONST_INT_P (operands[6 + j])
21845 && operands[6 + j] != const0_rtx
21846 && REG_P (operands[2 + j]))
21847 for (i = j; i < nparts - 1; i++)
21848 if (CONST_INT_P (operands[7 + i])
21849 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
21850 operands[7 + i] = operands[2 + j];
21853 for (i = 0; i < nparts; i++)
21854 emit_move_insn (operands[2 + i], operands[6 + i]);
21856 return;
21859 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
21860 left shift by a constant, either using a single shift or
21861 a sequence of add instructions. */
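/* For example, a shift left by 1 is always emitted as a single
   "add reg, reg", and a shift by 2 may become two such adds when two
   additions are cheaper than one constant shift and we are not
   optimizing for size.  */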
21863 static void
21864 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
21866 rtx (*insn)(rtx, rtx, rtx);
21868 if (count == 1
21869 || (count * ix86_cost->add <= ix86_cost->shift_const
21870 && !optimize_insn_for_size_p ()))
21872 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
21873 while (count-- > 0)
21874 emit_insn (insn (operand, operand, operand));
21876 else
21878 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21879 emit_insn (insn (operand, operand, GEN_INT (count)));
21883 void
21884 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
21886 rtx (*gen_ashl3)(rtx, rtx, rtx);
21887 rtx (*gen_shld)(rtx, rtx, rtx);
21888 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21890 rtx low[2], high[2];
21891 int count;
21893 if (CONST_INT_P (operands[2]))
21895 split_double_mode (mode, operands, 2, low, high);
21896 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21898 if (count >= half_width)
21900 emit_move_insn (high[0], low[1]);
21901 emit_move_insn (low[0], const0_rtx);
21903 if (count > half_width)
21904 ix86_expand_ashl_const (high[0], count - half_width, mode);
21906 else
21908 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21910 if (!rtx_equal_p (operands[0], operands[1]))
21911 emit_move_insn (operands[0], operands[1]);
21913 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
21914 ix86_expand_ashl_const (low[0], count, mode);
21916 return;
21919 split_double_mode (mode, operands, 1, low, high);
21921 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21923 if (operands[1] == const1_rtx)
21925 /* Assuming we've chosen QImode-capable registers, then 1 << N
21926 can be done with two 32/64-bit shifts, no branches, no cmoves. */
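/* Illustrative outline for the DImode case: a test of bit 5 of the shift
   count decides which half receives the 1 (via setcc into the low byte of
   each half); the 32-bit shifts emitted afterwards use the count masked
   to 0..31 by the hardware, so the 1 ends up in the right position.  */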
21927 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
21929 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
21931 ix86_expand_clear (low[0]);
21932 ix86_expand_clear (high[0]);
21933 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
21935 d = gen_lowpart (QImode, low[0]);
21936 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21937 s = gen_rtx_EQ (QImode, flags, const0_rtx);
21938 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21940 d = gen_lowpart (QImode, high[0]);
21941 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21942 s = gen_rtx_NE (QImode, flags, const0_rtx);
21943 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21946 /* Otherwise, we can get the same results by manually performing
21947 a bit extract operation on bit 5/6, and then performing the two
21948 shifts. The two methods of getting 0/1 into low/high are exactly
21949 the same size. Avoiding the shift in the bit extract case helps
21950 pentium4 a bit; no one else seems to care much either way. */
21951 else
21953 enum machine_mode half_mode;
21954 rtx (*gen_lshr3)(rtx, rtx, rtx);
21955 rtx (*gen_and3)(rtx, rtx, rtx);
21956 rtx (*gen_xor3)(rtx, rtx, rtx);
21957 HOST_WIDE_INT bits;
21958 rtx x;
21960 if (mode == DImode)
21962 half_mode = SImode;
21963 gen_lshr3 = gen_lshrsi3;
21964 gen_and3 = gen_andsi3;
21965 gen_xor3 = gen_xorsi3;
21966 bits = 5;
21968 else
21970 half_mode = DImode;
21971 gen_lshr3 = gen_lshrdi3;
21972 gen_and3 = gen_anddi3;
21973 gen_xor3 = gen_xordi3;
21974 bits = 6;
21977 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
21978 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
21979 else
21980 x = gen_lowpart (half_mode, operands[2]);
21981 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
21983 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
21984 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
21985 emit_move_insn (low[0], high[0]);
21986 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
21989 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21990 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
21991 return;
21994 if (operands[1] == constm1_rtx)
21996 /* For -1 << N, we can avoid the shld instruction, because we
21997 know that we're shifting 0...31/63 ones into a -1. */
21998 emit_move_insn (low[0], constm1_rtx);
21999 if (optimize_insn_for_size_p ())
22000 emit_move_insn (high[0], low[0]);
22001 else
22002 emit_move_insn (high[0], constm1_rtx);
22004 else
22006 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
22008 if (!rtx_equal_p (operands[0], operands[1]))
22009 emit_move_insn (operands[0], operands[1]);
22011 split_double_mode (mode, operands, 1, low, high);
22012 emit_insn (gen_shld (high[0], low[0], operands[2]));
22015 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
22017 if (TARGET_CMOVE && scratch)
22019 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
22020 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
22022 ix86_expand_clear (scratch);
22023 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
22025 else
22027 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
22028 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
22030 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
22034 void
22035 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
22037 rtx (*gen_ashr3)(rtx, rtx, rtx)
22038 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
22039 rtx (*gen_shrd)(rtx, rtx, rtx);
22040 int half_width = GET_MODE_BITSIZE (mode) >> 1;
22042 rtx low[2], high[2];
22043 int count;
22045 if (CONST_INT_P (operands[2]))
22047 split_double_mode (mode, operands, 2, low, high);
22048 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
22050 if (count == GET_MODE_BITSIZE (mode) - 1)
22052 emit_move_insn (high[0], high[1]);
22053 emit_insn (gen_ashr3 (high[0], high[0],
22054 GEN_INT (half_width - 1)));
22055 emit_move_insn (low[0], high[0]);
22058 else if (count >= half_width)
22060 emit_move_insn (low[0], high[1]);
22061 emit_move_insn (high[0], low[0]);
22062 emit_insn (gen_ashr3 (high[0], high[0],
22063 GEN_INT (half_width - 1)));
22065 if (count > half_width)
22066 emit_insn (gen_ashr3 (low[0], low[0],
22067 GEN_INT (count - half_width)));
22069 else
22071 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
22073 if (!rtx_equal_p (operands[0], operands[1]))
22074 emit_move_insn (operands[0], operands[1]);
22076 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
22077 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
22080 else
22082 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
22084 if (!rtx_equal_p (operands[0], operands[1]))
22085 emit_move_insn (operands[0], operands[1]);
22087 split_double_mode (mode, operands, 1, low, high);
22089 emit_insn (gen_shrd (low[0], high[0], operands[2]));
22090 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
22092 if (TARGET_CMOVE && scratch)
22094 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
22095 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
22097 emit_move_insn (scratch, high[0]);
22098 emit_insn (gen_ashr3 (scratch, scratch,
22099 GEN_INT (half_width - 1)));
22100 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
22101 scratch));
22103 else
22105 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
22106 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
22108 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
22113 void
22114 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
22116 rtx (*gen_lshr3)(rtx, rtx, rtx)
22117 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
22118 rtx (*gen_shrd)(rtx, rtx, rtx);
22119 int half_width = GET_MODE_BITSIZE (mode) >> 1;
22121 rtx low[2], high[2];
22122 int count;
22124 if (CONST_INT_P (operands[2]))
22126 split_double_mode (mode, operands, 2, low, high);
22127 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
22129 if (count >= half_width)
22131 emit_move_insn (low[0], high[1]);
22132 ix86_expand_clear (high[0]);
22134 if (count > half_width)
22135 emit_insn (gen_lshr3 (low[0], low[0],
22136 GEN_INT (count - half_width)));
22138 else
22140 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
22142 if (!rtx_equal_p (operands[0], operands[1]))
22143 emit_move_insn (operands[0], operands[1]);
22145 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
22146 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
22149 else
22151 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
22153 if (!rtx_equal_p (operands[0], operands[1]))
22154 emit_move_insn (operands[0], operands[1]);
22156 split_double_mode (mode, operands, 1, low, high);
22158 emit_insn (gen_shrd (low[0], high[0], operands[2]));
22159 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
22161 if (TARGET_CMOVE && scratch)
22163 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
22164 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
22166 ix86_expand_clear (scratch);
22167 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
22168 scratch));
22170 else
22172 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
22173 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
22175 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
22180 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
22181 static void
22182 predict_jump (int prob)
22184 rtx insn = get_last_insn ();
22185 gcc_assert (JUMP_P (insn));
22186 add_int_reg_note (insn, REG_BR_PROB, prob);
22189 /* Helper function for the string operations below. Test whether the VALUE
22190 bit of VARIABLE is clear and, if so, jump to the returned label. */
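/* E.g. ix86_expand_aligntest (destptr, 4, false) emits a test of
   "destptr & 4" and a branch to the returned label that is taken when
   the bit is zero; the caller places the 4-byte copy between this call
   and the label, so the copy is skipped when it is not needed.  */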
22191 static rtx
22192 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
22194 rtx label = gen_label_rtx ();
22195 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
22196 if (GET_MODE (variable) == DImode)
22197 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
22198 else
22199 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
22200 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
22201 1, label);
22202 if (epilogue)
22203 predict_jump (REG_BR_PROB_BASE * 50 / 100);
22204 else
22205 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22206 return label;
22209 /* Decrease COUNTREG by VALUE. */
22210 static void
22211 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
22213 rtx (*gen_add)(rtx, rtx, rtx)
22214 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
22216 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
22219 /* Zero extend a possibly SImode EXP to a Pmode register. */
22221 ix86_zero_extend_to_Pmode (rtx exp)
22223 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
22226 /* Divide COUNTREG by SCALE. */
22227 static rtx
22228 scale_counter (rtx countreg, int scale)
22230 rtx sc;
22232 if (scale == 1)
22233 return countreg;
22234 if (CONST_INT_P (countreg))
22235 return GEN_INT (INTVAL (countreg) / scale);
22236 gcc_assert (REG_P (countreg));
22238 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
22239 GEN_INT (exact_log2 (scale)),
22240 NULL, 1, OPTAB_DIRECT);
22241 return sc;
22244 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
22245 DImode for constant loop counts. */
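/* E.g. a constant count that does not fit in 32 bits forces DImode on
   64-bit targets, a constant that does fit is counted in SImode, and a
   count already held in a register simply keeps its own mode.  */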
22247 static enum machine_mode
22248 counter_mode (rtx count_exp)
22250 if (GET_MODE (count_exp) != VOIDmode)
22251 return GET_MODE (count_exp);
22252 if (!CONST_INT_P (count_exp))
22253 return Pmode;
22254 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
22255 return DImode;
22256 return SImode;
22259 /* Copy the address to a Pmode register. This is used for x32 to
22260 truncate DImode TLS address to a SImode register. */
22262 static rtx
22263 ix86_copy_addr_to_reg (rtx addr)
22265 if (GET_MODE (addr) == Pmode)
22266 return copy_addr_to_reg (addr);
22267 else
22269 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
22270 return gen_rtx_SUBREG (SImode, copy_to_mode_reg (DImode, addr), 0);
22274 /* When ISSETMEM is FALSE, output a simple loop to move memory pointed to by
22275 SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times; the overall size
22276 is COUNT, specified in bytes. When ISSETMEM is TRUE, output the equivalent
22277 loop to set memory by VALUE (supposed to be in MODE).
22279 The size is rounded down to a whole number of chunks moved at once.
22280 SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info. */
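/* Roughly, for the !ISSETMEM case the emitted code does (illustrative
   pseudo C):

     piece = GET_MODE_SIZE (mode) * unroll;
     size = count & ~(piece - 1);
     for (iter = 0; iter < size; iter += piece)
       copy piece bytes from srcptr + iter to destptr + iter;
     destptr += iter; srcptr += iter;

   leaving the remaining count % piece bytes for an epilogue.  */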
22283 static void
22284 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
22285 rtx destptr, rtx srcptr, rtx value,
22286 rtx count, enum machine_mode mode, int unroll,
22287 int expected_size, bool issetmem)
22289 rtx out_label, top_label, iter, tmp;
22290 enum machine_mode iter_mode = counter_mode (count);
22291 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
22292 rtx piece_size = GEN_INT (piece_size_n);
22293 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
22294 rtx size;
22295 int i;
22297 top_label = gen_label_rtx ();
22298 out_label = gen_label_rtx ();
22299 iter = gen_reg_rtx (iter_mode);
22301 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
22302 NULL, 1, OPTAB_DIRECT);
22303 /* Those two should combine. */
22304 if (piece_size == const1_rtx)
22306 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
22307 true, out_label);
22308 predict_jump (REG_BR_PROB_BASE * 10 / 100);
22310 emit_move_insn (iter, const0_rtx);
22312 emit_label (top_label);
22314 tmp = convert_modes (Pmode, iter_mode, iter, true);
22316 /* This assert could be relaxed - in this case we'll need to compute
22317 the smallest power of two contained in PIECE_SIZE_N and pass it to
22318 offset_address. */
22319 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
22320 destmem = offset_address (destmem, tmp, piece_size_n);
22321 destmem = adjust_address (destmem, mode, 0);
22323 if (!issetmem)
22325 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
22326 srcmem = adjust_address (srcmem, mode, 0);
22328 /* When unrolling for chips that reorder memory reads and writes,
22329 we can save registers by using a single temporary.
22330 Also, using 4 temporaries is overkill in 32-bit mode. */
22331 if (!TARGET_64BIT && 0)
22333 for (i = 0; i < unroll; i++)
22335 if (i)
22337 destmem =
22338 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22339 srcmem =
22340 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
22342 emit_move_insn (destmem, srcmem);
22345 else
22347 rtx tmpreg[4];
22348 gcc_assert (unroll <= 4);
22349 for (i = 0; i < unroll; i++)
22351 tmpreg[i] = gen_reg_rtx (mode);
22352 if (i)
22354 srcmem =
22355 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
22357 emit_move_insn (tmpreg[i], srcmem);
22359 for (i = 0; i < unroll; i++)
22361 if (i)
22363 destmem =
22364 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22366 emit_move_insn (destmem, tmpreg[i]);
22370 else
22371 for (i = 0; i < unroll; i++)
22373 if (i)
22374 destmem =
22375 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22376 emit_move_insn (destmem, value);
22379 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
22380 true, OPTAB_LIB_WIDEN);
22381 if (tmp != iter)
22382 emit_move_insn (iter, tmp);
22384 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
22385 true, top_label);
22386 if (expected_size != -1)
22388 expected_size /= GET_MODE_SIZE (mode) * unroll;
22389 if (expected_size == 0)
22390 predict_jump (0);
22391 else if (expected_size > REG_BR_PROB_BASE)
22392 predict_jump (REG_BR_PROB_BASE - 1);
22393 else
22394 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
22396 else
22397 predict_jump (REG_BR_PROB_BASE * 80 / 100);
22398 iter = ix86_zero_extend_to_Pmode (iter);
22399 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
22400 true, OPTAB_LIB_WIDEN);
22401 if (tmp != destptr)
22402 emit_move_insn (destptr, tmp);
22403 if (!issetmem)
22405 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
22406 true, OPTAB_LIB_WIDEN);
22407 if (tmp != srcptr)
22408 emit_move_insn (srcptr, tmp);
22410 emit_label (out_label);
22413 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
22414 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
22415 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
22416 For setmem case, VALUE is a promoted to a wider size ORIG_VALUE.
22417 ORIG_VALUE is the original value passed to memset to fill the memory with.
22418 Other arguments have same meaning as for previous function. */
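/* E.g. for a memset to zero whose constant count is a multiple of 4, the
   QImode request is widened to SImode below, so a single "rep stosd"
   executing count/4 iterations is emitted instead of a "rep stosb".  */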
22420 static void
22421 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
22422 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
22423 rtx count,
22424 enum machine_mode mode, bool issetmem)
22426 rtx destexp;
22427 rtx srcexp;
22428 rtx countreg;
22429 HOST_WIDE_INT rounded_count;
22431 /* If possible, it is shorter to use rep movs.
22432 TODO: Maybe it is better to move this logic to decide_alg. */
22433 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
22434 && (!issetmem || orig_value == const0_rtx))
22435 mode = SImode;
22437 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
22438 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
22440 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
22441 GET_MODE_SIZE (mode)));
22442 if (mode != QImode)
22444 destexp = gen_rtx_ASHIFT (Pmode, countreg,
22445 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
22446 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
22448 else
22449 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
22450 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
22452 rounded_count = (INTVAL (count)
22453 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
22454 destmem = shallow_copy_rtx (destmem);
22455 set_mem_size (destmem, rounded_count);
22457 else if (MEM_SIZE_KNOWN_P (destmem))
22458 clear_mem_size (destmem);
22460 if (issetmem)
22462 value = force_reg (mode, gen_lowpart (mode, value));
22463 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
22465 else
22467 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
22468 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
22469 if (mode != QImode)
22471 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
22472 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
22473 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
22475 else
22476 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
22477 if (CONST_INT_P (count))
22479 rounded_count = (INTVAL (count)
22480 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
22481 srcmem = shallow_copy_rtx (srcmem);
22482 set_mem_size (srcmem, rounded_count);
22484 else
22486 if (MEM_SIZE_KNOWN_P (srcmem))
22487 clear_mem_size (srcmem);
22489 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
22490 destexp, srcexp));
22494 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
22495 DESTMEM.
22496 SRCMEM is passed by pointer so that it can be updated on return.
22497 The return value is the updated DESTMEM. */
22498 static rtx
22499 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
22500 HOST_WIDE_INT size_to_move)
22502 rtx dst = destmem, src = *srcmem, adjust, tempreg;
22503 enum insn_code code;
22504 enum machine_mode move_mode;
22505 int piece_size, i;
22507 /* Find the widest mode in which we could perform moves.
22508 Start with the biggest power of 2 less than SIZE_TO_MOVE and halve
22509 it until a move of such size is supported. */
22510 piece_size = 1 << floor_log2 (size_to_move);
22511 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
22512 code = optab_handler (mov_optab, move_mode);
22513 while (code == CODE_FOR_nothing && piece_size > 1)
22515 piece_size >>= 1;
22516 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
22517 code = optab_handler (mov_optab, move_mode);
22520 /* Find the corresponding vector mode with the same size as MOVE_MODE.
22521 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
22522 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
22524 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
22525 move_mode = mode_for_vector (word_mode, nunits);
22526 code = optab_handler (mov_optab, move_mode);
22527 if (code == CODE_FOR_nothing)
22529 move_mode = word_mode;
22530 piece_size = GET_MODE_SIZE (move_mode);
22531 code = optab_handler (mov_optab, move_mode);
22534 gcc_assert (code != CODE_FOR_nothing);
22536 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
22537 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
22539 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
22540 gcc_assert (size_to_move % piece_size == 0);
22541 adjust = GEN_INT (piece_size);
22542 for (i = 0; i < size_to_move; i += piece_size)
22544 /* We move from memory to memory, so we'll need to do it via
22545 a temporary register. */
22546 tempreg = gen_reg_rtx (move_mode);
22547 emit_insn (GEN_FCN (code) (tempreg, src));
22548 emit_insn (GEN_FCN (code) (dst, tempreg));
22550 emit_move_insn (destptr,
22551 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
22552 emit_move_insn (srcptr,
22553 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
22555 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
22556 piece_size);
22557 src = adjust_automodify_address_nv (src, move_mode, srcptr,
22558 piece_size);
22561 /* Update DST and SRC rtx. */
22562 *srcmem = src;
22563 return dst;
22566 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
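/* E.g. with a constant COUNT and MAX_SIZE of 16, a remainder of 11 bytes
   (8 + 2 + 1) is copied with one 8-byte, one 2-byte and one 1-byte move,
   walking the bits that are set in COUNT % MAX_SIZE.  */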
22567 static void
22568 expand_movmem_epilogue (rtx destmem, rtx srcmem,
22569 rtx destptr, rtx srcptr, rtx count, int max_size)
22571 rtx src, dest;
22572 if (CONST_INT_P (count))
22574 HOST_WIDE_INT countval = INTVAL (count);
22575 HOST_WIDE_INT epilogue_size = countval % max_size;
22576 int i;
22578 /* For now MAX_SIZE should be a power of 2. This assert could be
22579 relaxed, but it'll require a somewhat more complicated epilogue
22580 expansion. */
22581 gcc_assert ((max_size & (max_size - 1)) == 0);
22582 for (i = max_size; i >= 1; i >>= 1)
22584 if (epilogue_size & i)
22585 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
22587 return;
22589 if (max_size > 8)
22591 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
22592 count, 1, OPTAB_DIRECT);
22593 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
22594 count, QImode, 1, 4, false);
22595 return;
22598 /* When there are stringops, we can cheaply increase dest and src pointers.
22599 Otherwise we save code size by maintaining offset (zero is readily
22600 available from preceding rep operation) and using x86 addressing modes.
22602 if (TARGET_SINGLE_STRINGOP)
22604 if (max_size > 4)
22606 rtx label = ix86_expand_aligntest (count, 4, true);
22607 src = change_address (srcmem, SImode, srcptr);
22608 dest = change_address (destmem, SImode, destptr);
22609 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22610 emit_label (label);
22611 LABEL_NUSES (label) = 1;
22613 if (max_size > 2)
22615 rtx label = ix86_expand_aligntest (count, 2, true);
22616 src = change_address (srcmem, HImode, srcptr);
22617 dest = change_address (destmem, HImode, destptr);
22618 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22619 emit_label (label);
22620 LABEL_NUSES (label) = 1;
22622 if (max_size > 1)
22624 rtx label = ix86_expand_aligntest (count, 1, true);
22625 src = change_address (srcmem, QImode, srcptr);
22626 dest = change_address (destmem, QImode, destptr);
22627 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22628 emit_label (label);
22629 LABEL_NUSES (label) = 1;
22632 else
22634 rtx offset = force_reg (Pmode, const0_rtx);
22635 rtx tmp;
22637 if (max_size > 4)
22639 rtx label = ix86_expand_aligntest (count, 4, true);
22640 src = change_address (srcmem, SImode, srcptr);
22641 dest = change_address (destmem, SImode, destptr);
22642 emit_move_insn (dest, src);
22643 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
22644 true, OPTAB_LIB_WIDEN);
22645 if (tmp != offset)
22646 emit_move_insn (offset, tmp);
22647 emit_label (label);
22648 LABEL_NUSES (label) = 1;
22650 if (max_size > 2)
22652 rtx label = ix86_expand_aligntest (count, 2, true);
22653 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22654 src = change_address (srcmem, HImode, tmp);
22655 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22656 dest = change_address (destmem, HImode, tmp);
22657 emit_move_insn (dest, src);
22658 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
22659 true, OPTAB_LIB_WIDEN);
22660 if (tmp != offset)
22661 emit_move_insn (offset, tmp);
22662 emit_label (label);
22663 LABEL_NUSES (label) = 1;
22665 if (max_size > 1)
22667 rtx label = ix86_expand_aligntest (count, 1, true);
22668 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22669 src = change_address (srcmem, QImode, tmp);
22670 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22671 dest = change_address (destmem, QImode, tmp);
22672 emit_move_insn (dest, src);
22673 emit_label (label);
22674 LABEL_NUSES (label) = 1;
22679 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
22680 with value PROMOTED_VAL.
22681 The return value is the updated DESTMEM. */
22683 static rtx
22684 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
22685 HOST_WIDE_INT size_to_move)
22687 rtx dst = destmem, adjust;
22688 enum insn_code code;
22689 enum machine_mode move_mode;
22690 int piece_size, i;
22692 /* Find the widest mode in which we could perform moves.
22693 Start with the biggest power of 2 less than SIZE_TO_MOVE and halve
22694 it until a move of such size is supported. */
22695 move_mode = GET_MODE (promoted_val);
22696 if (move_mode == VOIDmode)
22697 move_mode = QImode;
22698 if (size_to_move < GET_MODE_SIZE (move_mode))
22700 move_mode = mode_for_size (size_to_move * BITS_PER_UNIT, MODE_INT, 0);
22701 promoted_val = gen_lowpart (move_mode, promoted_val);
22703 piece_size = GET_MODE_SIZE (move_mode);
22704 code = optab_handler (mov_optab, move_mode);
22705 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
22707 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
22709 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
22710 gcc_assert (size_to_move % piece_size == 0);
22711 adjust = GEN_INT (piece_size);
22712 for (i = 0; i < size_to_move; i += piece_size)
22714 if (piece_size <= GET_MODE_SIZE (word_mode))
22716 emit_insn (gen_strset (destptr, dst, promoted_val));
22717 continue;
22720 emit_insn (GEN_FCN (code) (dst, promoted_val));
22722 emit_move_insn (destptr,
22723 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
22725 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
22726 piece_size);
22729 /* Update DST rtx. */
22730 return dst;
22732 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22733 static void
22734 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
22735 rtx count, int max_size)
22737 count =
22738 expand_simple_binop (counter_mode (count), AND, count,
22739 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
22740 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
22741 gen_lowpart (QImode, value), count, QImode,
22742 1, max_size / 2, true);
22745 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22746 static void
22747 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
22748 rtx count, int max_size)
22750 rtx dest;
22752 if (CONST_INT_P (count))
22754 HOST_WIDE_INT countval = INTVAL (count);
22755 HOST_WIDE_INT epilogue_size = countval % max_size;
22756 int i;
22758 /* For now MAX_SIZE should be a power of 2. This assert could be
22759 relaxed, but it'll require a somewhat more complicated epilogue
22760 expansion. */
22761 gcc_assert ((max_size & (max_size - 1)) == 0);
22762 for (i = max_size; i >= 1; i >>= 1)
22764 if (epilogue_size & i)
22766 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
22767 destmem = emit_memset (destmem, destptr, vec_value, i);
22768 else
22769 destmem = emit_memset (destmem, destptr, value, i);
22772 return;
22774 if (max_size > 32)
22776 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
22777 return;
22779 if (max_size > 16)
22781 rtx label = ix86_expand_aligntest (count, 16, true);
22782 if (TARGET_64BIT)
22784 dest = change_address (destmem, DImode, destptr);
22785 emit_insn (gen_strset (destptr, dest, value));
22786 emit_insn (gen_strset (destptr, dest, value));
22788 else
22790 dest = change_address (destmem, SImode, destptr);
22791 emit_insn (gen_strset (destptr, dest, value));
22792 emit_insn (gen_strset (destptr, dest, value));
22793 emit_insn (gen_strset (destptr, dest, value));
22794 emit_insn (gen_strset (destptr, dest, value));
22796 emit_label (label);
22797 LABEL_NUSES (label) = 1;
22799 if (max_size > 8)
22801 rtx label = ix86_expand_aligntest (count, 8, true);
22802 if (TARGET_64BIT)
22804 dest = change_address (destmem, DImode, destptr);
22805 emit_insn (gen_strset (destptr, dest, value));
22807 else
22809 dest = change_address (destmem, SImode, destptr);
22810 emit_insn (gen_strset (destptr, dest, value));
22811 emit_insn (gen_strset (destptr, dest, value));
22813 emit_label (label);
22814 LABEL_NUSES (label) = 1;
22816 if (max_size > 4)
22818 rtx label = ix86_expand_aligntest (count, 4, true);
22819 dest = change_address (destmem, SImode, destptr);
22820 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22821 emit_label (label);
22822 LABEL_NUSES (label) = 1;
22824 if (max_size > 2)
22826 rtx label = ix86_expand_aligntest (count, 2, true);
22827 dest = change_address (destmem, HImode, destptr);
22828 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22829 emit_label (label);
22830 LABEL_NUSES (label) = 1;
22832 if (max_size > 1)
22834 rtx label = ix86_expand_aligntest (count, 1, true);
22835 dest = change_address (destmem, QImode, destptr);
22836 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22837 emit_label (label);
22838 LABEL_NUSES (label) = 1;
22842 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
22843 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
22844 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
22845 ignored.
22846 Return value is updated DESTMEM. */
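/* E.g. with ALIGN of 1 and DESIRED_ALIGNMENT of 16, tests of bits 1, 2,
   4 and 8 of DESTPTR are emitted; whenever a bit is set, that many bytes
   are copied (or set) and COUNT is decreased accordingly, so the main
   loop can start on a 16-byte boundary.  */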
22847 static rtx
22848 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
22849 rtx destptr, rtx srcptr, rtx value,
22850 rtx vec_value, rtx count, int align,
22851 int desired_alignment, bool issetmem)
22853 int i;
22854 for (i = 1; i < desired_alignment; i <<= 1)
22856 if (align <= i)
22858 rtx label = ix86_expand_aligntest (destptr, i, false);
22859 if (issetmem)
22861 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
22862 destmem = emit_memset (destmem, destptr, vec_value, i);
22863 else
22864 destmem = emit_memset (destmem, destptr, value, i);
22866 else
22867 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
22868 ix86_adjust_counter (count, i);
22869 emit_label (label);
22870 LABEL_NUSES (label) = 1;
22871 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
22874 return destmem;
22877 /* Test if COUNT&SIZE is nonzero and if so, expand movmem
22878 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
22879 and jump to DONE_LABEL. */
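/* Illustrative example: for SIZE == 4 and a runtime COUNT in the range 4..7,
   the COUNT & 4 test succeeds, so the generated sequence copies (or stores)
   4 bytes at the start of the block and 4 bytes ending at DESTPTR + COUNT,
   covering the whole block with two possibly overlapping accesses, and then
   jumps to DONE_LABEL.  */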
22880 static void
22881 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
22882 rtx destptr, rtx srcptr,
22883 rtx value, rtx vec_value,
22884 rtx count, int size,
22885 rtx done_label, bool issetmem)
22887 rtx label = ix86_expand_aligntest (count, size, false);
22888 enum machine_mode mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 1);
22889 rtx modesize;
22890 int n;
22892 /* If we do not have a vector value to copy, we must reduce the size.  */
22893 if (issetmem)
22895 if (!vec_value)
22897 if (GET_MODE (value) == VOIDmode && size > 8)
22898 mode = Pmode;
22899 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
22900 mode = GET_MODE (value);
22902 else
22903 mode = GET_MODE (vec_value), value = vec_value;
22905 else
22907 /* Choose appropriate vector mode. */
22908 if (size >= 32)
22909 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
22910 else if (size >= 16)
22911 mode = TARGET_SSE ? V16QImode : DImode;
22912 srcmem = change_address (srcmem, mode, srcptr);
22914 destmem = change_address (destmem, mode, destptr);
22915 modesize = GEN_INT (GET_MODE_SIZE (mode));
22916 gcc_assert (GET_MODE_SIZE (mode) <= size);
22917 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
22919 if (issetmem)
22920 emit_move_insn (destmem, gen_lowpart (mode, value));
22921 else
22923 emit_move_insn (destmem, srcmem);
22924 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
22926 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
22929 destmem = offset_address (destmem, count, 1);
22930 destmem = offset_address (destmem, GEN_INT (-size - GET_MODE_SIZE (mode)),
22931 GET_MODE_SIZE (mode));
22932 if (issetmem)
22933 emit_move_insn (destmem, gen_lowpart (mode, value));
22934 else
22936 srcmem = offset_address (srcmem, count, 1);
22937 srcmem = offset_address (srcmem, GEN_INT (-size - GET_MODE_SIZE (mode)),
22938 GET_MODE_SIZE (mode));
22939 emit_move_insn (destmem, srcmem);
22941 emit_jump_insn (gen_jump (done_label));
22942 emit_barrier ();
22944 emit_label (label);
22945 LABEL_NUSES (label) = 1;
22948 /* Handle small memcpy (up to SIZE, which is supposed to be a small power of 2)
22949    and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
22950    bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way we can
22951    proceed with a loop copying SIZE bytes at once.  Do moves in MODE.
22952 DONE_LABEL is a label after the whole copying sequence. The label is created
22953 on demand if *DONE_LABEL is NULL.
22954 MIN_SIZE is minimal size of block copied. This value gets adjusted for new
22955 bounds after the initial copies.
22957    DESTMEM/SRCMEM are memory expressions pointing to the copied block,
22958    DESTPTR/SRCPTR are pointers to the block.  DYNAMIC_CHECK indicates whether
22959 we will dispatch to a library call for large blocks.
22961 In pseudocode we do:
22963 if (COUNT < SIZE)
22965 Assume that SIZE is 4. Bigger sizes are handled analogously
22966 if (COUNT & 4)
22968 copy 4 bytes from SRCPTR to DESTPTR
22969 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
22970 goto done_label
22972 if (!COUNT)
22973 goto done_label;
22974 copy 1 byte from SRCPTR to DESTPTR
22975 if (COUNT & 2)
22977 copy 2 bytes from SRCPTR to DESTPTR
22978 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
22981 else
22983 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
22984 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
22986 OLD_DESTPTR = DESTPTR;
22987 Align DESTPTR up to DESIRED_ALIGN
22988 SRCPTR += DESTPTR - OLD_DESTPTR
22989 COUNT -= DESTPTR - OLD_DESTPTR
22990 if (DYNAMIC_CHECK)
22991 Round COUNT down to multiple of SIZE
22992 << optional caller supplied zero size guard is here >>
22993 << optional caller supplied dynamic check is here >>
22994 << caller supplied main copy loop is here >>
22996 done_label:
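/* Illustrative walk-through of the pseudocode above: for a copy with
   SIZE == 16, ALIGN == 1, DESIRED_ALIGN == 16 and a runtime COUNT known to
   be at least 16, the second branch copies the first 16 bytes and the last
   16 bytes with possibly misaligned moves, rounds DESTPTR up to a 16-byte
   boundary, adjusts SRCPTR and COUNT by the same amount, and lets the
   caller's main loop copy aligned 16-byte chunks; the tail left over by the
   rounded-down loop is already covered by the last-16-bytes copy.  */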
22998 static void
22999 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
23000 rtx *destptr, rtx *srcptr,
23001 enum machine_mode mode,
23002 rtx value, rtx vec_value,
23003 rtx *count,
23004 rtx *done_label,
23005 int size,
23006 int desired_align,
23007 int align,
23008 unsigned HOST_WIDE_INT *min_size,
23009 bool dynamic_check,
23010 bool issetmem)
23012 rtx loop_label = NULL, label;
23013 int n;
23014 rtx modesize;
23015 int prolog_size = 0;
23016 rtx mode_value;
23018 /* Choose the proper value to copy.  */
23019 if (issetmem && VECTOR_MODE_P (mode))
23020 mode_value = vec_value;
23021 else
23022 mode_value = value;
23023 gcc_assert (GET_MODE_SIZE (mode) <= size);
23025 /* See if block is big or small, handle small blocks. */
23026 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
23028 int size2 = size;
23029 loop_label = gen_label_rtx ();
23031 if (!*done_label)
23032 *done_label = gen_label_rtx ();
23034 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
23035 1, loop_label);
23036 size2 >>= 1;
23038 /* Handle sizes > 3. */
23039 for (;size2 > 2; size2 >>= 1)
23040 expand_small_movmem_or_setmem (destmem, srcmem,
23041 *destptr, *srcptr,
23042 value, vec_value,
23043 *count,
23044 size2, *done_label, issetmem);
23045 /* Nothing to copy? Jump to DONE_LABEL if so */
23046 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
23047 1, *done_label);
23049 /* Do a byte copy. */
23050 destmem = change_address (destmem, QImode, *destptr);
23051 if (issetmem)
23052 emit_move_insn (destmem, gen_lowpart (QImode, value));
23053 else
23055 srcmem = change_address (srcmem, QImode, *srcptr);
23056 emit_move_insn (destmem, srcmem);
23059 /* Handle sizes 2 and 3. */
23060 label = ix86_expand_aligntest (*count, 2, false);
23061 destmem = change_address (destmem, HImode, *destptr);
23062 destmem = offset_address (destmem, *count, 1);
23063 destmem = offset_address (destmem, GEN_INT (-2), 2);
23064 if (issetmem)
23065 emit_move_insn (destmem, gen_lowpart (HImode, value));
23066 else
23068 srcmem = change_address (srcmem, HImode, *srcptr);
23069 srcmem = offset_address (srcmem, *count, 1);
23070 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
23071 emit_move_insn (destmem, srcmem);
23074 emit_label (label);
23075 LABEL_NUSES (label) = 1;
23076 emit_jump_insn (gen_jump (*done_label));
23077 emit_barrier ();
23079 else
23080 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
23081 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
23083 /* Start memcpy for COUNT >= SIZE. */
23084 if (loop_label)
23086 emit_label (loop_label);
23087 LABEL_NUSES (loop_label) = 1;
23090 /* Copy first desired_align bytes. */
23091 if (!issetmem)
23092 srcmem = change_address (srcmem, mode, *srcptr);
23093 destmem = change_address (destmem, mode, *destptr);
23094 modesize = GEN_INT (GET_MODE_SIZE (mode));
23095 for (n = 0; prolog_size < desired_align - align; n++)
23097 if (issetmem)
23098 emit_move_insn (destmem, mode_value);
23099 else
23101 emit_move_insn (destmem, srcmem);
23102 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
23104 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
23105 prolog_size += GET_MODE_SIZE (mode);
23109 /* Copy last SIZE bytes. */
23110 destmem = offset_address (destmem, *count, 1);
23111 destmem = offset_address (destmem,
23112 GEN_INT (-size - prolog_size),
23114 if (issetmem)
23115 emit_move_insn (destmem, mode_value);
23116 else
23118 srcmem = offset_address (srcmem, *count, 1);
23119 srcmem = offset_address (srcmem,
23120 GEN_INT (-size - prolog_size),
23122 emit_move_insn (destmem, srcmem);
23124 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
23126 destmem = offset_address (destmem, modesize, 1);
23127 if (issetmem)
23128 emit_move_insn (destmem, mode_value);
23129 else
23131 srcmem = offset_address (srcmem, modesize, 1);
23132 emit_move_insn (destmem, srcmem);
23136 /* Align destination. */
23137 if (desired_align > 1 && desired_align > align)
23139 rtx saveddest = *destptr;
23141 gcc_assert (desired_align <= size);
23142 /* Align destptr up, placing it in a new register.  */
23143 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
23144 GEN_INT (prolog_size),
23145 NULL_RTX, 1, OPTAB_DIRECT);
23146 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
23147 GEN_INT (-desired_align),
23148 *destptr, 1, OPTAB_DIRECT);
23149 /* See how many bytes we skipped. */
23150 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
23151 *destptr,
23152 saveddest, 1, OPTAB_DIRECT);
23153 /* Adjust srcptr and count. */
23154 if (!issetmem)
23155 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr, saveddest,
23156 *srcptr, 1, OPTAB_DIRECT);
23157 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
23158 saveddest, *count, 1, OPTAB_DIRECT);
23159 /* We copied at most size + prolog_size. */
23160 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
23161 *min_size = (*min_size - size) & ~(unsigned HOST_WIDE_INT)(size - 1);
23162 else
23163 *min_size = 0;
23165 /* Our loops always round down the block size, but for dispatch to library
23166    we need the precise value.  */
23167 if (dynamic_check)
23168 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
23169 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
23171 else
23173 gcc_assert (prolog_size == 0);
23174 /* Decrease count, so we won't end up copying last word twice. */
23175 if (!CONST_INT_P (*count))
23176 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
23177 constm1_rtx, *count, 1, OPTAB_DIRECT);
23178 else
23179 *count = GEN_INT ((UINTVAL (*count) - 1) & ~(unsigned HOST_WIDE_INT)(size - 1));
23180 if (*min_size)
23181 *min_size = (*min_size - 1) & ~(unsigned HOST_WIDE_INT)(size - 1);
23186 /* This function is like the previous one, except here we know how many bytes
23187 need to be copied. That allows us to update alignment not only of DST, which
23188 is returned, but also of SRC, which is passed as a pointer for that
23189 reason. */
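/* Illustrative example (derived from the loop below): with DESIRED_ALIGN == 8
   and ALIGN_BYTES == 5 (binary 101), the loop emits a 1-byte and then a
   4-byte store (or copy), one per set bit, so exactly 5 bytes are handled
   and DST becomes 8-byte aligned.  */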
23190 static rtx
23191 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
23192 rtx srcreg, rtx value, rtx vec_value,
23193 int desired_align, int align_bytes,
23194 bool issetmem)
23196 rtx src = NULL;
23197 rtx orig_dst = dst;
23198 rtx orig_src = NULL;
23199 int piece_size = 1;
23200 int copied_bytes = 0;
23202 if (!issetmem)
23204 gcc_assert (srcp != NULL);
23205 src = *srcp;
23206 orig_src = src;
23209 for (piece_size = 1;
23210 piece_size <= desired_align && copied_bytes < align_bytes;
23211 piece_size <<= 1)
23213 if (align_bytes & piece_size)
23215 if (issetmem)
23217 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
23218 dst = emit_memset (dst, destreg, vec_value, piece_size);
23219 else
23220 dst = emit_memset (dst, destreg, value, piece_size);
23222 else
23223 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
23224 copied_bytes += piece_size;
23227 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
23228 set_mem_align (dst, desired_align * BITS_PER_UNIT);
23229 if (MEM_SIZE_KNOWN_P (orig_dst))
23230 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
23232 if (!issetmem)
23234 int src_align_bytes = get_mem_align_offset (src, desired_align
23235 * BITS_PER_UNIT);
23236 if (src_align_bytes >= 0)
23237 src_align_bytes = desired_align - src_align_bytes;
23238 if (src_align_bytes >= 0)
23240 unsigned int src_align;
23241 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
23243 if ((src_align_bytes & (src_align - 1))
23244 == (align_bytes & (src_align - 1)))
23245 break;
23247 if (src_align > (unsigned int) desired_align)
23248 src_align = desired_align;
23249 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
23250 set_mem_align (src, src_align * BITS_PER_UNIT);
23252 if (MEM_SIZE_KNOWN_P (orig_src))
23253 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
23254 *srcp = src;
23257 return dst;
23260 /* Return true if ALG can be used in the current context.
23261    Assume we expand memset if MEMSET is true.  */
23262 static bool
23263 alg_usable_p (enum stringop_alg alg, bool memset)
23265 if (alg == no_stringop)
23266 return false;
23267 if (alg == vector_loop)
23268 return TARGET_SSE || TARGET_AVX;
23269 /* Algorithms using the rep prefix want at least edi and ecx;
23270 additionally, memset wants eax and memcpy wants esi. Don't
23271 consider such algorithms if the user has appropriated those
23272 registers for their own purposes. */
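/* For reference: "rep movsb" implicitly uses %ecx (count), %esi (source) and
   %edi (destination), while "rep stosb" uses %ecx, %edi and %eax (the stored
   value), which is why these algorithms are rejected when those registers
   are fixed.  */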
23273 if (alg == rep_prefix_1_byte
23274 || alg == rep_prefix_4_byte
23275 || alg == rep_prefix_8_byte)
23276 return !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
23277 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
23278 return true;
23281 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
23282 static enum stringop_alg
23283 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
23284 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
23285 bool memset, bool zero_memset, int *dynamic_check, bool *noalign)
23287 const struct stringop_algs * algs;
23288 bool optimize_for_speed;
23289 int max = -1;
23290 const struct processor_costs *cost;
23291 int i;
23292 bool any_alg_usable_p = false;
23294 *noalign = false;
23295 *dynamic_check = -1;
23297 /* Even if the string operation call is cold, we still might spend a lot
23298 of time processing large blocks. */
23299 if (optimize_function_for_size_p (cfun)
23300 || (optimize_insn_for_size_p ()
23301 && (max_size < 256
23302 || (expected_size != -1 && expected_size < 256))))
23303 optimize_for_speed = false;
23304 else
23305 optimize_for_speed = true;
23307 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
23308 if (memset)
23309 algs = &cost->memset[TARGET_64BIT != 0];
23310 else
23311 algs = &cost->memcpy[TARGET_64BIT != 0];
23313 /* See maximal size for user defined algorithm. */
23314 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
23316 enum stringop_alg candidate = algs->size[i].alg;
23317 bool usable = alg_usable_p (candidate, memset);
23318 any_alg_usable_p |= usable;
23320 if (candidate != libcall && candidate && usable)
23321 max = algs->size[i].max;
23324 /* If the expected size is not known but the max size is small enough
23325    that the inline version is a win, set the expected size into
23326    the range.  */
23327 if (max > 1 && (unsigned HOST_WIDE_INT)max >= max_size && expected_size == -1)
23328 expected_size = min_size / 2 + max_size / 2;
23330 /* If the user specified the algorithm, honor it if possible.  */
23331 if (ix86_stringop_alg != no_stringop
23332 && alg_usable_p (ix86_stringop_alg, memset))
23333 return ix86_stringop_alg;
23334 /* rep; movq or rep; movl is the smallest variant. */
23335 else if (!optimize_for_speed)
23337 *noalign = true;
23338 if (!count || (count & 3) || (memset && !zero_memset))
23339 return alg_usable_p (rep_prefix_1_byte, memset)
23340 ? rep_prefix_1_byte : loop_1_byte;
23341 else
23342 return alg_usable_p (rep_prefix_4_byte, memset)
23343 ? rep_prefix_4_byte : loop;
23345 /* Very tiny blocks are best handled via the loop; REP is expensive to
23346    set up.  */
23347 else if (expected_size != -1 && expected_size < 4)
23348 return loop_1_byte;
23349 else if (expected_size != -1)
23351 enum stringop_alg alg = libcall;
23352 bool alg_noalign = false;
23353 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
23355 /* We get here if the algorithms that were not libcall-based
23356 were rep-prefix based and we are unable to use rep prefixes
23357 based on global register usage. Break out of the loop and
23358 use the heuristic below. */
23359 if (algs->size[i].max == 0)
23360 break;
23361 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
23363 enum stringop_alg candidate = algs->size[i].alg;
23365 if (candidate != libcall && alg_usable_p (candidate, memset))
23367 alg = candidate;
23368 alg_noalign = algs->size[i].noalign;
23370 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
23371 last non-libcall inline algorithm. */
23372 if (TARGET_INLINE_ALL_STRINGOPS)
23374 /* When the current size is best to be copied by a libcall,
23375 but we are still forced to inline, run the heuristic below
23376 that will pick code for medium sized blocks. */
23377 if (alg != libcall)
23379 *noalign = alg_noalign;
23380 return alg;
23382 break;
23384 else if (alg_usable_p (candidate, memset))
23386 *noalign = algs->size[i].noalign;
23387 return candidate;
23392 /* When asked to inline the call anyway, try to pick a meaningful choice.
23393    We look for the maximal size of block that is faster to copy by hand and
23394    take blocks of at most that size, guessing that the average size will
23395    be roughly half of the block.
23397 If this turns out to be bad, we might simply specify the preferred
23398 choice in ix86_costs. */
23399 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
23400 && (algs->unknown_size == libcall
23401 || !alg_usable_p (algs->unknown_size, memset)))
23403 enum stringop_alg alg;
23405 /* If there aren't any usable algorithms, then recursing on
23406 smaller sizes isn't going to find anything. Just return the
23407 simple byte-at-a-time copy loop. */
23408 if (!any_alg_usable_p)
23410 /* Pick something reasonable. */
23411 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
23412 *dynamic_check = 128;
23413 return loop_1_byte;
23415 if (max == -1)
23416 max = 4096;
23417 alg = decide_alg (count, max / 2, min_size, max_size, memset,
23418 zero_memset, dynamic_check, noalign);
23419 gcc_assert (*dynamic_check == -1);
23420 gcc_assert (alg != libcall);
23421 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
23422 *dynamic_check = max;
23423 return alg;
23425 return (alg_usable_p (algs->unknown_size, memset)
23426 ? algs->unknown_size : libcall);
23429 /* Decide on alignment. We know that the operand is already aligned to ALIGN
23430 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
23431 static int
23432 decide_alignment (int align,
23433 enum stringop_alg alg,
23434 int expected_size,
23435 enum machine_mode move_mode)
23437 int desired_align = 0;
23439 gcc_assert (alg != no_stringop);
23441 if (alg == libcall)
23442 return 0;
23443 if (move_mode == VOIDmode)
23444 return 0;
23446 desired_align = GET_MODE_SIZE (move_mode);
23447 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
23448    copying a whole cache line at once.  */
23449 if (TARGET_PENTIUMPRO
23450 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
23451 desired_align = 8;
23453 if (optimize_size)
23454 desired_align = 1;
23455 if (desired_align < align)
23456 desired_align = align;
23457 if (expected_size != -1 && expected_size < 4)
23458 desired_align = align;
23460 return desired_align;
23464 /* Helper function for memset.  For QImode value 0xXY produce
23465    0xXYXYXYXY of the width specified by MODE.  This is essentially
23466    a * 0x01010101, but we can do slightly better than
23467 synth_mult by unwinding the sequence by hand on CPUs with
23468 slow multiply. */
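/* Illustrative example: for MODE == SImode and VAL == 0x5A, the constant
   path below computes 0x5A5A5A5A directly, while the non-constant path
   builds it with "reg |= reg << 8" followed by "reg |= reg << 16" (using an
   insv-style insert for the first step when partial register stalls are not
   a concern), and adds a final "reg |= reg << 32" step for DImode.  */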
23469 static rtx
23470 promote_duplicated_reg (enum machine_mode mode, rtx val)
23472 enum machine_mode valmode = GET_MODE (val);
23473 rtx tmp;
23474 int nops = mode == DImode ? 3 : 2;
23476 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
23477 if (val == const0_rtx)
23478 return copy_to_mode_reg (mode, CONST0_RTX (mode));
23479 if (CONST_INT_P (val))
23481 HOST_WIDE_INT v = INTVAL (val) & 255;
23483 v |= v << 8;
23484 v |= v << 16;
23485 if (mode == DImode)
23486 v |= (v << 16) << 16;
23487 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
23490 if (valmode == VOIDmode)
23491 valmode = QImode;
23492 if (valmode != QImode)
23493 val = gen_lowpart (QImode, val);
23494 if (mode == QImode)
23495 return val;
23496 if (!TARGET_PARTIAL_REG_STALL)
23497 nops--;
23498 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
23499 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
23500 <= (ix86_cost->shift_const + ix86_cost->add) * nops
23501 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
23503 rtx reg = convert_modes (mode, QImode, val, true);
23504 tmp = promote_duplicated_reg (mode, const1_rtx);
23505 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
23506 OPTAB_DIRECT);
23508 else
23510 rtx reg = convert_modes (mode, QImode, val, true);
23512 if (!TARGET_PARTIAL_REG_STALL)
23513 if (mode == SImode)
23514 emit_insn (gen_movsi_insv_1 (reg, reg));
23515 else
23516 emit_insn (gen_movdi_insv_1 (reg, reg));
23517 else
23519 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
23520 NULL, 1, OPTAB_DIRECT);
23521 reg =
23522 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23524 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
23525 NULL, 1, OPTAB_DIRECT);
23526 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23527 if (mode == SImode)
23528 return reg;
23529 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
23530 NULL, 1, OPTAB_DIRECT);
23531 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23532 return reg;
23536 /* Duplicate the value VAL using promote_duplicated_reg into the maximal size
23537    that will be needed by the main loop copying SIZE_NEEDED chunks and by the
23538    prologue getting alignment from ALIGN to DESIRED_ALIGN.  */
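/* Illustrative example: on a 64-bit target with SIZE_NEEDED == 8 the value is
   broadcast to a DImode pattern; with SIZE_NEEDED == 2 and no extra alignment
   work it is only widened to HImode; and when a single byte store suffices
   VAL is returned unchanged.  */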
23539 static rtx
23540 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
23541 int align)
23543 rtx promoted_val;
23545 if (TARGET_64BIT
23546 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
23547 promoted_val = promote_duplicated_reg (DImode, val);
23548 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
23549 promoted_val = promote_duplicated_reg (SImode, val);
23550 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
23551 promoted_val = promote_duplicated_reg (HImode, val);
23552 else
23553 promoted_val = val;
23555 return promoted_val;
23558 /* Expand string move (memcpy) or store (memset) operation.  Use i386 string
23559 operations when profitable. The code depends upon architecture, block size
23560 and alignment, but always has one of the following overall structures:
23562 Aligned move sequence:
23564 1) Prologue guard: Conditional that jumps up to epilogues for small
23565 blocks that can be handled by epilogue alone. This is faster
23566    but also needed for correctness, since the prologue assumes the block
23567 is larger than the desired alignment.
23569 Optional dynamic check for size and libcall for large
23570 blocks is emitted here too, with -minline-stringops-dynamically.
23572 2) Prologue: copy first few bytes in order to get destination
23573 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
23574 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
23575 copied. We emit either a jump tree on power of two sized
23576 blocks, or a byte loop.
23578 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
23579 with specified algorithm.
23581 4) Epilogue: code copying tail of the block that is too small to be
23582 handled by main body (or up to size guarded by prologue guard).
23584 Misaligned move sequence
23586 1) misaligned move prologue/epilogue containing:
23587 a) Prologue handling small memory blocks and jumping to done_label
23588 (skipped if blocks are known to be large enough)
23589 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment is
23590 needed by single possibly misaligned move
23591 (skipped if alignment is not needed)
23592 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
23594 2) Zero size guard dispatching to done_label, if needed
23596 3) dispatch to library call, if needed,
23598 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
23599 with specified algorithm. */
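/* Illustrative example of the aligned sequence: a memset of a block of
   unknown size and alignment expanded with the unrolled loop on a 64-bit
   target works on 8-byte words with an unroll factor of 4, so SIZE_NEEDED
   is 32; the prologue guard branches to the epilogue for counts below the
   epilogue threshold, the prologue stores up to 7 bytes to align the
   destination, the main loop stores 32 bytes per iteration, and the
   epilogue handles the remaining tail.  */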
23600 static bool
23601 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
23602 rtx align_exp, rtx expected_align_exp,
23603 rtx expected_size_exp, bool issetmem)
23605 rtx destreg;
23606 rtx srcreg = NULL;
23607 rtx label = NULL;
23608 rtx tmp;
23609 rtx jump_around_label = NULL;
23610 HOST_WIDE_INT align = 1;
23611 unsigned HOST_WIDE_INT count = 0;
23612 HOST_WIDE_INT expected_size = -1;
23613 int size_needed = 0, epilogue_size_needed;
23614 int desired_align = 0, align_bytes = 0;
23615 enum stringop_alg alg;
23616 rtx promoted_val = NULL;
23617 rtx vec_promoted_val = NULL;
23618 bool force_loopy_epilogue = false;
23619 int dynamic_check;
23620 bool need_zero_guard = false;
23621 bool noalign;
23622 enum machine_mode move_mode = VOIDmode;
23623 int unroll_factor = 1;
23624 /* TODO: Once value ranges are available, fill in proper data.  */
23625 unsigned HOST_WIDE_INT min_size = 0;
23626 unsigned HOST_WIDE_INT max_size = -1;
23627 bool misaligned_prologue_used = false;
23629 if (CONST_INT_P (align_exp))
23630 align = INTVAL (align_exp);
23631 /* i386 can do misaligned access at a reasonably increased cost.  */
23632 if (CONST_INT_P (expected_align_exp)
23633 && INTVAL (expected_align_exp) > align)
23634 align = INTVAL (expected_align_exp);
23635 /* ALIGN is the minimum of destination and source alignment, but we care here
23636 just about destination alignment. */
23637 else if (!issetmem
23638 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
23639 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
23641 if (CONST_INT_P (count_exp))
23642 min_size = max_size = count = expected_size = INTVAL (count_exp);
23643 if (CONST_INT_P (expected_size_exp) && count == 0)
23644 expected_size = INTVAL (expected_size_exp);
23646 /* Make sure we don't need to care about overflow later on. */
23647 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
23648 return false;
23650 /* Step 0: Decide on preferred algorithm, desired alignment and
23651 size of chunks to be copied by main loop. */
23652 alg = decide_alg (count, expected_size, min_size, max_size, issetmem,
23653 issetmem && val_exp == const0_rtx,
23654 &dynamic_check, &noalign);
23655 if (alg == libcall)
23656 return false;
23657 gcc_assert (alg != no_stringop);
23659 /* For now the vector version of memset is generated only for memory zeroing, as
23660    creating the promoted vector value is very cheap in this case.  */
23661 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
23662 alg = unrolled_loop;
23664 if (!count)
23665 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
23666 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
23667 if (!issetmem)
23668 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
23670 unroll_factor = 1;
23671 move_mode = word_mode;
23672 switch (alg)
23674 case libcall:
23675 case no_stringop:
23676 case last_alg:
23677 gcc_unreachable ();
23678 case loop_1_byte:
23679 need_zero_guard = true;
23680 move_mode = QImode;
23681 break;
23682 case loop:
23683 need_zero_guard = true;
23684 break;
23685 case unrolled_loop:
23686 need_zero_guard = true;
23687 unroll_factor = (TARGET_64BIT ? 4 : 2);
23688 break;
23689 case vector_loop:
23690 need_zero_guard = true;
23691 unroll_factor = 4;
23692 /* Find the widest supported mode. */
23693 move_mode = word_mode;
23694 while (optab_handler (mov_optab, GET_MODE_WIDER_MODE (move_mode))
23695 != CODE_FOR_nothing)
23696 move_mode = GET_MODE_WIDER_MODE (move_mode);
23698 /* Find the corresponding vector mode with the same size as MOVE_MODE.
23699 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
23700 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
23702 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
23703 move_mode = mode_for_vector (word_mode, nunits);
23704 if (optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
23705 move_mode = word_mode;
23707 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
23708 break;
23709 case rep_prefix_8_byte:
23710 move_mode = DImode;
23711 break;
23712 case rep_prefix_4_byte:
23713 move_mode = SImode;
23714 break;
23715 case rep_prefix_1_byte:
23716 move_mode = QImode;
23717 break;
23719 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
23720 epilogue_size_needed = size_needed;
23722 desired_align = decide_alignment (align, alg, expected_size, move_mode);
23723 if (!TARGET_ALIGN_STRINGOPS || noalign)
23724 align = desired_align;
23726 /* Step 1: Prologue guard. */
23728 /* Alignment code needs count to be in register. */
23729 if (CONST_INT_P (count_exp) && desired_align > align)
23731 if (INTVAL (count_exp) > desired_align
23732 && INTVAL (count_exp) > size_needed)
23734 align_bytes
23735 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
23736 if (align_bytes <= 0)
23737 align_bytes = 0;
23738 else
23739 align_bytes = desired_align - align_bytes;
23741 if (align_bytes == 0)
23742 count_exp = force_reg (counter_mode (count_exp), count_exp);
23744 gcc_assert (desired_align >= 1 && align >= 1);
23746 /* Misaligned move sequences handle both prologues and epilogues at once.
23747    Default code generation results in smaller code for large alignments and
23748    also avoids redundant work when sizes are known precisely.  */
23749 misaligned_prologue_used = (TARGET_MISALIGNED_MOVE_STRING_PROLOGUES
23750 && MAX (desired_align, epilogue_size_needed) <= 32
23751 && ((desired_align > align && !align_bytes)
23752 || (!count && epilogue_size_needed > 1)));
23754 /* Do the cheap promotion to allow better CSE across the
23755    main loop and epilogue (i.e. one load of the big constant in
23756    front of all the code).
23757    For now the misaligned move sequences do not have a fast path
23758    without broadcasting.  */
23759 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
23761 if (alg == vector_loop)
23763 gcc_assert (val_exp == const0_rtx);
23764 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
23765 promoted_val = promote_duplicated_reg_to_size (val_exp,
23766 GET_MODE_SIZE (word_mode),
23767 desired_align, align);
23769 else
23771 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23772 desired_align, align);
23775 /* Misaligned move sequences handle both prologues and epilogues at once.
23776    Default code generation results in smaller code for large alignments and
23777    also avoids redundant work when sizes are known precisely.  */
23778 if (misaligned_prologue_used)
23780 /* Misaligned move prologue handles small blocks by itself.  */
23781 misaligned_prologue_used = true;
23782 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
23783 (dst, src, &destreg, &srcreg,
23784 move_mode, promoted_val, vec_promoted_val,
23785 &count_exp,
23786 &jump_around_label,
23787 desired_align < align
23788 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
23789 desired_align, align, &min_size, dynamic_check, issetmem);
23790 if (!issetmem)
23791 src = change_address (src, BLKmode, srcreg);
23792 dst = change_address (dst, BLKmode, destreg);
23793 set_mem_align (dst, desired_align * BITS_PER_UNIT);
23794 epilogue_size_needed = 0;
23795 if (need_zero_guard && !min_size)
23797 /* It is possible that we copied enough so the main loop will not
23798 execute. */
23799 gcc_assert (size_needed > 1);
23800 if (jump_around_label == NULL_RTX)
23801 jump_around_label = gen_label_rtx ();
23802 emit_cmp_and_jump_insns (count_exp,
23803 GEN_INT (size_needed),
23804 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
23805 if (expected_size == -1
23806 || expected_size < (desired_align - align) / 2 + size_needed)
23807 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23808 else
23809 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23812 /* Ensure that alignment prologue won't copy past end of block. */
23813 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
23815 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
23816 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
23817 Make sure it is power of 2. */
23818 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
23820 /* To improve performance of small blocks, we jump around the VAL
23821    promoting code.  This means that if the promoted VAL is not constant,
23822    we might not use it in the epilogue and have to use the byte
23823    loop variant.  */
23824 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
23825 force_loopy_epilogue = true;
23826 if (count)
23828 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23830 /* If main algorithm works on QImode, no epilogue is needed.
23831 For small sizes just don't align anything. */
23832 if (size_needed == 1)
23833 desired_align = align;
23834 else
23835 goto epilogue;
23838 else if (min_size < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23840 gcc_assert (max_size >= (unsigned HOST_WIDE_INT)epilogue_size_needed);
23841 label = gen_label_rtx ();
23842 emit_cmp_and_jump_insns (count_exp,
23843 GEN_INT (epilogue_size_needed),
23844 LTU, 0, counter_mode (count_exp), 1, label);
23845 if (expected_size == -1 || expected_size < epilogue_size_needed)
23846 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23847 else
23848 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23852 /* Emit code to decide at runtime whether a library call or inline code
23853    should be used.  */
23854 if (dynamic_check != -1)
23856 if (!issetmem && CONST_INT_P (count_exp))
23858 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
23860 emit_block_move_via_libcall (dst, src, count_exp, false);
23861 count_exp = const0_rtx;
23862 goto epilogue;
23865 else
23867 rtx hot_label = gen_label_rtx ();
23868 jump_around_label = gen_label_rtx ();
23869 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
23870 LEU, 0, GET_MODE (count_exp), 1, hot_label);
23871 predict_jump (REG_BR_PROB_BASE * 90 / 100);
23872 if (issetmem)
23873 set_storage_via_libcall (dst, count_exp, val_exp, false);
23874 else
23875 emit_block_move_via_libcall (dst, src, count_exp, false);
23876 emit_jump (jump_around_label);
23877 emit_label (hot_label);
23881 /* Step 2: Alignment prologue. */
23882 /* Do the expensive promotion once we branched off the small blocks. */
23883 if (issetmem && !promoted_val)
23884 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23885 desired_align, align);
23887 if (desired_align > align && !misaligned_prologue_used)
23889 if (align_bytes == 0)
23891 /* Except for the first move in the prologue, we no longer know
23892    the constant offset in aliasing info.  It doesn't seem worth
23893    the pain to maintain it for the first move, so throw away
23894    the info early.  */
23895 dst = change_address (dst, BLKmode, destreg);
23896 if (!issetmem)
23897 src = change_address (src, BLKmode, srcreg);
23898 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
23899 promoted_val, vec_promoted_val,
23900 count_exp, align, desired_align,
23901 issetmem);
23902 /* At most desired_align - align bytes are copied. */
23903 if (min_size < (unsigned)(desired_align - align))
23904 min_size = 0;
23905 else
23906 min_size -= desired_align - align;
23908 else
23910 /* If we know how many bytes need to be stored before dst is
23911 sufficiently aligned, maintain aliasing info accurately. */
23912 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
23913 srcreg,
23914 promoted_val,
23915 vec_promoted_val,
23916 desired_align,
23917 align_bytes,
23918 issetmem);
23920 count_exp = plus_constant (counter_mode (count_exp),
23921 count_exp, -align_bytes);
23922 count -= align_bytes;
23923 min_size -= align_bytes;
23924 max_size -= align_bytes;
23926 if (need_zero_guard
23927 && !min_size
23928 && (count < (unsigned HOST_WIDE_INT) size_needed
23929 || (align_bytes == 0
23930 && count < ((unsigned HOST_WIDE_INT) size_needed
23931 + desired_align - align))))
23933 /* It is possible that we copied enough so the main loop will not
23934 execute. */
23935 gcc_assert (size_needed > 1);
23936 if (label == NULL_RTX)
23937 label = gen_label_rtx ();
23938 emit_cmp_and_jump_insns (count_exp,
23939 GEN_INT (size_needed),
23940 LTU, 0, counter_mode (count_exp), 1, label);
23941 if (expected_size == -1
23942 || expected_size < (desired_align - align) / 2 + size_needed)
23943 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23944 else
23945 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23948 if (label && size_needed == 1)
23950 emit_label (label);
23951 LABEL_NUSES (label) = 1;
23952 label = NULL;
23953 epilogue_size_needed = 1;
23954 if (issetmem)
23955 promoted_val = val_exp;
23957 else if (label == NULL_RTX && !misaligned_prologue_used)
23958 epilogue_size_needed = size_needed;
23960 /* Step 3: Main loop. */
23962 switch (alg)
23964 case libcall:
23965 case no_stringop:
23966 case last_alg:
23967 gcc_unreachable ();
23968 case loop_1_byte:
23969 case loop:
23970 case unrolled_loop:
23971 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
23972 count_exp, move_mode, unroll_factor,
23973 expected_size, issetmem);
23974 break;
23975 case vector_loop:
23976 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
23977 vec_promoted_val, count_exp, move_mode,
23978 unroll_factor, expected_size, issetmem);
23979 break;
23980 case rep_prefix_8_byte:
23981 case rep_prefix_4_byte:
23982 case rep_prefix_1_byte:
23983 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
23984 val_exp, count_exp, move_mode, issetmem);
23985 break;
23987 /* Properly adjust the offsets of the src and dest memory for aliasing.  */
23988 if (CONST_INT_P (count_exp))
23990 if (!issetmem)
23991 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
23992 (count / size_needed) * size_needed);
23993 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
23994 (count / size_needed) * size_needed);
23996 else
23998 if (!issetmem)
23999 src = change_address (src, BLKmode, srcreg);
24000 dst = change_address (dst, BLKmode, destreg);
24003 /* Step 4: Epilogue to copy the remaining bytes. */
24004 epilogue:
24005 if (label)
24007 /* When the main loop is done, COUNT_EXP might hold the original count,
24008    while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
24009    Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
24010    bytes.  Compensate if needed.  */
24012 if (size_needed < epilogue_size_needed)
24014 tmp =
24015 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
24016 GEN_INT (size_needed - 1), count_exp, 1,
24017 OPTAB_DIRECT);
24018 if (tmp != count_exp)
24019 emit_move_insn (count_exp, tmp);
24021 emit_label (label);
24022 LABEL_NUSES (label) = 1;
24025 if (count_exp != const0_rtx && epilogue_size_needed > 1)
24027 if (force_loopy_epilogue)
24028 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
24029 epilogue_size_needed);
24030 else
24032 if (issetmem)
24033 expand_setmem_epilogue (dst, destreg, promoted_val,
24034 vec_promoted_val, count_exp,
24035 epilogue_size_needed);
24036 else
24037 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
24038 epilogue_size_needed);
24041 if (jump_around_label)
24042 emit_label (jump_around_label);
24043 return true;
24046 /* Wrapper for ix86_expand_set_or_movmem for memcpy case. */
24047 bool
24048 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
24049 rtx expected_align_exp, rtx expected_size_exp)
24051 return ix86_expand_set_or_movmem (dst, src, count_exp, NULL, align_exp,
24052 expected_align_exp, expected_size_exp, false);
24055 /* Wrapper for ix86_expand_set_or_movmem for memset case. */
24056 bool
24057 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
24058 rtx expected_align_exp, rtx expected_size_exp)
24060 return ix86_expand_set_or_movmem (dst, NULL, count_exp, val_exp, align_exp,
24061 expected_align_exp, expected_size_exp, true);
24065 /* Expand the appropriate insns for doing strlen if not just doing
24066 repnz; scasb
24068 out = result, initialized with the start address
24069 align_rtx = alignment of the address.
24070 scratch = scratch register, initialized with the start address when
24071 not aligned, otherwise undefined
24073 This is just the body. It needs the initializations mentioned above and
24074 some address computing at the end. These things are done in i386.md. */
24076 static void
24077 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
24079 int align;
24080 rtx tmp;
24081 rtx align_2_label = NULL_RTX;
24082 rtx align_3_label = NULL_RTX;
24083 rtx align_4_label = gen_label_rtx ();
24084 rtx end_0_label = gen_label_rtx ();
24085 rtx mem;
24086 rtx tmpreg = gen_reg_rtx (SImode);
24087 rtx scratch = gen_reg_rtx (SImode);
24088 rtx cmp;
24090 align = 0;
24091 if (CONST_INT_P (align_rtx))
24092 align = INTVAL (align_rtx);
24094 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
24096 /* Is there a known alignment and is it less than 4? */
24097 if (align < 4)
24099 rtx scratch1 = gen_reg_rtx (Pmode);
24100 emit_move_insn (scratch1, out);
24101 /* Is there a known alignment and is it not 2? */
24102 if (align != 2)
24104 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
24105 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
24107 /* Leave just the 3 lower bits. */
24108 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
24109 NULL_RTX, 0, OPTAB_WIDEN);
24111 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
24112 Pmode, 1, align_4_label);
24113 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
24114 Pmode, 1, align_2_label);
24115 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
24116 Pmode, 1, align_3_label);
24118 else
24120 /* Since the alignment is 2, we have to check 2 or 0 bytes;
24121    check whether it is aligned to 4 bytes.  */
24123 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
24124 NULL_RTX, 0, OPTAB_WIDEN);
24126 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
24127 Pmode, 1, align_4_label);
24130 mem = change_address (src, QImode, out);
24132 /* Now compare the bytes. */
24134 /* Compare the first n unaligned bytes on a byte-by-byte basis.  */
24135 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
24136 QImode, 1, end_0_label);
24138 /* Increment the address. */
24139 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
24141 /* Not needed with an alignment of 2 */
24142 if (align != 2)
24144 emit_label (align_2_label);
24146 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
24147 end_0_label);
24149 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
24151 emit_label (align_3_label);
24154 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
24155 end_0_label);
24157 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
24160 /* Generate a loop to check 4 bytes at a time.  It is not a good idea to
24161    align this loop; it only makes the program larger and does not help
24162    to speed it up.  */
24163 emit_label (align_4_label);
24165 mem = change_address (src, SImode, out);
24166 emit_move_insn (scratch, mem);
24167 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
24169 /* This formula yields a nonzero result iff one of the bytes is zero.
24170    This saves three branches inside the loop and many cycles.  */
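/* Worked example: for the word 0x00636261 (the bytes "abc\0" loaded
   little-endian), (w - 0x01010101) & ~w & 0x80808080 equals 0x80000000,
   flagging the high byte that holds the zero; for a word with no zero byte
   the result is 0.  */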
24172 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
24173 emit_insn (gen_one_cmplsi2 (scratch, scratch));
24174 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
24175 emit_insn (gen_andsi3 (tmpreg, tmpreg,
24176 gen_int_mode (0x80808080, SImode)));
24177 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
24178 align_4_label);
24180 if (TARGET_CMOVE)
24182 rtx reg = gen_reg_rtx (SImode);
24183 rtx reg2 = gen_reg_rtx (Pmode);
24184 emit_move_insn (reg, tmpreg);
24185 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
24187 /* If zero is not in the first two bytes, move two bytes forward. */
24188 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
24189 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
24190 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
24191 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
24192 gen_rtx_IF_THEN_ELSE (SImode, tmp,
24193 reg,
24194 tmpreg)));
24195 /* Emit lea manually to avoid clobbering of flags. */
24196 emit_insn (gen_rtx_SET (SImode, reg2,
24197 gen_rtx_PLUS (Pmode, out, const2_rtx)));
24199 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
24200 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
24201 emit_insn (gen_rtx_SET (VOIDmode, out,
24202 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
24203 reg2,
24204 out)));
24206 else
24208 rtx end_2_label = gen_label_rtx ();
24209 /* Is zero in the first two bytes? */
24211 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
24212 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
24213 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
24214 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
24215 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
24216 pc_rtx);
24217 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
24218 JUMP_LABEL (tmp) = end_2_label;
24220 /* Not in the first two. Move two bytes forward. */
24221 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
24222 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
24224 emit_label (end_2_label);
24228 /* Avoid branch in fixing the byte. */
24229 tmpreg = gen_lowpart (QImode, tmpreg);
24230 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
24231 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
24232 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
24233 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
24235 emit_label (end_0_label);
24238 /* Expand strlen. */
24240 bool
24241 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
24243 rtx addr, scratch1, scratch2, scratch3, scratch4;
24245 /* The generic case of the strlen expander is long.  Avoid expanding it
24246    unless TARGET_INLINE_ALL_STRINGOPS.  */
24248 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
24249 && !TARGET_INLINE_ALL_STRINGOPS
24250 && !optimize_insn_for_size_p ()
24251 && (!CONST_INT_P (align) || INTVAL (align) < 4))
24252 return false;
24254 addr = force_reg (Pmode, XEXP (src, 0));
24255 scratch1 = gen_reg_rtx (Pmode);
24257 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
24258 && !optimize_insn_for_size_p ())
24260 /* Well it seems that some optimizer does not combine a call like
24261 foo(strlen(bar), strlen(bar));
24262 when the move and the subtraction are done here.  It does calculate
24263 the length just once when these instructions are done inside of
24264 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
24265 often used and I use one fewer register for the lifetime of
24266 output_strlen_unroll() this is better. */
24268 emit_move_insn (out, addr);
24270 ix86_expand_strlensi_unroll_1 (out, src, align);
24272 /* strlensi_unroll_1 returns the address of the zero at the end of
24273 the string, like memchr(), so compute the length by subtracting
24274 the start address. */
24275 emit_insn (ix86_gen_sub3 (out, out, addr));
24277 else
24279 rtx unspec;
24281 /* Can't use this if the user has appropriated eax, ecx, or edi. */
24282 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
24283 return false;
24285 scratch2 = gen_reg_rtx (Pmode);
24286 scratch3 = gen_reg_rtx (Pmode);
24287 scratch4 = force_reg (Pmode, constm1_rtx);
24289 emit_move_insn (scratch3, addr);
24290 eoschar = force_reg (QImode, eoschar);
24292 src = replace_equiv_address_nv (src, scratch3);
24294 /* If .md starts supporting :P, this can be done in .md. */
24295 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
24296 scratch4), UNSPEC_SCAS);
24297 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
24298 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
24299 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
24301 return true;
24304 /* For a given symbol (function), construct code to compute the address of its
24305    PLT entry in the large x86-64 PIC model.  */
24306 static rtx
24307 construct_plt_address (rtx symbol)
24309 rtx tmp, unspec;
24311 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
24312 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
24313 gcc_assert (Pmode == DImode);
24315 tmp = gen_reg_rtx (Pmode);
24316 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
24318 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
24319 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
24320 return tmp;
24324 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
24325 rtx callarg2,
24326 rtx pop, bool sibcall)
24328 unsigned int const cregs_size
24329 = ARRAY_SIZE (x86_64_ms_sysv_extra_clobbered_registers);
24330 rtx vec[3 + cregs_size];
24331 rtx use = NULL, call;
24332 unsigned int vec_len = 0;
24334 if (pop == const0_rtx)
24335 pop = NULL;
24336 gcc_assert (!TARGET_64BIT || !pop);
24338 if (TARGET_MACHO && !TARGET_64BIT)
24340 #if TARGET_MACHO
24341 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
24342 fnaddr = machopic_indirect_call_target (fnaddr);
24343 #endif
24345 else
24347 /* Static functions and indirect calls don't need the pic register. */
24348 if (flag_pic
24349 && (!TARGET_64BIT
24350 || (ix86_cmodel == CM_LARGE_PIC
24351 && DEFAULT_ABI != MS_ABI))
24352 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
24353 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
24354 use_reg (&use, pic_offset_table_rtx);
24357 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
24359 rtx al = gen_rtx_REG (QImode, AX_REG);
24360 emit_move_insn (al, callarg2);
24361 use_reg (&use, al);
24364 if (ix86_cmodel == CM_LARGE_PIC
24365 && !TARGET_PECOFF
24366 && MEM_P (fnaddr)
24367 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
24368 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
24369 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
24370 else if (sibcall
24371 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
24372 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
24374 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
24375 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
24378 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
24379 if (retval)
24380 call = gen_rtx_SET (VOIDmode, retval, call);
24381 vec[vec_len++] = call;
24383 if (pop)
24385 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
24386 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
24387 vec[vec_len++] = pop;
24390 if (TARGET_64BIT_MS_ABI
24391 && (!callarg2 || INTVAL (callarg2) != -2))
24393 unsigned i;
24395 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
24396 UNSPEC_MS_TO_SYSV_CALL);
24398 for (i = 0; i < cregs_size; i++)
24400 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
24401 enum machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
24403 vec[vec_len++]
24404 = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (mode, regno));
24408 if (vec_len > 1)
24409 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
24410 call = emit_call_insn (call);
24411 if (use)
24412 CALL_INSN_FUNCTION_USAGE (call) = use;
24414 return call;
24417 /* Output the assembly for a call instruction. */
24419 const char *
24420 ix86_output_call_insn (rtx insn, rtx call_op)
24422 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
24423 bool seh_nop_p = false;
24424 const char *xasm;
24426 if (SIBLING_CALL_P (insn))
24428 if (direct_p)
24429 xasm = "%!jmp\t%P0";
24430 /* SEH epilogue detection requires the indirect branch case
24431 to include REX.W. */
24432 else if (TARGET_SEH)
24433 xasm = "%!rex.W jmp %A0";
24434 else
24435 xasm = "%!jmp\t%A0";
24437 output_asm_insn (xasm, &call_op);
24438 return "";
24441 /* SEH unwinding can require an extra nop to be emitted in several
24442 circumstances. Determine if we have one of those. */
24443 if (TARGET_SEH)
24445 rtx i;
24447 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
24449 /* If we get to another real insn, we don't need the nop. */
24450 if (INSN_P (i))
24451 break;
24453 /* If we get to the epilogue note, prevent a catch region from
24454 being adjacent to the standard epilogue sequence. If non-
24455 call-exceptions, we'll have done this during epilogue emission. */
24456 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
24457 && !flag_non_call_exceptions
24458 && !can_throw_internal (insn))
24460 seh_nop_p = true;
24461 break;
24465 /* If we didn't find a real insn following the call, prevent the
24466 unwinder from looking into the next function. */
24467 if (i == NULL)
24468 seh_nop_p = true;
24471 if (direct_p)
24472 xasm = "%!call\t%P0";
24473 else
24474 xasm = "%!call\t%A0";
24476 output_asm_insn (xasm, &call_op);
24478 if (seh_nop_p)
24479 return "nop";
24481 return "";
24484 /* Clear stack slot assignments remembered from previous functions.
24485 This is called from INIT_EXPANDERS once before RTL is emitted for each
24486 function. */
24488 static struct machine_function *
24489 ix86_init_machine_status (void)
24491 struct machine_function *f;
24493 f = ggc_alloc_cleared_machine_function ();
24494 f->use_fast_prologue_epilogue_nregs = -1;
24495 f->call_abi = ix86_abi;
24497 return f;
24500 /* Return a MEM corresponding to a stack slot with mode MODE.
24501 Allocate a new slot if necessary.
24503 The RTL for a function can have several slots available: N is
24504 which slot to use. */
24507 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
24509 struct stack_local_entry *s;
24511 gcc_assert (n < MAX_386_STACK_LOCALS);
24513 for (s = ix86_stack_locals; s; s = s->next)
24514 if (s->mode == mode && s->n == n)
24515 return validize_mem (copy_rtx (s->rtl));
24517 s = ggc_alloc_stack_local_entry ();
24518 s->n = n;
24519 s->mode = mode;
24520 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
24522 s->next = ix86_stack_locals;
24523 ix86_stack_locals = s;
24524 return validize_mem (s->rtl);
24527 static void
24528 ix86_instantiate_decls (void)
24530 struct stack_local_entry *s;
24532 for (s = ix86_stack_locals; s; s = s->next)
24533 if (s->rtl != NULL_RTX)
24534 instantiate_decl_rtl (s->rtl);
24537 /* Calculate the length of the memory address in the instruction encoding.
24538 Includes the addr32 prefix; does not include the one-byte modrm, opcode,
24539 or other prefixes.  We never generate the addr32 prefix for the LEA insn.  */
24542 memory_address_length (rtx addr, bool lea)
24544 struct ix86_address parts;
24545 rtx base, index, disp;
24546 int len;
24547 int ok;
24549 if (GET_CODE (addr) == PRE_DEC
24550 || GET_CODE (addr) == POST_INC
24551 || GET_CODE (addr) == PRE_MODIFY
24552 || GET_CODE (addr) == POST_MODIFY)
24553 return 0;
24555 ok = ix86_decompose_address (addr, &parts);
24556 gcc_assert (ok);
24558 len = (parts.seg == SEG_DEFAULT) ? 0 : 1;
24560 /* If this is not LEA instruction, add the length of addr32 prefix. */
24561 if (TARGET_64BIT && !lea
24562 && (SImode_address_operand (addr, VOIDmode)
24563 || (parts.base && GET_MODE (parts.base) == SImode)
24564 || (parts.index && GET_MODE (parts.index) == SImode)))
24565 len++;
24567 base = parts.base;
24568 index = parts.index;
24569 disp = parts.disp;
24571 if (base && GET_CODE (base) == SUBREG)
24572 base = SUBREG_REG (base);
24573 if (index && GET_CODE (index) == SUBREG)
24574 index = SUBREG_REG (index);
24576 gcc_assert (base == NULL_RTX || REG_P (base));
24577 gcc_assert (index == NULL_RTX || REG_P (index));
24579 /* Rule of thumb:
24580 - esp as the base always wants an index,
24581 - ebp as the base always wants a displacement,
24582 - r12 as the base always wants an index,
24583 - r13 as the base always wants a displacement. */
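/* For instance, a plain (%eax) needs neither extra byte; (%esp) needs a SIB
   byte because its register code in the modrm byte selects SIB addressing;
   and (%ebp) must be encoded as 0(%ebp) with a disp8 because its register
   code with mod 00 means "no base, disp32".  r12 and r13 inherit the same
   quirks in 64-bit mode.  */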
24585 /* Register Indirect. */
24586 if (base && !index && !disp)
24588 /* esp (for its index) and ebp (for its displacement) need
24589 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
24590 code. */
24591 if (base == arg_pointer_rtx
24592 || base == frame_pointer_rtx
24593 || REGNO (base) == SP_REG
24594 || REGNO (base) == BP_REG
24595 || REGNO (base) == R12_REG
24596 || REGNO (base) == R13_REG)
24597 len++;
24600 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
24601 is not disp32, but disp32(%rip), so for disp32
24602 SIB byte is needed, unless print_operand_address
24603 optimizes it into disp32(%rip) or (%rip) is implied
24604 by UNSPEC. */
24605 else if (disp && !base && !index)
24607 len += 4;
24608 if (TARGET_64BIT)
24610 rtx symbol = disp;
24612 if (GET_CODE (disp) == CONST)
24613 symbol = XEXP (disp, 0);
24614 if (GET_CODE (symbol) == PLUS
24615 && CONST_INT_P (XEXP (symbol, 1)))
24616 symbol = XEXP (symbol, 0);
24618 if (GET_CODE (symbol) != LABEL_REF
24619 && (GET_CODE (symbol) != SYMBOL_REF
24620 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
24621 && (GET_CODE (symbol) != UNSPEC
24622 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
24623 && XINT (symbol, 1) != UNSPEC_PCREL
24624 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
24625 len++;
24628 else
24630 /* Find the length of the displacement constant. */
24631 if (disp)
24633 if (base && satisfies_constraint_K (disp))
24634 len += 1;
24635 else
24636 len += 4;
24638 /* ebp always wants a displacement. Similarly r13. */
24639 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
24640 len++;
24642 /* An index requires the two-byte modrm form.... */
24643 if (index
24644 /* ...like esp (or r12), which always wants an index. */
24645 || base == arg_pointer_rtx
24646 || base == frame_pointer_rtx
24647 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
24648 len++;
24651 return len;
24654 /* Compute default value for "length_immediate" attribute. When SHORTFORM
24655 is set, expect that the insn has an 8-bit immediate alternative. */
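/* Illustrative examples of the values computed below: with SHORTFORM,
   "addl $100, %eax" can use the sign-extended 8-bit immediate form
   (length 1), while "addl $1000, %eax" needs a full 32-bit immediate
   (length 4); DImode immediates are likewise encoded as 32-bit
   sign-extended values (length 4).  */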
24657 ix86_attr_length_immediate_default (rtx insn, bool shortform)
24659 int len = 0;
24660 int i;
24661 extract_insn_cached (insn);
24662 for (i = recog_data.n_operands - 1; i >= 0; --i)
24663 if (CONSTANT_P (recog_data.operand[i]))
24665 enum attr_mode mode = get_attr_mode (insn);
24667 gcc_assert (!len);
24668 if (shortform && CONST_INT_P (recog_data.operand[i]))
24670 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
24671 switch (mode)
24673 case MODE_QI:
24674 len = 1;
24675 continue;
24676 case MODE_HI:
24677 ival = trunc_int_for_mode (ival, HImode);
24678 break;
24679 case MODE_SI:
24680 ival = trunc_int_for_mode (ival, SImode);
24681 break;
24682 default:
24683 break;
24685 if (IN_RANGE (ival, -128, 127))
24687 len = 1;
24688 continue;
24691 switch (mode)
24693 case MODE_QI:
24694 len = 1;
24695 break;
24696 case MODE_HI:
24697 len = 2;
24698 break;
24699 case MODE_SI:
24700 len = 4;
24701 break;
24702 /* Immediates for DImode instructions are encoded
24703 as 32-bit sign-extended values. */
24704 case MODE_DI:
24705 len = 4;
24706 break;
24707 default:
24708 fatal_insn ("unknown insn mode", insn);
24711 return len;
24714 /* Compute default value for "length_address" attribute. */
24716 ix86_attr_length_address_default (rtx insn)
24718 int i;
24720 if (get_attr_type (insn) == TYPE_LEA)
24722 rtx set = PATTERN (insn), addr;
24724 if (GET_CODE (set) == PARALLEL)
24725 set = XVECEXP (set, 0, 0);
24727 gcc_assert (GET_CODE (set) == SET);
24729 addr = SET_SRC (set);
24731 return memory_address_length (addr, true);
24734 extract_insn_cached (insn);
24735 for (i = recog_data.n_operands - 1; i >= 0; --i)
24736 if (MEM_P (recog_data.operand[i]))
24738 constrain_operands_cached (reload_completed);
24739 if (which_alternative != -1)
24741 const char *constraints = recog_data.constraints[i];
24742 int alt = which_alternative;
24744 while (*constraints == '=' || *constraints == '+')
24745 constraints++;
24746 while (alt-- > 0)
24747 while (*constraints++ != ',')
24749 /* Skip ignored operands. */
24750 if (*constraints == 'X')
24751 continue;
24753 return memory_address_length (XEXP (recog_data.operand[i], 0), false);
24755 return 0;
24758 /* Compute default value for "length_vex" attribute. It includes
24759 2 or 3 byte VEX prefix and 1 opcode byte. */
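/* Illustrative examples: a 0f-opcode insn such as "vaddps %xmm1, %xmm2, %xmm3"
   with neither VEX.W nor extended registers can use the 2-byte VEX prefix
   (2 + 1 opcode byte = 3), whereas a DImode general-register operand (REX.W)
   or an extended register mentioned in a memory operand forces the 3-byte
   prefix (3 + 1 = 4).  */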
24762 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
24764 int i;
24766 /* Only 0f opcode can use 2 byte VEX prefix and VEX W bit uses 3
24767 byte VEX prefix. */
24768 if (!has_0f_opcode || has_vex_w)
24769 return 3 + 1;
24771 /* We can always use 2 byte VEX prefix in 32bit. */
24772 if (!TARGET_64BIT)
24773 return 2 + 1;
24775 extract_insn_cached (insn);
24777 for (i = recog_data.n_operands - 1; i >= 0; --i)
24778 if (REG_P (recog_data.operand[i]))
24780 /* REX.W bit uses 3 byte VEX prefix. */
24781 if (GET_MODE (recog_data.operand[i]) == DImode
24782 && GENERAL_REG_P (recog_data.operand[i]))
24783 return 3 + 1;
24785 else
24787 /* REX.X or REX.B bits use 3 byte VEX prefix. */
24788 if (MEM_P (recog_data.operand[i])
24789 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
24790 return 3 + 1;
24793 return 2 + 1;
24796 /* Return the maximum number of instructions a cpu can issue. */
24798 static int
24799 ix86_issue_rate (void)
24801 switch (ix86_tune)
24803 case PROCESSOR_PENTIUM:
24804 case PROCESSOR_ATOM:
24805 case PROCESSOR_SLM:
24806 case PROCESSOR_K6:
24807 case PROCESSOR_BTVER2:
24808 case PROCESSOR_PENTIUM4:
24809 case PROCESSOR_NOCONA:
24810 return 2;
24812 case PROCESSOR_PENTIUMPRO:
24813 case PROCESSOR_ATHLON:
24814 case PROCESSOR_K8:
24815 case PROCESSOR_AMDFAM10:
24816 case PROCESSOR_GENERIC:
24817 case PROCESSOR_BDVER1:
24818 case PROCESSOR_BDVER2:
24819 case PROCESSOR_BDVER3:
24820 case PROCESSOR_BTVER1:
24821 return 3;
24823 case PROCESSOR_CORE2:
24824 case PROCESSOR_COREI7:
24825 case PROCESSOR_HASWELL:
24826 return 4;
24828 default:
24829 return 1;
24833 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
24834 by DEP_INSN and nothing else set by DEP_INSN. */
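/* A typical example of such a dependence is a compare (which sets only the
   flags) immediately followed by the jcc/setcc/cmov that consumes them;
   ix86_adjust_cost uses this to give such a pair zero latency on Pentium.  */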
24836 static bool
24837 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
24839 rtx set, set2;
24841 /* Simplify the test for uninteresting insns. */
24842 if (insn_type != TYPE_SETCC
24843 && insn_type != TYPE_ICMOV
24844 && insn_type != TYPE_FCMOV
24845 && insn_type != TYPE_IBR)
24846 return false;
24848 if ((set = single_set (dep_insn)) != 0)
24850 set = SET_DEST (set);
24851 set2 = NULL_RTX;
24853 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
24854 && XVECLEN (PATTERN (dep_insn), 0) == 2
24855 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
24856 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
24858 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
24859 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
24861 else
24862 return false;
24864 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
24865 return false;
24867 /* This test is true if the dependent insn reads the flags but
24868 not any other potentially set register. */
24869 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
24870 return false;
24872 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
24873 return false;
24875 return true;
24878 /* Return true iff USE_INSN has a memory address with operands set by
24879 SET_INSN. */
24881 bool
24882 ix86_agi_dependent (rtx set_insn, rtx use_insn)
24884 int i;
24885 extract_insn_cached (use_insn);
24886 for (i = recog_data.n_operands - 1; i >= 0; --i)
24887 if (MEM_P (recog_data.operand[i]))
24889 rtx addr = XEXP (recog_data.operand[i], 0);
24890 return modified_in_p (addr, set_insn) != 0;
24892 return false;
24895 /* Helper function for exact_store_load_dependency.
24896 Return true if addr is found in insn. */
24897 static bool
24898 exact_dependency_1 (rtx addr, rtx insn)
24900 enum rtx_code code;
24901 const char *format_ptr;
24902 int i, j;
24904 code = GET_CODE (insn);
24905 switch (code)
24907 case MEM:
24908 if (rtx_equal_p (addr, insn))
24909 return true;
24910 break;
24911 case REG:
24912 CASE_CONST_ANY:
24913 case SYMBOL_REF:
24914 case CODE_LABEL:
24915 case PC:
24916 case CC0:
24917 case EXPR_LIST:
24918 return false;
24919 default:
24920 break;
24923 format_ptr = GET_RTX_FORMAT (code);
24924 for (i = 0; i < GET_RTX_LENGTH (code); i++)
24926 switch (*format_ptr++)
24928 case 'e':
24929 if (exact_dependency_1 (addr, XEXP (insn, i)))
24930 return true;
24931 break;
24932 case 'E':
24933 for (j = 0; j < XVECLEN (insn, i); j++)
24934 if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
24935 return true;
24936 break;
24939 return false;
24942 /* Return true if there exists an exact dependency between the store and the
24943 load, i.e. the same memory address is used in both. */
24944 static bool
24945 exact_store_load_dependency (rtx store, rtx load)
24947 rtx set1, set2;
24949 set1 = single_set (store);
24950 if (!set1)
24951 return false;
24952 if (!MEM_P (SET_DEST (set1)))
24953 return false;
24954 set2 = single_set (load);
24955 if (!set2)
24956 return false;
24957 if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
24958 return true;
24959 return false;
24962 static int
24963 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
24965 enum attr_type insn_type, dep_insn_type;
24966 enum attr_memory memory;
24967 rtx set, set2;
24968 int dep_insn_code_number;
24970 /* Anti and output dependencies have zero cost on all CPUs. */
24971 if (REG_NOTE_KIND (link) != 0)
24972 return 0;
24974 dep_insn_code_number = recog_memoized (dep_insn);
24976 /* If we can't recognize the insns, we can't really do anything. */
24977 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
24978 return cost;
24980 insn_type = get_attr_type (insn);
24981 dep_insn_type = get_attr_type (dep_insn);
24983 switch (ix86_tune)
24985 case PROCESSOR_PENTIUM:
24986 /* Address Generation Interlock adds a cycle of latency. */
24987 if (insn_type == TYPE_LEA)
24989 rtx addr = PATTERN (insn);
24991 if (GET_CODE (addr) == PARALLEL)
24992 addr = XVECEXP (addr, 0, 0);
24994 gcc_assert (GET_CODE (addr) == SET);
24996 addr = SET_SRC (addr);
24997 if (modified_in_p (addr, dep_insn))
24998 cost += 1;
25000 else if (ix86_agi_dependent (dep_insn, insn))
25001 cost += 1;
25003 /* ??? Compares pair with jump/setcc. */
25004 if (ix86_flags_dependent (insn, dep_insn, insn_type))
25005 cost = 0;
25007 /* Floating point stores require value to be ready one cycle earlier. */
25008 if (insn_type == TYPE_FMOV
25009 && get_attr_memory (insn) == MEMORY_STORE
25010 && !ix86_agi_dependent (dep_insn, insn))
25011 cost += 1;
25012 break;
25014 case PROCESSOR_PENTIUMPRO:
25015 memory = get_attr_memory (insn);
25017 /* INT->FP conversion is expensive. */
25018 if (get_attr_fp_int_src (dep_insn))
25019 cost += 5;
25021 /* There is one cycle extra latency between an FP op and a store. */
25022 if (insn_type == TYPE_FMOV
25023 && (set = single_set (dep_insn)) != NULL_RTX
25024 && (set2 = single_set (insn)) != NULL_RTX
25025 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
25026 && MEM_P (SET_DEST (set2)))
25027 cost += 1;
25029 /* Show the ability of the reorder buffer to hide the latency of a load
25030 by executing it in parallel with the previous instruction when the
25031 previous instruction is not needed to compute the address. */
25032 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25033 && !ix86_agi_dependent (dep_insn, insn))
25035 /* Claim moves to take one cycle, as the core can issue one load
25036 at a time and the next load can start a cycle later. */
25037 if (dep_insn_type == TYPE_IMOV
25038 || dep_insn_type == TYPE_FMOV)
25039 cost = 1;
25040 else if (cost > 1)
25041 cost--;
25043 break;
25045 case PROCESSOR_K6:
25046 memory = get_attr_memory (insn);
25048 /* The esp dependency is resolved before the instruction is really
25049 finished. */
25050 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
25051 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
25052 return 1;
25054 /* INT->FP conversion is expensive. */
25055 if (get_attr_fp_int_src (dep_insn))
25056 cost += 5;
25058 /* Show the ability of the reorder buffer to hide the latency of a load
25059 by executing it in parallel with the previous instruction when the
25060 previous instruction is not needed to compute the address. */
25061 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25062 && !ix86_agi_dependent (dep_insn, insn))
25064 /* Claim moves to take one cycle, as the core can issue one load
25065 at a time and the next load can start a cycle later. */
25066 if (dep_insn_type == TYPE_IMOV
25067 || dep_insn_type == TYPE_FMOV)
25068 cost = 1;
25069 else if (cost > 2)
25070 cost -= 2;
25071 else
25072 cost = 1;
25074 break;
25076 case PROCESSOR_ATHLON:
25077 case PROCESSOR_K8:
25078 case PROCESSOR_AMDFAM10:
25079 case PROCESSOR_BDVER1:
25080 case PROCESSOR_BDVER2:
25081 case PROCESSOR_BDVER3:
25082 case PROCESSOR_BTVER1:
25083 case PROCESSOR_BTVER2:
25084 case PROCESSOR_GENERIC:
25085 memory = get_attr_memory (insn);
25087 /* The stack engine allows push and pop instructions to execute in parallel. */
25088 if (((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
25089 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
25090 && (ix86_tune != PROCESSOR_ATHLON && ix86_tune != PROCESSOR_K8))
25091 return 0;
25093 /* Show the ability of the reorder buffer to hide the latency of a load
25094 by executing it in parallel with the previous instruction when the
25095 previous instruction is not needed to compute the address. */
25096 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25097 && !ix86_agi_dependent (dep_insn, insn))
25099 enum attr_unit unit = get_attr_unit (insn);
25100 int loadcost = 3;
25102 /* Because of the difference between the length of integer and
25103 floating unit pipeline preparation stages, the memory operands
25104 for floating point are cheaper.
25106 ??? For Athlon the difference is most probably 2. */
25107 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
25108 loadcost = 3;
25109 else
25110 loadcost = TARGET_ATHLON ? 2 : 0;
25112 if (cost >= loadcost)
25113 cost -= loadcost;
25114 else
25115 cost = 0;
25117 break;
25119 case PROCESSOR_CORE2:
25120 case PROCESSOR_COREI7:
25121 case PROCESSOR_HASWELL:
25122 memory = get_attr_memory (insn);
25124 /* The stack engine allows push and pop instructions to execute in parallel. */
25125 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
25126 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
25127 return 0;
25129 /* Show the ability of the reorder buffer to hide the latency of a load
25130 by executing it in parallel with the previous instruction when the
25131 previous instruction is not needed to compute the address. */
25132 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25133 && !ix86_agi_dependent (dep_insn, insn))
25135 if (cost >= 4)
25136 cost -= 4;
25137 else
25138 cost = 0;
25140 break;
25142 case PROCESSOR_SLM:
25143 if (!reload_completed)
25144 return cost;
25146 /* Increase cost of integer loads. */
25147 memory = get_attr_memory (dep_insn);
25148 if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25150 enum attr_unit unit = get_attr_unit (dep_insn);
25151 if (unit == UNIT_INTEGER && cost == 1)
25153 if (memory == MEMORY_LOAD)
25154 cost = 3;
25155 else
25157 /* Increase the cost of ld/st for short int types only
25158 because of the store-forwarding issue. */
25159 rtx set = single_set (dep_insn);
25160 if (set && (GET_MODE (SET_DEST (set)) == QImode
25161 || GET_MODE (SET_DEST (set)) == HImode))
25163 /* Increase the cost of the store/load insn if an exact
25164 dependence exists and it is a load insn. */
25165 enum attr_memory insn_memory = get_attr_memory (insn);
25166 if (insn_memory == MEMORY_LOAD
25167 && exact_store_load_dependency (dep_insn, insn))
25168 cost = 3;
25174 default:
25175 break;
25178 return cost;
25181 /* How many alternative schedules to try. This should be as wide as the
25182 scheduling freedom in the DFA, but no wider. Making this value too
25183 large results in extra work for the scheduler. */
25185 static int
25186 ia32_multipass_dfa_lookahead (void)
25188 switch (ix86_tune)
25190 case PROCESSOR_PENTIUM:
25191 return 2;
25193 case PROCESSOR_PENTIUMPRO:
25194 case PROCESSOR_K6:
25195 return 1;
25197 case PROCESSOR_CORE2:
25198 case PROCESSOR_COREI7:
25199 case PROCESSOR_HASWELL:
25200 case PROCESSOR_ATOM:
25201 case PROCESSOR_SLM:
25202 /* Generally, we want haifa-sched:max_issue() to look ahead as far
25203 as the number of instructions that can be executed in one cycle, i.e.,
25204 issue_rate. I wonder why tuning for many CPUs does not do this. */
25205 if (reload_completed)
25206 return ix86_issue_rate ();
25207 /* Don't use lookahead for pre-reload schedule to save compile time. */
25208 return 0;
25210 default:
25211 return 0;
25215 /* Try to reorder the ready list to take advantage of Atom pipelined IMUL
25216 execution. It is applied if
25217 (1) an IMUL instruction is on top of the list;
25218 (2) there exists exactly one producer of an independent IMUL instruction in
25219 the ready list.
25220 Return the index of the IMUL producer if it was found, and -1 otherwise. */
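/* In other words (an illustrative restatement of the conditions above): if
   the ready list ends with an SImode IMUL and exactly one earlier insn in
   the list is the sole producer of an input to such an IMUL, this function
   reports that producer's index so that ix86_sched_reorder can move it to
   the top and keep Atom's pipelined multiplier busy.  */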
25221 static int
25222 do_reorder_for_imul (rtx *ready, int n_ready)
25224 rtx insn, set, insn1, insn2;
25225 sd_iterator_def sd_it;
25226 dep_t dep;
25227 int index = -1;
25228 int i;
25230 if (ix86_tune != PROCESSOR_ATOM)
25231 return index;
25233 /* Check that IMUL instruction is on the top of ready list. */
25234 insn = ready[n_ready - 1];
25235 set = single_set (insn);
25236 if (!set)
25237 return index;
25238 if (!(GET_CODE (SET_SRC (set)) == MULT
25239 && GET_MODE (SET_SRC (set)) == SImode))
25240 return index;
25242 /* Search for producer of independent IMUL instruction. */
25243 for (i = n_ready - 2; i >= 0; i--)
25245 insn = ready[i];
25246 if (!NONDEBUG_INSN_P (insn))
25247 continue;
25248 /* Skip IMUL instruction. */
25249 insn2 = PATTERN (insn);
25250 if (GET_CODE (insn2) == PARALLEL)
25251 insn2 = XVECEXP (insn2, 0, 0);
25252 if (GET_CODE (insn2) == SET
25253 && GET_CODE (SET_SRC (insn2)) == MULT
25254 && GET_MODE (SET_SRC (insn2)) == SImode)
25255 continue;
25257 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
25259 rtx con;
25260 con = DEP_CON (dep);
25261 if (!NONDEBUG_INSN_P (con))
25262 continue;
25263 insn1 = PATTERN (con);
25264 if (GET_CODE (insn1) == PARALLEL)
25265 insn1 = XVECEXP (insn1, 0, 0);
25267 if (GET_CODE (insn1) == SET
25268 && GET_CODE (SET_SRC (insn1)) == MULT
25269 && GET_MODE (SET_SRC (insn1)) == SImode)
25271 sd_iterator_def sd_it1;
25272 dep_t dep1;
25273 /* Check if there is no other dependee for IMUL. */
25274 index = i;
25275 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
25277 rtx pro;
25278 pro = DEP_PRO (dep1);
25279 if (!NONDEBUG_INSN_P (pro))
25280 continue;
25281 if (pro != insn)
25282 index = -1;
25284 if (index >= 0)
25285 break;
25288 if (index >= 0)
25289 break;
25291 return index;
25294 /* Try to find the best candidate for the top of the ready list if two insns
25295 have the same priority - the candidate is best if its dependees were
25296 scheduled earlier. Applied to Silvermont only.
25297 Return true if the top 2 insns must be interchanged. */
25298 static bool
25299 swap_top_of_ready_list (rtx *ready, int n_ready)
25301 rtx top = ready[n_ready - 1];
25302 rtx next = ready[n_ready - 2];
25303 rtx set;
25304 sd_iterator_def sd_it;
25305 dep_t dep;
25306 int clock1 = -1;
25307 int clock2 = -1;
25308 #define INSN_TICK(INSN) (HID (INSN)->tick)
25310 if (ix86_tune != PROCESSOR_SLM)
25311 return false;
25313 if (!NONDEBUG_INSN_P (top))
25314 return false;
25315 if (!NONJUMP_INSN_P (top))
25316 return false;
25317 if (!NONDEBUG_INSN_P (next))
25318 return false;
25319 if (!NONJUMP_INSN_P (next))
25320 return false;
25321 set = single_set (top);
25322 if (!set)
25323 return false;
25324 set = single_set (next);
25325 if (!set)
25326 return false;
25328 if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
25330 if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
25331 return false;
25332 /* Determine the winner more precisely. */
25333 FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
25335 rtx pro;
25336 pro = DEP_PRO (dep);
25337 if (!NONDEBUG_INSN_P (pro))
25338 continue;
25339 if (INSN_TICK (pro) > clock1)
25340 clock1 = INSN_TICK (pro);
25342 FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
25344 rtx pro;
25345 pro = DEP_PRO (dep);
25346 if (!NONDEBUG_INSN_P (pro))
25347 continue;
25348 if (INSN_TICK (pro) > clock2)
25349 clock2 = INSN_TICK (pro);
25352 if (clock1 == clock2)
25354 /* Determine the winner - a load must win. */
25355 enum attr_memory memory1, memory2;
25356 memory1 = get_attr_memory (top);
25357 memory2 = get_attr_memory (next);
25358 if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
25359 return true;
25361 return (bool) (clock2 < clock1);
25363 return false;
25364 #undef INSN_TICK
25367 /* Perform possible reordering of the ready list for Atom/Silvermont only.
25368 Return the issue rate. */
25369 static int
25370 ix86_sched_reorder (FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
25371 int clock_var)
25373 int issue_rate = -1;
25374 int n_ready = *pn_ready;
25375 int i;
25376 rtx insn;
25377 int index = -1;
25379 /* Set up issue rate. */
25380 issue_rate = ix86_issue_rate ();
25382 /* Do reordering for Atom/SLM only. */
25383 if (ix86_tune != PROCESSOR_ATOM && ix86_tune != PROCESSOR_SLM)
25384 return issue_rate;
25386 /* Nothing to do if ready list contains only 1 instruction. */
25387 if (n_ready <= 1)
25388 return issue_rate;
25390 /* Do reordering for the post-reload scheduler only. */
25391 if (!reload_completed)
25392 return issue_rate;
25394 if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
25396 if (sched_verbose > 1)
25397 fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
25398 INSN_UID (ready[index]));
25400 /* Put IMUL producer (ready[index]) at the top of ready list. */
25401 insn = ready[index];
25402 for (i = index; i < n_ready - 1; i++)
25403 ready[i] = ready[i + 1];
25404 ready[n_ready - 1] = insn;
25405 return issue_rate;
25407 if (clock_var != 0 && swap_top_of_ready_list (ready, n_ready))
25409 if (sched_verbose > 1)
25410 fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
25411 INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
25412 /* Swap 2 top elements of ready list. */
25413 insn = ready[n_ready - 1];
25414 ready[n_ready - 1] = ready[n_ready - 2];
25415 ready[n_ready - 2] = insn;
25417 return issue_rate;
25420 static bool
25421 ix86_class_likely_spilled_p (reg_class_t);
25423 /* Return true if the lhs of INSN is a HW function argument register; set
25424 IS_SPILLED to true if it is a likely-spilled HW register. */
25425 static bool
25426 insn_is_function_arg (rtx insn, bool* is_spilled)
25428 rtx dst;
25430 if (!NONDEBUG_INSN_P (insn))
25431 return false;
25432 /* Call instructions are not movable, ignore them. */
25433 if (CALL_P (insn))
25434 return false;
25435 insn = PATTERN (insn);
25436 if (GET_CODE (insn) == PARALLEL)
25437 insn = XVECEXP (insn, 0, 0);
25438 if (GET_CODE (insn) != SET)
25439 return false;
25440 dst = SET_DEST (insn);
25441 if (REG_P (dst) && HARD_REGISTER_P (dst)
25442 && ix86_function_arg_regno_p (REGNO (dst)))
25444 /* Is it a likely-spilled HW register? */
25445 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
25446 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
25447 *is_spilled = true;
25448 return true;
25450 return false;
25453 /* Add output dependencies for a chain of adjacent function arguments, but
25454 only if there is a move to a likely-spilled HW register. Return the first
25455 argument if at least one dependence was added, or NULL otherwise. */
25456 static rtx
25457 add_parameter_dependencies (rtx call, rtx head)
25459 rtx insn;
25460 rtx last = call;
25461 rtx first_arg = NULL;
25462 bool is_spilled = false;
25464 head = PREV_INSN (head);
25466 /* Find the argument-passing instruction nearest to the call. */
25467 while (true)
25469 last = PREV_INSN (last);
25470 if (last == head)
25471 return NULL;
25472 if (!NONDEBUG_INSN_P (last))
25473 continue;
25474 if (insn_is_function_arg (last, &is_spilled))
25475 break;
25476 return NULL;
25479 first_arg = last;
25480 while (true)
25482 insn = PREV_INSN (last);
25483 if (!INSN_P (insn))
25484 break;
25485 if (insn == head)
25486 break;
25487 if (!NONDEBUG_INSN_P (insn))
25489 last = insn;
25490 continue;
25492 if (insn_is_function_arg (insn, &is_spilled))
25494 /* Add an output dependence between two function arguments if the chain
25495 of output arguments contains likely-spilled HW registers. */
25496 if (is_spilled)
25497 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
25498 first_arg = last = insn;
25500 else
25501 break;
25503 if (!is_spilled)
25504 return NULL;
25505 return first_arg;
25508 /* Add output or anti dependency from insn to first_arg to restrict its code
25509 motion. */
25510 static void
25511 avoid_func_arg_motion (rtx first_arg, rtx insn)
25513 rtx set;
25514 rtx tmp;
25516 set = single_set (insn);
25517 if (!set)
25518 return;
25519 tmp = SET_DEST (set);
25520 if (REG_P (tmp))
25522 /* Add output dependency to the first function argument. */
25523 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
25524 return;
25526 /* Add anti dependency. */
25527 add_dependence (first_arg, insn, REG_DEP_ANTI);
25530 /* Avoid cross-block motion of a function argument by adding a dependency
25531 from the first non-jump instruction in bb. */
25532 static void
25533 add_dependee_for_func_arg (rtx arg, basic_block bb)
25535 rtx insn = BB_END (bb);
25537 while (insn)
25539 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
25541 rtx set = single_set (insn);
25542 if (set)
25544 avoid_func_arg_motion (arg, insn);
25545 return;
25548 if (insn == BB_HEAD (bb))
25549 return;
25550 insn = PREV_INSN (insn);
25554 /* Hook for pre-reload schedule - avoid motion of function arguments
25555 passed in likely spilled HW registers. */
25556 static void
25557 ix86_dependencies_evaluation_hook (rtx head, rtx tail)
25559 rtx insn;
25560 rtx first_arg = NULL;
25561 if (reload_completed)
25562 return;
25563 while (head != tail && DEBUG_INSN_P (head))
25564 head = NEXT_INSN (head);
25565 for (insn = tail; insn != head; insn = PREV_INSN (insn))
25566 if (INSN_P (insn) && CALL_P (insn))
25568 first_arg = add_parameter_dependencies (insn, head);
25569 if (first_arg)
25571 /* Add a dependee for the first argument to predecessors, but only if the
25572 region contains more than one block. */
25573 basic_block bb = BLOCK_FOR_INSN (insn);
25574 int rgn = CONTAINING_RGN (bb->index);
25575 int nr_blks = RGN_NR_BLOCKS (rgn);
25576 /* Skip trivial regions and region head blocks that can have
25577 predecessors outside of region. */
25578 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
25580 edge e;
25581 edge_iterator ei;
25582 /* Assume that region is SCC, i.e. all immediate predecessors
25583 of non-head block are in the same region. */
25584 FOR_EACH_EDGE (e, ei, bb->preds)
25586 /* Avoid creating loop-carried dependencies by using the
25587 topological ordering of the region. */
25588 if (BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
25589 add_dependee_for_func_arg (first_arg, e->src);
25592 insn = first_arg;
25593 if (insn == head)
25594 break;
25597 else if (first_arg)
25598 avoid_func_arg_motion (first_arg, insn);
25601 /* Hook for pre-reload schedule - set the priority of moves from likely-spilled
25602 HW registers to maximum, to schedule them as soon as possible. These are
25603 moves from function argument registers at the top of the function entry
25604 and moves from function return value registers after a call. */
25605 static int
25606 ix86_adjust_priority (rtx insn, int priority)
25608 rtx set;
25610 if (reload_completed)
25611 return priority;
25613 if (!NONDEBUG_INSN_P (insn))
25614 return priority;
25616 set = single_set (insn);
25617 if (set)
25619 rtx tmp = SET_SRC (set);
25620 if (REG_P (tmp)
25621 && HARD_REGISTER_P (tmp)
25622 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
25623 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
25624 return current_sched_info->sched_max_insns_priority;
25627 return priority;
25630 /* Model the decoder of Core 2/i7.
25631 The hooks below for multipass scheduling (see haifa-sched.c:max_issue)
25632 track the instruction fetch block boundaries and make sure that long
25633 (9+ bytes) instructions are assigned to D0. */
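/* Putting the numbers below together (illustrative): on Core 2/i7 the
   decoders read at most 16 bytes and 6 insns per cycle, and only the first
   decoder (D0) accepts insns longer than 8 bytes, which is why the filter
   below masks out long insns once the first slot of a cycle has been used.  */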
25635 /* Maximum length of an insn that can be handled by
25636 a secondary decoder unit. '8' for Core 2/i7. */
25637 static int core2i7_secondary_decoder_max_insn_size;
25639 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
25640 '16' for Core 2/i7. */
25641 static int core2i7_ifetch_block_size;
25643 /* Maximum number of instructions decoder can handle per cycle.
25644 '6' for Core 2/i7. */
25645 static int core2i7_ifetch_block_max_insns;
25647 typedef struct ix86_first_cycle_multipass_data_ *
25648 ix86_first_cycle_multipass_data_t;
25649 typedef const struct ix86_first_cycle_multipass_data_ *
25650 const_ix86_first_cycle_multipass_data_t;
25652 /* A variable to store target state across calls to max_issue within
25653 one cycle. */
25654 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
25655 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
25657 /* Initialize DATA. */
25658 static void
25659 core2i7_first_cycle_multipass_init (void *_data)
25661 ix86_first_cycle_multipass_data_t data
25662 = (ix86_first_cycle_multipass_data_t) _data;
25664 data->ifetch_block_len = 0;
25665 data->ifetch_block_n_insns = 0;
25666 data->ready_try_change = NULL;
25667 data->ready_try_change_size = 0;
25670 /* Advancing the cycle; reset ifetch block counts. */
25671 static void
25672 core2i7_dfa_post_advance_cycle (void)
25674 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
25676 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
25678 data->ifetch_block_len = 0;
25679 data->ifetch_block_n_insns = 0;
25682 static int min_insn_size (rtx);
25684 /* Filter out insns from ready_try that the core will not be able to issue
25685 on current cycle due to decoder. */
25686 static void
25687 core2i7_first_cycle_multipass_filter_ready_try
25688 (const_ix86_first_cycle_multipass_data_t data,
25689 char *ready_try, int n_ready, bool first_cycle_insn_p)
25691 while (n_ready--)
25693 rtx insn;
25694 int insn_size;
25696 if (ready_try[n_ready])
25697 continue;
25699 insn = get_ready_element (n_ready);
25700 insn_size = min_insn_size (insn);
25702 if (/* If this is too long an insn for a secondary decoder ... */
25703 (!first_cycle_insn_p
25704 && insn_size > core2i7_secondary_decoder_max_insn_size)
25705 /* ... or it would not fit into the ifetch block ... */
25706 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
25707 /* ... or the decoder is full already ... */
25708 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
25709 /* ... mask the insn out. */
25711 ready_try[n_ready] = 1;
25713 if (data->ready_try_change)
25714 bitmap_set_bit (data->ready_try_change, n_ready);
25719 /* Prepare for a new round of multipass lookahead scheduling. */
25720 static void
25721 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
25722 bool first_cycle_insn_p)
25724 ix86_first_cycle_multipass_data_t data
25725 = (ix86_first_cycle_multipass_data_t) _data;
25726 const_ix86_first_cycle_multipass_data_t prev_data
25727 = ix86_first_cycle_multipass_data;
25729 /* Restore the state from the end of the previous round. */
25730 data->ifetch_block_len = prev_data->ifetch_block_len;
25731 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
25733 /* Filter instructions that cannot be issued on current cycle due to
25734 decoder restrictions. */
25735 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
25736 first_cycle_insn_p);
25739 /* INSN is being issued in current solution. Account for its impact on
25740 the decoder model. */
25741 static void
25742 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
25743 rtx insn, const void *_prev_data)
25745 ix86_first_cycle_multipass_data_t data
25746 = (ix86_first_cycle_multipass_data_t) _data;
25747 const_ix86_first_cycle_multipass_data_t prev_data
25748 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
25750 int insn_size = min_insn_size (insn);
25752 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
25753 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
25754 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
25755 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
25757 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
25758 if (!data->ready_try_change)
25760 data->ready_try_change = sbitmap_alloc (n_ready);
25761 data->ready_try_change_size = n_ready;
25763 else if (data->ready_try_change_size < n_ready)
25765 data->ready_try_change = sbitmap_resize (data->ready_try_change,
25766 n_ready, 0);
25767 data->ready_try_change_size = n_ready;
25769 bitmap_clear (data->ready_try_change);
25771 /* Filter out insns from ready_try that the core will not be able to issue
25772 on current cycle due to decoder. */
25773 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
25774 false);
25777 /* Revert the effect on ready_try. */
25778 static void
25779 core2i7_first_cycle_multipass_backtrack (const void *_data,
25780 char *ready_try,
25781 int n_ready ATTRIBUTE_UNUSED)
25783 const_ix86_first_cycle_multipass_data_t data
25784 = (const_ix86_first_cycle_multipass_data_t) _data;
25785 unsigned int i = 0;
25786 sbitmap_iterator sbi;
25788 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
25789 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
25791 ready_try[i] = 0;
25795 /* Save the result of multipass lookahead scheduling for the next round. */
25796 static void
25797 core2i7_first_cycle_multipass_end (const void *_data)
25799 const_ix86_first_cycle_multipass_data_t data
25800 = (const_ix86_first_cycle_multipass_data_t) _data;
25801 ix86_first_cycle_multipass_data_t next_data
25802 = ix86_first_cycle_multipass_data;
25804 if (data != NULL)
25806 next_data->ifetch_block_len = data->ifetch_block_len;
25807 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
25811 /* Deallocate target data. */
25812 static void
25813 core2i7_first_cycle_multipass_fini (void *_data)
25815 ix86_first_cycle_multipass_data_t data
25816 = (ix86_first_cycle_multipass_data_t) _data;
25818 if (data->ready_try_change)
25820 sbitmap_free (data->ready_try_change);
25821 data->ready_try_change = NULL;
25822 data->ready_try_change_size = 0;
25826 /* Prepare for scheduling pass. */
25827 static void
25828 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
25829 int verbose ATTRIBUTE_UNUSED,
25830 int max_uid ATTRIBUTE_UNUSED)
25832 /* Install scheduling hooks for current CPU. Some of these hooks are used
25833 in time-critical parts of the scheduler, so we only set them up when
25834 they are actually used. */
25835 switch (ix86_tune)
25837 case PROCESSOR_CORE2:
25838 case PROCESSOR_COREI7:
25839 case PROCESSOR_HASWELL:
25840 /* Do not perform multipass scheduling for pre-reload schedule
25841 to save compile time. */
25842 if (reload_completed)
25844 targetm.sched.dfa_post_advance_cycle
25845 = core2i7_dfa_post_advance_cycle;
25846 targetm.sched.first_cycle_multipass_init
25847 = core2i7_first_cycle_multipass_init;
25848 targetm.sched.first_cycle_multipass_begin
25849 = core2i7_first_cycle_multipass_begin;
25850 targetm.sched.first_cycle_multipass_issue
25851 = core2i7_first_cycle_multipass_issue;
25852 targetm.sched.first_cycle_multipass_backtrack
25853 = core2i7_first_cycle_multipass_backtrack;
25854 targetm.sched.first_cycle_multipass_end
25855 = core2i7_first_cycle_multipass_end;
25856 targetm.sched.first_cycle_multipass_fini
25857 = core2i7_first_cycle_multipass_fini;
25859 /* Set decoder parameters. */
25860 core2i7_secondary_decoder_max_insn_size = 8;
25861 core2i7_ifetch_block_size = 16;
25862 core2i7_ifetch_block_max_insns = 6;
25863 break;
25865 /* ... Fall through ... */
25866 default:
25867 targetm.sched.dfa_post_advance_cycle = NULL;
25868 targetm.sched.first_cycle_multipass_init = NULL;
25869 targetm.sched.first_cycle_multipass_begin = NULL;
25870 targetm.sched.first_cycle_multipass_issue = NULL;
25871 targetm.sched.first_cycle_multipass_backtrack = NULL;
25872 targetm.sched.first_cycle_multipass_end = NULL;
25873 targetm.sched.first_cycle_multipass_fini = NULL;
25874 break;
25879 /* Compute the alignment given to a constant that is being placed in memory.
25880 EXP is the constant and ALIGN is the alignment that the object would
25881 ordinarily have.
25882 The value of this function is used instead of that alignment to align
25883 the object. */
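/* For example (summarizing the checks below), a double constant gets at
   least 64-bit alignment and a 128-bit vector constant at least 128-bit
   alignment, while string constants of 31+ bytes are word-aligned unless
   we are optimizing for size.  */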
25886 ix86_constant_alignment (tree exp, int align)
25888 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
25889 || TREE_CODE (exp) == INTEGER_CST)
25891 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
25892 return 64;
25893 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
25894 return 128;
25896 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
25897 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
25898 return BITS_PER_WORD;
25900 return align;
25903 /* Compute the alignment for a static variable.
25904 TYPE is the data type, and ALIGN is the alignment that
25905 the object would ordinarily have. The value of this function is used
25906 instead of that alignment to align the object. */
25909 ix86_data_alignment (tree type, int align, bool opt)
25911 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
25913 if (opt
25914 && AGGREGATE_TYPE_P (type)
25915 && TYPE_SIZE (type)
25916 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
25917 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
25918 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
25919 && align < max_align)
25920 align = max_align;
25922 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
25923 to a 16-byte boundary. */
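/* E.g. (illustrative), a file-scope "char buf[32]" is 256 bits in size, so
   the 64-bit check below bumps its alignment to at least 128 bits.  */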
25924 if (TARGET_64BIT)
25926 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
25927 && TYPE_SIZE (type)
25928 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
25929 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
25930 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
25931 return 128;
25934 if (!opt)
25935 return align;
25937 if (TREE_CODE (type) == ARRAY_TYPE)
25939 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
25940 return 64;
25941 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
25942 return 128;
25944 else if (TREE_CODE (type) == COMPLEX_TYPE)
25947 if (TYPE_MODE (type) == DCmode && align < 64)
25948 return 64;
25949 if ((TYPE_MODE (type) == XCmode
25950 || TYPE_MODE (type) == TCmode) && align < 128)
25951 return 128;
25953 else if ((TREE_CODE (type) == RECORD_TYPE
25954 || TREE_CODE (type) == UNION_TYPE
25955 || TREE_CODE (type) == QUAL_UNION_TYPE)
25956 && TYPE_FIELDS (type))
25958 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
25959 return 64;
25960 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
25961 return 128;
25963 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
25964 || TREE_CODE (type) == INTEGER_TYPE)
25966 if (TYPE_MODE (type) == DFmode && align < 64)
25967 return 64;
25968 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
25969 return 128;
25972 return align;
25975 /* Compute the alignment for a local variable or a stack slot. EXP is
25976 the data type or decl itself, MODE is the widest mode available and
25977 ALIGN is the alignment that the object would ordinarily have. The
25978 value of this macro is used instead of that alignment to align the
25979 object. */
25981 unsigned int
25982 ix86_local_alignment (tree exp, enum machine_mode mode,
25983 unsigned int align)
25985 tree type, decl;
25987 if (exp && DECL_P (exp))
25989 type = TREE_TYPE (exp);
25990 decl = exp;
25992 else
25994 type = exp;
25995 decl = NULL;
25998 /* Don't do dynamic stack realignment for long long objects with
25999 -mpreferred-stack-boundary=2. */
26000 if (!TARGET_64BIT
26001 && align == 64
26002 && ix86_preferred_stack_boundary < 64
26003 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
26004 && (!type || !TYPE_USER_ALIGN (type))
26005 && (!decl || !DECL_USER_ALIGN (decl)))
26006 align = 32;
26008 /* If TYPE is NULL, we are allocating a stack slot for caller-save
26009 register in MODE. We will return the largest alignment of XF
26010 and DF. */
26011 if (!type)
26013 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
26014 align = GET_MODE_ALIGNMENT (DFmode);
26015 return align;
26018 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
26019 to a 16-byte boundary. The exact wording is:
26021 An array uses the same alignment as its elements, except that a local or
26022 global array variable of length at least 16 bytes or
26023 a C99 variable-length array variable always has alignment of at least 16 bytes.
26025 This was added to allow use of aligned SSE instructions on arrays. The
26026 rule is meant for static storage (where the compiler cannot do the analysis
26027 by itself). We follow it for automatic variables only when convenient.
26028 We fully control everything in the function being compiled, and functions
26029 from other units cannot rely on the alignment.
26031 Exclude the va_list type. It is the common case of a local array where
26032 we cannot benefit from the alignment.
26034 TODO: Probably one should optimize for size only when the variable does not escape. */
26035 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
26036 && TARGET_SSE)
26038 if (AGGREGATE_TYPE_P (type)
26039 && (va_list_type_node == NULL_TREE
26040 || (TYPE_MAIN_VARIANT (type)
26041 != TYPE_MAIN_VARIANT (va_list_type_node)))
26042 && TYPE_SIZE (type)
26043 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
26044 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
26045 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
26046 return 128;
26048 if (TREE_CODE (type) == ARRAY_TYPE)
26050 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
26051 return 64;
26052 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
26053 return 128;
26055 else if (TREE_CODE (type) == COMPLEX_TYPE)
26057 if (TYPE_MODE (type) == DCmode && align < 64)
26058 return 64;
26059 if ((TYPE_MODE (type) == XCmode
26060 || TYPE_MODE (type) == TCmode) && align < 128)
26061 return 128;
26063 else if ((TREE_CODE (type) == RECORD_TYPE
26064 || TREE_CODE (type) == UNION_TYPE
26065 || TREE_CODE (type) == QUAL_UNION_TYPE)
26066 && TYPE_FIELDS (type))
26068 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
26069 return 64;
26070 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
26071 return 128;
26073 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
26074 || TREE_CODE (type) == INTEGER_TYPE)
26077 if (TYPE_MODE (type) == DFmode && align < 64)
26078 return 64;
26079 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
26080 return 128;
26082 return align;
26085 /* Compute the minimum required alignment for dynamic stack realignment
26086 purposes for a local variable, parameter or a stack slot. EXP is
26087 the data type or decl itself, MODE is its mode and ALIGN is the
26088 alignment that the object would ordinarily have. */
26090 unsigned int
26091 ix86_minimum_alignment (tree exp, enum machine_mode mode,
26092 unsigned int align)
26094 tree type, decl;
26096 if (exp && DECL_P (exp))
26098 type = TREE_TYPE (exp);
26099 decl = exp;
26101 else
26103 type = exp;
26104 decl = NULL;
26107 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
26108 return align;
26110 /* Don't do dynamic stack realignment for long long objects with
26111 -mpreferred-stack-boundary=2. */
26112 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
26113 && (!type || !TYPE_USER_ALIGN (type))
26114 && (!decl || !DECL_USER_ALIGN (decl)))
26115 return 32;
26117 return align;
26120 /* Find a location for the static chain incoming to a nested function.
26121 This is a register, unless all free registers are used by arguments. */
26123 static rtx
26124 ix86_static_chain (const_tree fndecl, bool incoming_p)
26126 unsigned regno;
26128 if (!DECL_STATIC_CHAIN (fndecl))
26129 return NULL;
26131 if (TARGET_64BIT)
26133 /* We always use R10 in 64-bit mode. */
26134 regno = R10_REG;
26136 else
26138 tree fntype;
26139 unsigned int ccvt;
26141 /* By default in 32-bit mode we use ECX to pass the static chain. */
26142 regno = CX_REG;
26144 fntype = TREE_TYPE (fndecl);
26145 ccvt = ix86_get_callcvt (fntype);
26146 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
26148 /* Fastcall functions use ecx/edx for arguments, which leaves
26149 us with EAX for the static chain.
26150 Thiscall functions use ecx for arguments, which also
26151 leaves us with EAX for the static chain. */
26152 regno = AX_REG;
26154 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
26156 /* Thiscall functions use ecx for arguments, which leaves
26157 us with EAX and EDX for the static chain.
26158 For ABI compatibility we use EAX. */
26159 regno = AX_REG;
26161 else if (ix86_function_regparm (fntype, fndecl) == 3)
26163 /* For regparm 3, we have no free call-clobbered registers in
26164 which to store the static chain. In order to implement this,
26165 we have the trampoline push the static chain to the stack.
26166 However, we can't push a value below the return address when
26167 we call the nested function directly, so we have to use an
26168 alternate entry point. For this we use ESI, and have the
26169 alternate entry point push ESI, so that things appear the
26170 same once we're executing the nested function. */
26171 if (incoming_p)
26173 if (fndecl == current_function_decl)
26174 ix86_static_chain_on_stack = true;
26175 return gen_frame_mem (SImode,
26176 plus_constant (Pmode,
26177 arg_pointer_rtx, -8));
26179 regno = SI_REG;
26183 return gen_rtx_REG (Pmode, regno);
26186 /* Emit RTL insns to initialize the variable parts of a trampoline.
26187 FNDECL is the decl of the target address; M_TRAMP is a MEM for
26188 the trampoline, and CHAIN_VALUE is an RTX for the static chain
26189 to be passed to the target function. */
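/* A sketch of the 64-bit trampoline emitted below, assuming ptr_mode ==
   DImode and a function address that does not fit the shorter movl form
   (each line shows the bytes stored, little-endian, followed by the insn
   they encode):
     49 bb <fnaddr64>   movabs $fnaddr, %r11
     49 ba <chain64>    movabs $chain,  %r10
     49 ff e3 90        jmp *%r11; nop
   The 32-bit variant instead loads or pushes the static chain with a
   one-byte opcode and ends with a relative jmp (0xe9) to the target.  */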
26191 static void
26192 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
26194 rtx mem, fnaddr;
26195 int opcode;
26196 int offset = 0;
26198 fnaddr = XEXP (DECL_RTL (fndecl), 0);
26200 if (TARGET_64BIT)
26202 int size;
26204 /* Load the function address to r11. Try to load address using
26205 the shorter movl instead of movabs. We may want to support
26206 movq for kernel mode, but kernel does not use trampolines at
26207 the moment. FNADDR is a 32-bit address and may not be in
26208 DImode when ptr_mode == SImode. Always use movl in this
26209 case. */
26210 if (ptr_mode == SImode
26211 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
26213 fnaddr = copy_addr_to_reg (fnaddr);
26215 mem = adjust_address (m_tramp, HImode, offset);
26216 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
26218 mem = adjust_address (m_tramp, SImode, offset + 2);
26219 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
26220 offset += 6;
26222 else
26224 mem = adjust_address (m_tramp, HImode, offset);
26225 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
26227 mem = adjust_address (m_tramp, DImode, offset + 2);
26228 emit_move_insn (mem, fnaddr);
26229 offset += 10;
26232 /* Load static chain using movabs to r10. Use the shorter movl
26233 instead of movabs when ptr_mode == SImode. */
26234 if (ptr_mode == SImode)
26236 opcode = 0xba41;
26237 size = 6;
26239 else
26241 opcode = 0xba49;
26242 size = 10;
26245 mem = adjust_address (m_tramp, HImode, offset);
26246 emit_move_insn (mem, gen_int_mode (opcode, HImode));
26248 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
26249 emit_move_insn (mem, chain_value);
26250 offset += size;
26252 /* Jump to r11; the last (unused) byte is a nop, only there to
26253 pad the write out to a single 32-bit store. */
26254 mem = adjust_address (m_tramp, SImode, offset);
26255 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
26256 offset += 4;
26258 else
26260 rtx disp, chain;
26262 /* Depending on the static chain location, either load a register
26263 with a constant, or push the constant to the stack. All of the
26264 instructions are the same size. */
26265 chain = ix86_static_chain (fndecl, true);
26266 if (REG_P (chain))
26268 switch (REGNO (chain))
26270 case AX_REG:
26271 opcode = 0xb8; break;
26272 case CX_REG:
26273 opcode = 0xb9; break;
26274 default:
26275 gcc_unreachable ();
26278 else
26279 opcode = 0x68;
26281 mem = adjust_address (m_tramp, QImode, offset);
26282 emit_move_insn (mem, gen_int_mode (opcode, QImode));
26284 mem = adjust_address (m_tramp, SImode, offset + 1);
26285 emit_move_insn (mem, chain_value);
26286 offset += 5;
26288 mem = adjust_address (m_tramp, QImode, offset);
26289 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
26291 mem = adjust_address (m_tramp, SImode, offset + 1);
26293 /* Compute offset from the end of the jmp to the target function.
26294 In the case in which the trampoline stores the static chain on
26295 the stack, we need to skip the first insn which pushes the
26296 (call-saved) register static chain; this push is 1 byte. */
26297 offset += 5;
26298 disp = expand_binop (SImode, sub_optab, fnaddr,
26299 plus_constant (Pmode, XEXP (m_tramp, 0),
26300 offset - (MEM_P (chain) ? 1 : 0)),
26301 NULL_RTX, 1, OPTAB_DIRECT);
26302 emit_move_insn (mem, disp);
26305 gcc_assert (offset <= TRAMPOLINE_SIZE);
26307 #ifdef HAVE_ENABLE_EXECUTE_STACK
26308 #ifdef CHECK_EXECUTE_STACK_ENABLED
26309 if (CHECK_EXECUTE_STACK_ENABLED)
26310 #endif
26311 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
26312 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
26313 #endif
26316 /* The following file contains several enumerations and data structures
26317 built from the definitions in i386-builtin-types.def. */
26319 #include "i386-builtin-types.inc"
26321 /* Table for the ix86 builtin non-function types. */
26322 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
26324 /* Retrieve an element from the above table, building some of
26325 the types lazily. */
26327 static tree
26328 ix86_get_builtin_type (enum ix86_builtin_type tcode)
26330 unsigned int index;
26331 tree type, itype;
26333 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
26335 type = ix86_builtin_type_tab[(int) tcode];
26336 if (type != NULL)
26337 return type;
26339 gcc_assert (tcode > IX86_BT_LAST_PRIM);
26340 if (tcode <= IX86_BT_LAST_VECT)
26342 enum machine_mode mode;
26344 index = tcode - IX86_BT_LAST_PRIM - 1;
26345 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
26346 mode = ix86_builtin_type_vect_mode[index];
26348 type = build_vector_type_for_mode (itype, mode);
26350 else
26352 int quals;
26354 index = tcode - IX86_BT_LAST_VECT - 1;
26355 if (tcode <= IX86_BT_LAST_PTR)
26356 quals = TYPE_UNQUALIFIED;
26357 else
26358 quals = TYPE_QUAL_CONST;
26360 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
26361 if (quals != TYPE_UNQUALIFIED)
26362 itype = build_qualified_type (itype, quals);
26364 type = build_pointer_type (itype);
26367 ix86_builtin_type_tab[(int) tcode] = type;
26368 return type;
26371 /* Table for the ix86 builtin function types. */
26372 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
26374 /* Retrieve an element from the above table, building some of
26375 the types lazily. */
26377 static tree
26378 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
26380 tree type;
26382 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
26384 type = ix86_builtin_func_type_tab[(int) tcode];
26385 if (type != NULL)
26386 return type;
26388 if (tcode <= IX86_BT_LAST_FUNC)
26390 unsigned start = ix86_builtin_func_start[(int) tcode];
26391 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
26392 tree rtype, atype, args = void_list_node;
26393 unsigned i;
26395 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
26396 for (i = after - 1; i > start; --i)
26398 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
26399 args = tree_cons (NULL, atype, args);
26402 type = build_function_type (rtype, args);
26404 else
26406 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
26407 enum ix86_builtin_func_type icode;
26409 icode = ix86_builtin_func_alias_base[index];
26410 type = ix86_get_builtin_func_type (icode);
26413 ix86_builtin_func_type_tab[(int) tcode] = type;
26414 return type;
26418 /* Codes for all the SSE/MMX builtins. */
26419 enum ix86_builtins
26421 IX86_BUILTIN_ADDPS,
26422 IX86_BUILTIN_ADDSS,
26423 IX86_BUILTIN_DIVPS,
26424 IX86_BUILTIN_DIVSS,
26425 IX86_BUILTIN_MULPS,
26426 IX86_BUILTIN_MULSS,
26427 IX86_BUILTIN_SUBPS,
26428 IX86_BUILTIN_SUBSS,
26430 IX86_BUILTIN_CMPEQPS,
26431 IX86_BUILTIN_CMPLTPS,
26432 IX86_BUILTIN_CMPLEPS,
26433 IX86_BUILTIN_CMPGTPS,
26434 IX86_BUILTIN_CMPGEPS,
26435 IX86_BUILTIN_CMPNEQPS,
26436 IX86_BUILTIN_CMPNLTPS,
26437 IX86_BUILTIN_CMPNLEPS,
26438 IX86_BUILTIN_CMPNGTPS,
26439 IX86_BUILTIN_CMPNGEPS,
26440 IX86_BUILTIN_CMPORDPS,
26441 IX86_BUILTIN_CMPUNORDPS,
26442 IX86_BUILTIN_CMPEQSS,
26443 IX86_BUILTIN_CMPLTSS,
26444 IX86_BUILTIN_CMPLESS,
26445 IX86_BUILTIN_CMPNEQSS,
26446 IX86_BUILTIN_CMPNLTSS,
26447 IX86_BUILTIN_CMPNLESS,
26448 IX86_BUILTIN_CMPORDSS,
26449 IX86_BUILTIN_CMPUNORDSS,
26451 IX86_BUILTIN_COMIEQSS,
26452 IX86_BUILTIN_COMILTSS,
26453 IX86_BUILTIN_COMILESS,
26454 IX86_BUILTIN_COMIGTSS,
26455 IX86_BUILTIN_COMIGESS,
26456 IX86_BUILTIN_COMINEQSS,
26457 IX86_BUILTIN_UCOMIEQSS,
26458 IX86_BUILTIN_UCOMILTSS,
26459 IX86_BUILTIN_UCOMILESS,
26460 IX86_BUILTIN_UCOMIGTSS,
26461 IX86_BUILTIN_UCOMIGESS,
26462 IX86_BUILTIN_UCOMINEQSS,
26464 IX86_BUILTIN_CVTPI2PS,
26465 IX86_BUILTIN_CVTPS2PI,
26466 IX86_BUILTIN_CVTSI2SS,
26467 IX86_BUILTIN_CVTSI642SS,
26468 IX86_BUILTIN_CVTSS2SI,
26469 IX86_BUILTIN_CVTSS2SI64,
26470 IX86_BUILTIN_CVTTPS2PI,
26471 IX86_BUILTIN_CVTTSS2SI,
26472 IX86_BUILTIN_CVTTSS2SI64,
26474 IX86_BUILTIN_MAXPS,
26475 IX86_BUILTIN_MAXSS,
26476 IX86_BUILTIN_MINPS,
26477 IX86_BUILTIN_MINSS,
26479 IX86_BUILTIN_LOADUPS,
26480 IX86_BUILTIN_STOREUPS,
26481 IX86_BUILTIN_MOVSS,
26483 IX86_BUILTIN_MOVHLPS,
26484 IX86_BUILTIN_MOVLHPS,
26485 IX86_BUILTIN_LOADHPS,
26486 IX86_BUILTIN_LOADLPS,
26487 IX86_BUILTIN_STOREHPS,
26488 IX86_BUILTIN_STORELPS,
26490 IX86_BUILTIN_MASKMOVQ,
26491 IX86_BUILTIN_MOVMSKPS,
26492 IX86_BUILTIN_PMOVMSKB,
26494 IX86_BUILTIN_MOVNTPS,
26495 IX86_BUILTIN_MOVNTQ,
26497 IX86_BUILTIN_LOADDQU,
26498 IX86_BUILTIN_STOREDQU,
26500 IX86_BUILTIN_PACKSSWB,
26501 IX86_BUILTIN_PACKSSDW,
26502 IX86_BUILTIN_PACKUSWB,
26504 IX86_BUILTIN_PADDB,
26505 IX86_BUILTIN_PADDW,
26506 IX86_BUILTIN_PADDD,
26507 IX86_BUILTIN_PADDQ,
26508 IX86_BUILTIN_PADDSB,
26509 IX86_BUILTIN_PADDSW,
26510 IX86_BUILTIN_PADDUSB,
26511 IX86_BUILTIN_PADDUSW,
26512 IX86_BUILTIN_PSUBB,
26513 IX86_BUILTIN_PSUBW,
26514 IX86_BUILTIN_PSUBD,
26515 IX86_BUILTIN_PSUBQ,
26516 IX86_BUILTIN_PSUBSB,
26517 IX86_BUILTIN_PSUBSW,
26518 IX86_BUILTIN_PSUBUSB,
26519 IX86_BUILTIN_PSUBUSW,
26521 IX86_BUILTIN_PAND,
26522 IX86_BUILTIN_PANDN,
26523 IX86_BUILTIN_POR,
26524 IX86_BUILTIN_PXOR,
26526 IX86_BUILTIN_PAVGB,
26527 IX86_BUILTIN_PAVGW,
26529 IX86_BUILTIN_PCMPEQB,
26530 IX86_BUILTIN_PCMPEQW,
26531 IX86_BUILTIN_PCMPEQD,
26532 IX86_BUILTIN_PCMPGTB,
26533 IX86_BUILTIN_PCMPGTW,
26534 IX86_BUILTIN_PCMPGTD,
26536 IX86_BUILTIN_PMADDWD,
26538 IX86_BUILTIN_PMAXSW,
26539 IX86_BUILTIN_PMAXUB,
26540 IX86_BUILTIN_PMINSW,
26541 IX86_BUILTIN_PMINUB,
26543 IX86_BUILTIN_PMULHUW,
26544 IX86_BUILTIN_PMULHW,
26545 IX86_BUILTIN_PMULLW,
26547 IX86_BUILTIN_PSADBW,
26548 IX86_BUILTIN_PSHUFW,
26550 IX86_BUILTIN_PSLLW,
26551 IX86_BUILTIN_PSLLD,
26552 IX86_BUILTIN_PSLLQ,
26553 IX86_BUILTIN_PSRAW,
26554 IX86_BUILTIN_PSRAD,
26555 IX86_BUILTIN_PSRLW,
26556 IX86_BUILTIN_PSRLD,
26557 IX86_BUILTIN_PSRLQ,
26558 IX86_BUILTIN_PSLLWI,
26559 IX86_BUILTIN_PSLLDI,
26560 IX86_BUILTIN_PSLLQI,
26561 IX86_BUILTIN_PSRAWI,
26562 IX86_BUILTIN_PSRADI,
26563 IX86_BUILTIN_PSRLWI,
26564 IX86_BUILTIN_PSRLDI,
26565 IX86_BUILTIN_PSRLQI,
26567 IX86_BUILTIN_PUNPCKHBW,
26568 IX86_BUILTIN_PUNPCKHWD,
26569 IX86_BUILTIN_PUNPCKHDQ,
26570 IX86_BUILTIN_PUNPCKLBW,
26571 IX86_BUILTIN_PUNPCKLWD,
26572 IX86_BUILTIN_PUNPCKLDQ,
26574 IX86_BUILTIN_SHUFPS,
26576 IX86_BUILTIN_RCPPS,
26577 IX86_BUILTIN_RCPSS,
26578 IX86_BUILTIN_RSQRTPS,
26579 IX86_BUILTIN_RSQRTPS_NR,
26580 IX86_BUILTIN_RSQRTSS,
26581 IX86_BUILTIN_RSQRTF,
26582 IX86_BUILTIN_SQRTPS,
26583 IX86_BUILTIN_SQRTPS_NR,
26584 IX86_BUILTIN_SQRTSS,
26586 IX86_BUILTIN_UNPCKHPS,
26587 IX86_BUILTIN_UNPCKLPS,
26589 IX86_BUILTIN_ANDPS,
26590 IX86_BUILTIN_ANDNPS,
26591 IX86_BUILTIN_ORPS,
26592 IX86_BUILTIN_XORPS,
26594 IX86_BUILTIN_EMMS,
26595 IX86_BUILTIN_LDMXCSR,
26596 IX86_BUILTIN_STMXCSR,
26597 IX86_BUILTIN_SFENCE,
26599 IX86_BUILTIN_FXSAVE,
26600 IX86_BUILTIN_FXRSTOR,
26601 IX86_BUILTIN_FXSAVE64,
26602 IX86_BUILTIN_FXRSTOR64,
26604 IX86_BUILTIN_XSAVE,
26605 IX86_BUILTIN_XRSTOR,
26606 IX86_BUILTIN_XSAVE64,
26607 IX86_BUILTIN_XRSTOR64,
26609 IX86_BUILTIN_XSAVEOPT,
26610 IX86_BUILTIN_XSAVEOPT64,
26612 /* 3DNow! Original */
26613 IX86_BUILTIN_FEMMS,
26614 IX86_BUILTIN_PAVGUSB,
26615 IX86_BUILTIN_PF2ID,
26616 IX86_BUILTIN_PFACC,
26617 IX86_BUILTIN_PFADD,
26618 IX86_BUILTIN_PFCMPEQ,
26619 IX86_BUILTIN_PFCMPGE,
26620 IX86_BUILTIN_PFCMPGT,
26621 IX86_BUILTIN_PFMAX,
26622 IX86_BUILTIN_PFMIN,
26623 IX86_BUILTIN_PFMUL,
26624 IX86_BUILTIN_PFRCP,
26625 IX86_BUILTIN_PFRCPIT1,
26626 IX86_BUILTIN_PFRCPIT2,
26627 IX86_BUILTIN_PFRSQIT1,
26628 IX86_BUILTIN_PFRSQRT,
26629 IX86_BUILTIN_PFSUB,
26630 IX86_BUILTIN_PFSUBR,
26631 IX86_BUILTIN_PI2FD,
26632 IX86_BUILTIN_PMULHRW,
26634 /* 3DNow! Athlon Extensions */
26635 IX86_BUILTIN_PF2IW,
26636 IX86_BUILTIN_PFNACC,
26637 IX86_BUILTIN_PFPNACC,
26638 IX86_BUILTIN_PI2FW,
26639 IX86_BUILTIN_PSWAPDSI,
26640 IX86_BUILTIN_PSWAPDSF,
26642 /* SSE2 */
26643 IX86_BUILTIN_ADDPD,
26644 IX86_BUILTIN_ADDSD,
26645 IX86_BUILTIN_DIVPD,
26646 IX86_BUILTIN_DIVSD,
26647 IX86_BUILTIN_MULPD,
26648 IX86_BUILTIN_MULSD,
26649 IX86_BUILTIN_SUBPD,
26650 IX86_BUILTIN_SUBSD,
26652 IX86_BUILTIN_CMPEQPD,
26653 IX86_BUILTIN_CMPLTPD,
26654 IX86_BUILTIN_CMPLEPD,
26655 IX86_BUILTIN_CMPGTPD,
26656 IX86_BUILTIN_CMPGEPD,
26657 IX86_BUILTIN_CMPNEQPD,
26658 IX86_BUILTIN_CMPNLTPD,
26659 IX86_BUILTIN_CMPNLEPD,
26660 IX86_BUILTIN_CMPNGTPD,
26661 IX86_BUILTIN_CMPNGEPD,
26662 IX86_BUILTIN_CMPORDPD,
26663 IX86_BUILTIN_CMPUNORDPD,
26664 IX86_BUILTIN_CMPEQSD,
26665 IX86_BUILTIN_CMPLTSD,
26666 IX86_BUILTIN_CMPLESD,
26667 IX86_BUILTIN_CMPNEQSD,
26668 IX86_BUILTIN_CMPNLTSD,
26669 IX86_BUILTIN_CMPNLESD,
26670 IX86_BUILTIN_CMPORDSD,
26671 IX86_BUILTIN_CMPUNORDSD,
26673 IX86_BUILTIN_COMIEQSD,
26674 IX86_BUILTIN_COMILTSD,
26675 IX86_BUILTIN_COMILESD,
26676 IX86_BUILTIN_COMIGTSD,
26677 IX86_BUILTIN_COMIGESD,
26678 IX86_BUILTIN_COMINEQSD,
26679 IX86_BUILTIN_UCOMIEQSD,
26680 IX86_BUILTIN_UCOMILTSD,
26681 IX86_BUILTIN_UCOMILESD,
26682 IX86_BUILTIN_UCOMIGTSD,
26683 IX86_BUILTIN_UCOMIGESD,
26684 IX86_BUILTIN_UCOMINEQSD,
26686 IX86_BUILTIN_MAXPD,
26687 IX86_BUILTIN_MAXSD,
26688 IX86_BUILTIN_MINPD,
26689 IX86_BUILTIN_MINSD,
26691 IX86_BUILTIN_ANDPD,
26692 IX86_BUILTIN_ANDNPD,
26693 IX86_BUILTIN_ORPD,
26694 IX86_BUILTIN_XORPD,
26696 IX86_BUILTIN_SQRTPD,
26697 IX86_BUILTIN_SQRTSD,
26699 IX86_BUILTIN_UNPCKHPD,
26700 IX86_BUILTIN_UNPCKLPD,
26702 IX86_BUILTIN_SHUFPD,
26704 IX86_BUILTIN_LOADUPD,
26705 IX86_BUILTIN_STOREUPD,
26706 IX86_BUILTIN_MOVSD,
26708 IX86_BUILTIN_LOADHPD,
26709 IX86_BUILTIN_LOADLPD,
26711 IX86_BUILTIN_CVTDQ2PD,
26712 IX86_BUILTIN_CVTDQ2PS,
26714 IX86_BUILTIN_CVTPD2DQ,
26715 IX86_BUILTIN_CVTPD2PI,
26716 IX86_BUILTIN_CVTPD2PS,
26717 IX86_BUILTIN_CVTTPD2DQ,
26718 IX86_BUILTIN_CVTTPD2PI,
26720 IX86_BUILTIN_CVTPI2PD,
26721 IX86_BUILTIN_CVTSI2SD,
26722 IX86_BUILTIN_CVTSI642SD,
26724 IX86_BUILTIN_CVTSD2SI,
26725 IX86_BUILTIN_CVTSD2SI64,
26726 IX86_BUILTIN_CVTSD2SS,
26727 IX86_BUILTIN_CVTSS2SD,
26728 IX86_BUILTIN_CVTTSD2SI,
26729 IX86_BUILTIN_CVTTSD2SI64,
26731 IX86_BUILTIN_CVTPS2DQ,
26732 IX86_BUILTIN_CVTPS2PD,
26733 IX86_BUILTIN_CVTTPS2DQ,
26735 IX86_BUILTIN_MOVNTI,
26736 IX86_BUILTIN_MOVNTI64,
26737 IX86_BUILTIN_MOVNTPD,
26738 IX86_BUILTIN_MOVNTDQ,
26740 IX86_BUILTIN_MOVQ128,
26742 /* SSE2 MMX */
26743 IX86_BUILTIN_MASKMOVDQU,
26744 IX86_BUILTIN_MOVMSKPD,
26745 IX86_BUILTIN_PMOVMSKB128,
26747 IX86_BUILTIN_PACKSSWB128,
26748 IX86_BUILTIN_PACKSSDW128,
26749 IX86_BUILTIN_PACKUSWB128,
26751 IX86_BUILTIN_PADDB128,
26752 IX86_BUILTIN_PADDW128,
26753 IX86_BUILTIN_PADDD128,
26754 IX86_BUILTIN_PADDQ128,
26755 IX86_BUILTIN_PADDSB128,
26756 IX86_BUILTIN_PADDSW128,
26757 IX86_BUILTIN_PADDUSB128,
26758 IX86_BUILTIN_PADDUSW128,
26759 IX86_BUILTIN_PSUBB128,
26760 IX86_BUILTIN_PSUBW128,
26761 IX86_BUILTIN_PSUBD128,
26762 IX86_BUILTIN_PSUBQ128,
26763 IX86_BUILTIN_PSUBSB128,
26764 IX86_BUILTIN_PSUBSW128,
26765 IX86_BUILTIN_PSUBUSB128,
26766 IX86_BUILTIN_PSUBUSW128,
26768 IX86_BUILTIN_PAND128,
26769 IX86_BUILTIN_PANDN128,
26770 IX86_BUILTIN_POR128,
26771 IX86_BUILTIN_PXOR128,
26773 IX86_BUILTIN_PAVGB128,
26774 IX86_BUILTIN_PAVGW128,
26776 IX86_BUILTIN_PCMPEQB128,
26777 IX86_BUILTIN_PCMPEQW128,
26778 IX86_BUILTIN_PCMPEQD128,
26779 IX86_BUILTIN_PCMPGTB128,
26780 IX86_BUILTIN_PCMPGTW128,
26781 IX86_BUILTIN_PCMPGTD128,
26783 IX86_BUILTIN_PMADDWD128,
26785 IX86_BUILTIN_PMAXSW128,
26786 IX86_BUILTIN_PMAXUB128,
26787 IX86_BUILTIN_PMINSW128,
26788 IX86_BUILTIN_PMINUB128,
26790 IX86_BUILTIN_PMULUDQ,
26791 IX86_BUILTIN_PMULUDQ128,
26792 IX86_BUILTIN_PMULHUW128,
26793 IX86_BUILTIN_PMULHW128,
26794 IX86_BUILTIN_PMULLW128,
26796 IX86_BUILTIN_PSADBW128,
26797 IX86_BUILTIN_PSHUFHW,
26798 IX86_BUILTIN_PSHUFLW,
26799 IX86_BUILTIN_PSHUFD,
26801 IX86_BUILTIN_PSLLDQI128,
26802 IX86_BUILTIN_PSLLWI128,
26803 IX86_BUILTIN_PSLLDI128,
26804 IX86_BUILTIN_PSLLQI128,
26805 IX86_BUILTIN_PSRAWI128,
26806 IX86_BUILTIN_PSRADI128,
26807 IX86_BUILTIN_PSRLDQI128,
26808 IX86_BUILTIN_PSRLWI128,
26809 IX86_BUILTIN_PSRLDI128,
26810 IX86_BUILTIN_PSRLQI128,
26812 IX86_BUILTIN_PSLLDQ128,
26813 IX86_BUILTIN_PSLLW128,
26814 IX86_BUILTIN_PSLLD128,
26815 IX86_BUILTIN_PSLLQ128,
26816 IX86_BUILTIN_PSRAW128,
26817 IX86_BUILTIN_PSRAD128,
26818 IX86_BUILTIN_PSRLW128,
26819 IX86_BUILTIN_PSRLD128,
26820 IX86_BUILTIN_PSRLQ128,
26822 IX86_BUILTIN_PUNPCKHBW128,
26823 IX86_BUILTIN_PUNPCKHWD128,
26824 IX86_BUILTIN_PUNPCKHDQ128,
26825 IX86_BUILTIN_PUNPCKHQDQ128,
26826 IX86_BUILTIN_PUNPCKLBW128,
26827 IX86_BUILTIN_PUNPCKLWD128,
26828 IX86_BUILTIN_PUNPCKLDQ128,
26829 IX86_BUILTIN_PUNPCKLQDQ128,
26831 IX86_BUILTIN_CLFLUSH,
26832 IX86_BUILTIN_MFENCE,
26833 IX86_BUILTIN_LFENCE,
26834 IX86_BUILTIN_PAUSE,
26836 IX86_BUILTIN_BSRSI,
26837 IX86_BUILTIN_BSRDI,
26838 IX86_BUILTIN_RDPMC,
26839 IX86_BUILTIN_RDTSC,
26840 IX86_BUILTIN_RDTSCP,
26841 IX86_BUILTIN_ROLQI,
26842 IX86_BUILTIN_ROLHI,
26843 IX86_BUILTIN_RORQI,
26844 IX86_BUILTIN_RORHI,
26846 /* SSE3. */
26847 IX86_BUILTIN_ADDSUBPS,
26848 IX86_BUILTIN_HADDPS,
26849 IX86_BUILTIN_HSUBPS,
26850 IX86_BUILTIN_MOVSHDUP,
26851 IX86_BUILTIN_MOVSLDUP,
26852 IX86_BUILTIN_ADDSUBPD,
26853 IX86_BUILTIN_HADDPD,
26854 IX86_BUILTIN_HSUBPD,
26855 IX86_BUILTIN_LDDQU,
26857 IX86_BUILTIN_MONITOR,
26858 IX86_BUILTIN_MWAIT,
26860 /* SSSE3. */
26861 IX86_BUILTIN_PHADDW,
26862 IX86_BUILTIN_PHADDD,
26863 IX86_BUILTIN_PHADDSW,
26864 IX86_BUILTIN_PHSUBW,
26865 IX86_BUILTIN_PHSUBD,
26866 IX86_BUILTIN_PHSUBSW,
26867 IX86_BUILTIN_PMADDUBSW,
26868 IX86_BUILTIN_PMULHRSW,
26869 IX86_BUILTIN_PSHUFB,
26870 IX86_BUILTIN_PSIGNB,
26871 IX86_BUILTIN_PSIGNW,
26872 IX86_BUILTIN_PSIGND,
26873 IX86_BUILTIN_PALIGNR,
26874 IX86_BUILTIN_PABSB,
26875 IX86_BUILTIN_PABSW,
26876 IX86_BUILTIN_PABSD,
26878 IX86_BUILTIN_PHADDW128,
26879 IX86_BUILTIN_PHADDD128,
26880 IX86_BUILTIN_PHADDSW128,
26881 IX86_BUILTIN_PHSUBW128,
26882 IX86_BUILTIN_PHSUBD128,
26883 IX86_BUILTIN_PHSUBSW128,
26884 IX86_BUILTIN_PMADDUBSW128,
26885 IX86_BUILTIN_PMULHRSW128,
26886 IX86_BUILTIN_PSHUFB128,
26887 IX86_BUILTIN_PSIGNB128,
26888 IX86_BUILTIN_PSIGNW128,
26889 IX86_BUILTIN_PSIGND128,
26890 IX86_BUILTIN_PALIGNR128,
26891 IX86_BUILTIN_PABSB128,
26892 IX86_BUILTIN_PABSW128,
26893 IX86_BUILTIN_PABSD128,
26895 /* AMDFAM10 - SSE4A New Instructions. */
26896 IX86_BUILTIN_MOVNTSD,
26897 IX86_BUILTIN_MOVNTSS,
26898 IX86_BUILTIN_EXTRQI,
26899 IX86_BUILTIN_EXTRQ,
26900 IX86_BUILTIN_INSERTQI,
26901 IX86_BUILTIN_INSERTQ,
26903 /* SSE4.1. */
26904 IX86_BUILTIN_BLENDPD,
26905 IX86_BUILTIN_BLENDPS,
26906 IX86_BUILTIN_BLENDVPD,
26907 IX86_BUILTIN_BLENDVPS,
26908 IX86_BUILTIN_PBLENDVB128,
26909 IX86_BUILTIN_PBLENDW128,
26911 IX86_BUILTIN_DPPD,
26912 IX86_BUILTIN_DPPS,
26914 IX86_BUILTIN_INSERTPS128,
26916 IX86_BUILTIN_MOVNTDQA,
26917 IX86_BUILTIN_MPSADBW128,
26918 IX86_BUILTIN_PACKUSDW128,
26919 IX86_BUILTIN_PCMPEQQ,
26920 IX86_BUILTIN_PHMINPOSUW128,
26922 IX86_BUILTIN_PMAXSB128,
26923 IX86_BUILTIN_PMAXSD128,
26924 IX86_BUILTIN_PMAXUD128,
26925 IX86_BUILTIN_PMAXUW128,
26927 IX86_BUILTIN_PMINSB128,
26928 IX86_BUILTIN_PMINSD128,
26929 IX86_BUILTIN_PMINUD128,
26930 IX86_BUILTIN_PMINUW128,
26932 IX86_BUILTIN_PMOVSXBW128,
26933 IX86_BUILTIN_PMOVSXBD128,
26934 IX86_BUILTIN_PMOVSXBQ128,
26935 IX86_BUILTIN_PMOVSXWD128,
26936 IX86_BUILTIN_PMOVSXWQ128,
26937 IX86_BUILTIN_PMOVSXDQ128,
26939 IX86_BUILTIN_PMOVZXBW128,
26940 IX86_BUILTIN_PMOVZXBD128,
26941 IX86_BUILTIN_PMOVZXBQ128,
26942 IX86_BUILTIN_PMOVZXWD128,
26943 IX86_BUILTIN_PMOVZXWQ128,
26944 IX86_BUILTIN_PMOVZXDQ128,
26946 IX86_BUILTIN_PMULDQ128,
26947 IX86_BUILTIN_PMULLD128,
26949 IX86_BUILTIN_ROUNDSD,
26950 IX86_BUILTIN_ROUNDSS,
26952 IX86_BUILTIN_ROUNDPD,
26953 IX86_BUILTIN_ROUNDPS,
26955 IX86_BUILTIN_FLOORPD,
26956 IX86_BUILTIN_CEILPD,
26957 IX86_BUILTIN_TRUNCPD,
26958 IX86_BUILTIN_RINTPD,
26959 IX86_BUILTIN_ROUNDPD_AZ,
26961 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
26962 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
26963 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
26965 IX86_BUILTIN_FLOORPS,
26966 IX86_BUILTIN_CEILPS,
26967 IX86_BUILTIN_TRUNCPS,
26968 IX86_BUILTIN_RINTPS,
26969 IX86_BUILTIN_ROUNDPS_AZ,
26971 IX86_BUILTIN_FLOORPS_SFIX,
26972 IX86_BUILTIN_CEILPS_SFIX,
26973 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
26975 IX86_BUILTIN_PTESTZ,
26976 IX86_BUILTIN_PTESTC,
26977 IX86_BUILTIN_PTESTNZC,
26979 IX86_BUILTIN_VEC_INIT_V2SI,
26980 IX86_BUILTIN_VEC_INIT_V4HI,
26981 IX86_BUILTIN_VEC_INIT_V8QI,
26982 IX86_BUILTIN_VEC_EXT_V2DF,
26983 IX86_BUILTIN_VEC_EXT_V2DI,
26984 IX86_BUILTIN_VEC_EXT_V4SF,
26985 IX86_BUILTIN_VEC_EXT_V4SI,
26986 IX86_BUILTIN_VEC_EXT_V8HI,
26987 IX86_BUILTIN_VEC_EXT_V2SI,
26988 IX86_BUILTIN_VEC_EXT_V4HI,
26989 IX86_BUILTIN_VEC_EXT_V16QI,
26990 IX86_BUILTIN_VEC_SET_V2DI,
26991 IX86_BUILTIN_VEC_SET_V4SF,
26992 IX86_BUILTIN_VEC_SET_V4SI,
26993 IX86_BUILTIN_VEC_SET_V8HI,
26994 IX86_BUILTIN_VEC_SET_V4HI,
26995 IX86_BUILTIN_VEC_SET_V16QI,
26997 IX86_BUILTIN_VEC_PACK_SFIX,
26998 IX86_BUILTIN_VEC_PACK_SFIX256,
27000 /* SSE4.2. */
27001 IX86_BUILTIN_CRC32QI,
27002 IX86_BUILTIN_CRC32HI,
27003 IX86_BUILTIN_CRC32SI,
27004 IX86_BUILTIN_CRC32DI,
27006 IX86_BUILTIN_PCMPESTRI128,
27007 IX86_BUILTIN_PCMPESTRM128,
27008 IX86_BUILTIN_PCMPESTRA128,
27009 IX86_BUILTIN_PCMPESTRC128,
27010 IX86_BUILTIN_PCMPESTRO128,
27011 IX86_BUILTIN_PCMPESTRS128,
27012 IX86_BUILTIN_PCMPESTRZ128,
27013 IX86_BUILTIN_PCMPISTRI128,
27014 IX86_BUILTIN_PCMPISTRM128,
27015 IX86_BUILTIN_PCMPISTRA128,
27016 IX86_BUILTIN_PCMPISTRC128,
27017 IX86_BUILTIN_PCMPISTRO128,
27018 IX86_BUILTIN_PCMPISTRS128,
27019 IX86_BUILTIN_PCMPISTRZ128,
27021 IX86_BUILTIN_PCMPGTQ,
27023 /* AES instructions */
27024 IX86_BUILTIN_AESENC128,
27025 IX86_BUILTIN_AESENCLAST128,
27026 IX86_BUILTIN_AESDEC128,
27027 IX86_BUILTIN_AESDECLAST128,
27028 IX86_BUILTIN_AESIMC128,
27029 IX86_BUILTIN_AESKEYGENASSIST128,
27031 /* PCLMUL instruction */
27032 IX86_BUILTIN_PCLMULQDQ128,
27034 /* AVX */
27035 IX86_BUILTIN_ADDPD256,
27036 IX86_BUILTIN_ADDPS256,
27037 IX86_BUILTIN_ADDSUBPD256,
27038 IX86_BUILTIN_ADDSUBPS256,
27039 IX86_BUILTIN_ANDPD256,
27040 IX86_BUILTIN_ANDPS256,
27041 IX86_BUILTIN_ANDNPD256,
27042 IX86_BUILTIN_ANDNPS256,
27043 IX86_BUILTIN_BLENDPD256,
27044 IX86_BUILTIN_BLENDPS256,
27045 IX86_BUILTIN_BLENDVPD256,
27046 IX86_BUILTIN_BLENDVPS256,
27047 IX86_BUILTIN_DIVPD256,
27048 IX86_BUILTIN_DIVPS256,
27049 IX86_BUILTIN_DPPS256,
27050 IX86_BUILTIN_HADDPD256,
27051 IX86_BUILTIN_HADDPS256,
27052 IX86_BUILTIN_HSUBPD256,
27053 IX86_BUILTIN_HSUBPS256,
27054 IX86_BUILTIN_MAXPD256,
27055 IX86_BUILTIN_MAXPS256,
27056 IX86_BUILTIN_MINPD256,
27057 IX86_BUILTIN_MINPS256,
27058 IX86_BUILTIN_MULPD256,
27059 IX86_BUILTIN_MULPS256,
27060 IX86_BUILTIN_ORPD256,
27061 IX86_BUILTIN_ORPS256,
27062 IX86_BUILTIN_SHUFPD256,
27063 IX86_BUILTIN_SHUFPS256,
27064 IX86_BUILTIN_SUBPD256,
27065 IX86_BUILTIN_SUBPS256,
27066 IX86_BUILTIN_XORPD256,
27067 IX86_BUILTIN_XORPS256,
27068 IX86_BUILTIN_CMPSD,
27069 IX86_BUILTIN_CMPSS,
27070 IX86_BUILTIN_CMPPD,
27071 IX86_BUILTIN_CMPPS,
27072 IX86_BUILTIN_CMPPD256,
27073 IX86_BUILTIN_CMPPS256,
27074 IX86_BUILTIN_CVTDQ2PD256,
27075 IX86_BUILTIN_CVTDQ2PS256,
27076 IX86_BUILTIN_CVTPD2PS256,
27077 IX86_BUILTIN_CVTPS2DQ256,
27078 IX86_BUILTIN_CVTPS2PD256,
27079 IX86_BUILTIN_CVTTPD2DQ256,
27080 IX86_BUILTIN_CVTPD2DQ256,
27081 IX86_BUILTIN_CVTTPS2DQ256,
27082 IX86_BUILTIN_EXTRACTF128PD256,
27083 IX86_BUILTIN_EXTRACTF128PS256,
27084 IX86_BUILTIN_EXTRACTF128SI256,
27085 IX86_BUILTIN_VZEROALL,
27086 IX86_BUILTIN_VZEROUPPER,
27087 IX86_BUILTIN_VPERMILVARPD,
27088 IX86_BUILTIN_VPERMILVARPS,
27089 IX86_BUILTIN_VPERMILVARPD256,
27090 IX86_BUILTIN_VPERMILVARPS256,
27091 IX86_BUILTIN_VPERMILPD,
27092 IX86_BUILTIN_VPERMILPS,
27093 IX86_BUILTIN_VPERMILPD256,
27094 IX86_BUILTIN_VPERMILPS256,
27095 IX86_BUILTIN_VPERMIL2PD,
27096 IX86_BUILTIN_VPERMIL2PS,
27097 IX86_BUILTIN_VPERMIL2PD256,
27098 IX86_BUILTIN_VPERMIL2PS256,
27099 IX86_BUILTIN_VPERM2F128PD256,
27100 IX86_BUILTIN_VPERM2F128PS256,
27101 IX86_BUILTIN_VPERM2F128SI256,
27102 IX86_BUILTIN_VBROADCASTSS,
27103 IX86_BUILTIN_VBROADCASTSD256,
27104 IX86_BUILTIN_VBROADCASTSS256,
27105 IX86_BUILTIN_VBROADCASTPD256,
27106 IX86_BUILTIN_VBROADCASTPS256,
27107 IX86_BUILTIN_VINSERTF128PD256,
27108 IX86_BUILTIN_VINSERTF128PS256,
27109 IX86_BUILTIN_VINSERTF128SI256,
27110 IX86_BUILTIN_LOADUPD256,
27111 IX86_BUILTIN_LOADUPS256,
27112 IX86_BUILTIN_STOREUPD256,
27113 IX86_BUILTIN_STOREUPS256,
27114 IX86_BUILTIN_LDDQU256,
27115 IX86_BUILTIN_MOVNTDQ256,
27116 IX86_BUILTIN_MOVNTPD256,
27117 IX86_BUILTIN_MOVNTPS256,
27118 IX86_BUILTIN_LOADDQU256,
27119 IX86_BUILTIN_STOREDQU256,
27120 IX86_BUILTIN_MASKLOADPD,
27121 IX86_BUILTIN_MASKLOADPS,
27122 IX86_BUILTIN_MASKSTOREPD,
27123 IX86_BUILTIN_MASKSTOREPS,
27124 IX86_BUILTIN_MASKLOADPD256,
27125 IX86_BUILTIN_MASKLOADPS256,
27126 IX86_BUILTIN_MASKSTOREPD256,
27127 IX86_BUILTIN_MASKSTOREPS256,
27128 IX86_BUILTIN_MOVSHDUP256,
27129 IX86_BUILTIN_MOVSLDUP256,
27130 IX86_BUILTIN_MOVDDUP256,
27132 IX86_BUILTIN_SQRTPD256,
27133 IX86_BUILTIN_SQRTPS256,
27134 IX86_BUILTIN_SQRTPS_NR256,
27135 IX86_BUILTIN_RSQRTPS256,
27136 IX86_BUILTIN_RSQRTPS_NR256,
27138 IX86_BUILTIN_RCPPS256,
27140 IX86_BUILTIN_ROUNDPD256,
27141 IX86_BUILTIN_ROUNDPS256,
27143 IX86_BUILTIN_FLOORPD256,
27144 IX86_BUILTIN_CEILPD256,
27145 IX86_BUILTIN_TRUNCPD256,
27146 IX86_BUILTIN_RINTPD256,
27147 IX86_BUILTIN_ROUNDPD_AZ256,
27149 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
27150 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
27151 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
27153 IX86_BUILTIN_FLOORPS256,
27154 IX86_BUILTIN_CEILPS256,
27155 IX86_BUILTIN_TRUNCPS256,
27156 IX86_BUILTIN_RINTPS256,
27157 IX86_BUILTIN_ROUNDPS_AZ256,
27159 IX86_BUILTIN_FLOORPS_SFIX256,
27160 IX86_BUILTIN_CEILPS_SFIX256,
27161 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
27163 IX86_BUILTIN_UNPCKHPD256,
27164 IX86_BUILTIN_UNPCKLPD256,
27165 IX86_BUILTIN_UNPCKHPS256,
27166 IX86_BUILTIN_UNPCKLPS256,
27168 IX86_BUILTIN_SI256_SI,
27169 IX86_BUILTIN_PS256_PS,
27170 IX86_BUILTIN_PD256_PD,
27171 IX86_BUILTIN_SI_SI256,
27172 IX86_BUILTIN_PS_PS256,
27173 IX86_BUILTIN_PD_PD256,
27175 IX86_BUILTIN_VTESTZPD,
27176 IX86_BUILTIN_VTESTCPD,
27177 IX86_BUILTIN_VTESTNZCPD,
27178 IX86_BUILTIN_VTESTZPS,
27179 IX86_BUILTIN_VTESTCPS,
27180 IX86_BUILTIN_VTESTNZCPS,
27181 IX86_BUILTIN_VTESTZPD256,
27182 IX86_BUILTIN_VTESTCPD256,
27183 IX86_BUILTIN_VTESTNZCPD256,
27184 IX86_BUILTIN_VTESTZPS256,
27185 IX86_BUILTIN_VTESTCPS256,
27186 IX86_BUILTIN_VTESTNZCPS256,
27187 IX86_BUILTIN_PTESTZ256,
27188 IX86_BUILTIN_PTESTC256,
27189 IX86_BUILTIN_PTESTNZC256,
27191 IX86_BUILTIN_MOVMSKPD256,
27192 IX86_BUILTIN_MOVMSKPS256,
27194 /* AVX2 */
27195 IX86_BUILTIN_MPSADBW256,
27196 IX86_BUILTIN_PABSB256,
27197 IX86_BUILTIN_PABSW256,
27198 IX86_BUILTIN_PABSD256,
27199 IX86_BUILTIN_PACKSSDW256,
27200 IX86_BUILTIN_PACKSSWB256,
27201 IX86_BUILTIN_PACKUSDW256,
27202 IX86_BUILTIN_PACKUSWB256,
27203 IX86_BUILTIN_PADDB256,
27204 IX86_BUILTIN_PADDW256,
27205 IX86_BUILTIN_PADDD256,
27206 IX86_BUILTIN_PADDQ256,
27207 IX86_BUILTIN_PADDSB256,
27208 IX86_BUILTIN_PADDSW256,
27209 IX86_BUILTIN_PADDUSB256,
27210 IX86_BUILTIN_PADDUSW256,
27211 IX86_BUILTIN_PALIGNR256,
27212 IX86_BUILTIN_AND256I,
27213 IX86_BUILTIN_ANDNOT256I,
27214 IX86_BUILTIN_PAVGB256,
27215 IX86_BUILTIN_PAVGW256,
27216 IX86_BUILTIN_PBLENDVB256,
27217 IX86_BUILTIN_PBLENDVW256,
27218 IX86_BUILTIN_PCMPEQB256,
27219 IX86_BUILTIN_PCMPEQW256,
27220 IX86_BUILTIN_PCMPEQD256,
27221 IX86_BUILTIN_PCMPEQQ256,
27222 IX86_BUILTIN_PCMPGTB256,
27223 IX86_BUILTIN_PCMPGTW256,
27224 IX86_BUILTIN_PCMPGTD256,
27225 IX86_BUILTIN_PCMPGTQ256,
27226 IX86_BUILTIN_PHADDW256,
27227 IX86_BUILTIN_PHADDD256,
27228 IX86_BUILTIN_PHADDSW256,
27229 IX86_BUILTIN_PHSUBW256,
27230 IX86_BUILTIN_PHSUBD256,
27231 IX86_BUILTIN_PHSUBSW256,
27232 IX86_BUILTIN_PMADDUBSW256,
27233 IX86_BUILTIN_PMADDWD256,
27234 IX86_BUILTIN_PMAXSB256,
27235 IX86_BUILTIN_PMAXSW256,
27236 IX86_BUILTIN_PMAXSD256,
27237 IX86_BUILTIN_PMAXUB256,
27238 IX86_BUILTIN_PMAXUW256,
27239 IX86_BUILTIN_PMAXUD256,
27240 IX86_BUILTIN_PMINSB256,
27241 IX86_BUILTIN_PMINSW256,
27242 IX86_BUILTIN_PMINSD256,
27243 IX86_BUILTIN_PMINUB256,
27244 IX86_BUILTIN_PMINUW256,
27245 IX86_BUILTIN_PMINUD256,
27246 IX86_BUILTIN_PMOVMSKB256,
27247 IX86_BUILTIN_PMOVSXBW256,
27248 IX86_BUILTIN_PMOVSXBD256,
27249 IX86_BUILTIN_PMOVSXBQ256,
27250 IX86_BUILTIN_PMOVSXWD256,
27251 IX86_BUILTIN_PMOVSXWQ256,
27252 IX86_BUILTIN_PMOVSXDQ256,
27253 IX86_BUILTIN_PMOVZXBW256,
27254 IX86_BUILTIN_PMOVZXBD256,
27255 IX86_BUILTIN_PMOVZXBQ256,
27256 IX86_BUILTIN_PMOVZXWD256,
27257 IX86_BUILTIN_PMOVZXWQ256,
27258 IX86_BUILTIN_PMOVZXDQ256,
27259 IX86_BUILTIN_PMULDQ256,
27260 IX86_BUILTIN_PMULHRSW256,
27261 IX86_BUILTIN_PMULHUW256,
27262 IX86_BUILTIN_PMULHW256,
27263 IX86_BUILTIN_PMULLW256,
27264 IX86_BUILTIN_PMULLD256,
27265 IX86_BUILTIN_PMULUDQ256,
27266 IX86_BUILTIN_POR256,
27267 IX86_BUILTIN_PSADBW256,
27268 IX86_BUILTIN_PSHUFB256,
27269 IX86_BUILTIN_PSHUFD256,
27270 IX86_BUILTIN_PSHUFHW256,
27271 IX86_BUILTIN_PSHUFLW256,
27272 IX86_BUILTIN_PSIGNB256,
27273 IX86_BUILTIN_PSIGNW256,
27274 IX86_BUILTIN_PSIGND256,
27275 IX86_BUILTIN_PSLLDQI256,
27276 IX86_BUILTIN_PSLLWI256,
27277 IX86_BUILTIN_PSLLW256,
27278 IX86_BUILTIN_PSLLDI256,
27279 IX86_BUILTIN_PSLLD256,
27280 IX86_BUILTIN_PSLLQI256,
27281 IX86_BUILTIN_PSLLQ256,
27282 IX86_BUILTIN_PSRAWI256,
27283 IX86_BUILTIN_PSRAW256,
27284 IX86_BUILTIN_PSRADI256,
27285 IX86_BUILTIN_PSRAD256,
27286 IX86_BUILTIN_PSRLDQI256,
27287 IX86_BUILTIN_PSRLWI256,
27288 IX86_BUILTIN_PSRLW256,
27289 IX86_BUILTIN_PSRLDI256,
27290 IX86_BUILTIN_PSRLD256,
27291 IX86_BUILTIN_PSRLQI256,
27292 IX86_BUILTIN_PSRLQ256,
27293 IX86_BUILTIN_PSUBB256,
27294 IX86_BUILTIN_PSUBW256,
27295 IX86_BUILTIN_PSUBD256,
27296 IX86_BUILTIN_PSUBQ256,
27297 IX86_BUILTIN_PSUBSB256,
27298 IX86_BUILTIN_PSUBSW256,
27299 IX86_BUILTIN_PSUBUSB256,
27300 IX86_BUILTIN_PSUBUSW256,
27301 IX86_BUILTIN_PUNPCKHBW256,
27302 IX86_BUILTIN_PUNPCKHWD256,
27303 IX86_BUILTIN_PUNPCKHDQ256,
27304 IX86_BUILTIN_PUNPCKHQDQ256,
27305 IX86_BUILTIN_PUNPCKLBW256,
27306 IX86_BUILTIN_PUNPCKLWD256,
27307 IX86_BUILTIN_PUNPCKLDQ256,
27308 IX86_BUILTIN_PUNPCKLQDQ256,
27309 IX86_BUILTIN_PXOR256,
27310 IX86_BUILTIN_MOVNTDQA256,
27311 IX86_BUILTIN_VBROADCASTSS_PS,
27312 IX86_BUILTIN_VBROADCASTSS_PS256,
27313 IX86_BUILTIN_VBROADCASTSD_PD256,
27314 IX86_BUILTIN_VBROADCASTSI256,
27315 IX86_BUILTIN_PBLENDD256,
27316 IX86_BUILTIN_PBLENDD128,
27317 IX86_BUILTIN_PBROADCASTB256,
27318 IX86_BUILTIN_PBROADCASTW256,
27319 IX86_BUILTIN_PBROADCASTD256,
27320 IX86_BUILTIN_PBROADCASTQ256,
27321 IX86_BUILTIN_PBROADCASTB128,
27322 IX86_BUILTIN_PBROADCASTW128,
27323 IX86_BUILTIN_PBROADCASTD128,
27324 IX86_BUILTIN_PBROADCASTQ128,
27325 IX86_BUILTIN_VPERMVARSI256,
27326 IX86_BUILTIN_VPERMDF256,
27327 IX86_BUILTIN_VPERMVARSF256,
27328 IX86_BUILTIN_VPERMDI256,
27329 IX86_BUILTIN_VPERMTI256,
27330 IX86_BUILTIN_VEXTRACT128I256,
27331 IX86_BUILTIN_VINSERT128I256,
27332 IX86_BUILTIN_MASKLOADD,
27333 IX86_BUILTIN_MASKLOADQ,
27334 IX86_BUILTIN_MASKLOADD256,
27335 IX86_BUILTIN_MASKLOADQ256,
27336 IX86_BUILTIN_MASKSTORED,
27337 IX86_BUILTIN_MASKSTOREQ,
27338 IX86_BUILTIN_MASKSTORED256,
27339 IX86_BUILTIN_MASKSTOREQ256,
27340 IX86_BUILTIN_PSLLVV4DI,
27341 IX86_BUILTIN_PSLLVV2DI,
27342 IX86_BUILTIN_PSLLVV8SI,
27343 IX86_BUILTIN_PSLLVV4SI,
27344 IX86_BUILTIN_PSRAVV8SI,
27345 IX86_BUILTIN_PSRAVV4SI,
27346 IX86_BUILTIN_PSRLVV4DI,
27347 IX86_BUILTIN_PSRLVV2DI,
27348 IX86_BUILTIN_PSRLVV8SI,
27349 IX86_BUILTIN_PSRLVV4SI,
27351 IX86_BUILTIN_GATHERSIV2DF,
27352 IX86_BUILTIN_GATHERSIV4DF,
27353 IX86_BUILTIN_GATHERDIV2DF,
27354 IX86_BUILTIN_GATHERDIV4DF,
27355 IX86_BUILTIN_GATHERSIV4SF,
27356 IX86_BUILTIN_GATHERSIV8SF,
27357 IX86_BUILTIN_GATHERDIV4SF,
27358 IX86_BUILTIN_GATHERDIV8SF,
27359 IX86_BUILTIN_GATHERSIV2DI,
27360 IX86_BUILTIN_GATHERSIV4DI,
27361 IX86_BUILTIN_GATHERDIV2DI,
27362 IX86_BUILTIN_GATHERDIV4DI,
27363 IX86_BUILTIN_GATHERSIV4SI,
27364 IX86_BUILTIN_GATHERSIV8SI,
27365 IX86_BUILTIN_GATHERDIV4SI,
27366 IX86_BUILTIN_GATHERDIV8SI,
27368 /* Alternate 4 element gather for the vectorizer where
27369 all operands are 32-byte wide. */
27370 IX86_BUILTIN_GATHERALTSIV4DF,
27371 IX86_BUILTIN_GATHERALTDIV8SF,
27372 IX86_BUILTIN_GATHERALTSIV4DI,
27373 IX86_BUILTIN_GATHERALTDIV8SI,
27375 /* TFmode support builtins. */
27376 IX86_BUILTIN_INFQ,
27377 IX86_BUILTIN_HUGE_VALQ,
27378 IX86_BUILTIN_FABSQ,
27379 IX86_BUILTIN_COPYSIGNQ,
27381 /* Vectorizer support builtins. */
27382 IX86_BUILTIN_CPYSGNPS,
27383 IX86_BUILTIN_CPYSGNPD,
27384 IX86_BUILTIN_CPYSGNPS256,
27385 IX86_BUILTIN_CPYSGNPD256,
27387 /* FMA4 instructions. */
27388 IX86_BUILTIN_VFMADDSS,
27389 IX86_BUILTIN_VFMADDSD,
27390 IX86_BUILTIN_VFMADDPS,
27391 IX86_BUILTIN_VFMADDPD,
27392 IX86_BUILTIN_VFMADDPS256,
27393 IX86_BUILTIN_VFMADDPD256,
27394 IX86_BUILTIN_VFMADDSUBPS,
27395 IX86_BUILTIN_VFMADDSUBPD,
27396 IX86_BUILTIN_VFMADDSUBPS256,
27397 IX86_BUILTIN_VFMADDSUBPD256,
27399 /* FMA3 instructions. */
27400 IX86_BUILTIN_VFMADDSS3,
27401 IX86_BUILTIN_VFMADDSD3,
27403 /* XOP instructions. */
27404 IX86_BUILTIN_VPCMOV,
27405 IX86_BUILTIN_VPCMOV_V2DI,
27406 IX86_BUILTIN_VPCMOV_V4SI,
27407 IX86_BUILTIN_VPCMOV_V8HI,
27408 IX86_BUILTIN_VPCMOV_V16QI,
27409 IX86_BUILTIN_VPCMOV_V4SF,
27410 IX86_BUILTIN_VPCMOV_V2DF,
27411 IX86_BUILTIN_VPCMOV256,
27412 IX86_BUILTIN_VPCMOV_V4DI256,
27413 IX86_BUILTIN_VPCMOV_V8SI256,
27414 IX86_BUILTIN_VPCMOV_V16HI256,
27415 IX86_BUILTIN_VPCMOV_V32QI256,
27416 IX86_BUILTIN_VPCMOV_V8SF256,
27417 IX86_BUILTIN_VPCMOV_V4DF256,
27419 IX86_BUILTIN_VPPERM,
27421 IX86_BUILTIN_VPMACSSWW,
27422 IX86_BUILTIN_VPMACSWW,
27423 IX86_BUILTIN_VPMACSSWD,
27424 IX86_BUILTIN_VPMACSWD,
27425 IX86_BUILTIN_VPMACSSDD,
27426 IX86_BUILTIN_VPMACSDD,
27427 IX86_BUILTIN_VPMACSSDQL,
27428 IX86_BUILTIN_VPMACSSDQH,
27429 IX86_BUILTIN_VPMACSDQL,
27430 IX86_BUILTIN_VPMACSDQH,
27431 IX86_BUILTIN_VPMADCSSWD,
27432 IX86_BUILTIN_VPMADCSWD,
27434 IX86_BUILTIN_VPHADDBW,
27435 IX86_BUILTIN_VPHADDBD,
27436 IX86_BUILTIN_VPHADDBQ,
27437 IX86_BUILTIN_VPHADDWD,
27438 IX86_BUILTIN_VPHADDWQ,
27439 IX86_BUILTIN_VPHADDDQ,
27440 IX86_BUILTIN_VPHADDUBW,
27441 IX86_BUILTIN_VPHADDUBD,
27442 IX86_BUILTIN_VPHADDUBQ,
27443 IX86_BUILTIN_VPHADDUWD,
27444 IX86_BUILTIN_VPHADDUWQ,
27445 IX86_BUILTIN_VPHADDUDQ,
27446 IX86_BUILTIN_VPHSUBBW,
27447 IX86_BUILTIN_VPHSUBWD,
27448 IX86_BUILTIN_VPHSUBDQ,
27450 IX86_BUILTIN_VPROTB,
27451 IX86_BUILTIN_VPROTW,
27452 IX86_BUILTIN_VPROTD,
27453 IX86_BUILTIN_VPROTQ,
27454 IX86_BUILTIN_VPROTB_IMM,
27455 IX86_BUILTIN_VPROTW_IMM,
27456 IX86_BUILTIN_VPROTD_IMM,
27457 IX86_BUILTIN_VPROTQ_IMM,
27459 IX86_BUILTIN_VPSHLB,
27460 IX86_BUILTIN_VPSHLW,
27461 IX86_BUILTIN_VPSHLD,
27462 IX86_BUILTIN_VPSHLQ,
27463 IX86_BUILTIN_VPSHAB,
27464 IX86_BUILTIN_VPSHAW,
27465 IX86_BUILTIN_VPSHAD,
27466 IX86_BUILTIN_VPSHAQ,
27468 IX86_BUILTIN_VFRCZSS,
27469 IX86_BUILTIN_VFRCZSD,
27470 IX86_BUILTIN_VFRCZPS,
27471 IX86_BUILTIN_VFRCZPD,
27472 IX86_BUILTIN_VFRCZPS256,
27473 IX86_BUILTIN_VFRCZPD256,
27475 IX86_BUILTIN_VPCOMEQUB,
27476 IX86_BUILTIN_VPCOMNEUB,
27477 IX86_BUILTIN_VPCOMLTUB,
27478 IX86_BUILTIN_VPCOMLEUB,
27479 IX86_BUILTIN_VPCOMGTUB,
27480 IX86_BUILTIN_VPCOMGEUB,
27481 IX86_BUILTIN_VPCOMFALSEUB,
27482 IX86_BUILTIN_VPCOMTRUEUB,
27484 IX86_BUILTIN_VPCOMEQUW,
27485 IX86_BUILTIN_VPCOMNEUW,
27486 IX86_BUILTIN_VPCOMLTUW,
27487 IX86_BUILTIN_VPCOMLEUW,
27488 IX86_BUILTIN_VPCOMGTUW,
27489 IX86_BUILTIN_VPCOMGEUW,
27490 IX86_BUILTIN_VPCOMFALSEUW,
27491 IX86_BUILTIN_VPCOMTRUEUW,
27493 IX86_BUILTIN_VPCOMEQUD,
27494 IX86_BUILTIN_VPCOMNEUD,
27495 IX86_BUILTIN_VPCOMLTUD,
27496 IX86_BUILTIN_VPCOMLEUD,
27497 IX86_BUILTIN_VPCOMGTUD,
27498 IX86_BUILTIN_VPCOMGEUD,
27499 IX86_BUILTIN_VPCOMFALSEUD,
27500 IX86_BUILTIN_VPCOMTRUEUD,
27502 IX86_BUILTIN_VPCOMEQUQ,
27503 IX86_BUILTIN_VPCOMNEUQ,
27504 IX86_BUILTIN_VPCOMLTUQ,
27505 IX86_BUILTIN_VPCOMLEUQ,
27506 IX86_BUILTIN_VPCOMGTUQ,
27507 IX86_BUILTIN_VPCOMGEUQ,
27508 IX86_BUILTIN_VPCOMFALSEUQ,
27509 IX86_BUILTIN_VPCOMTRUEUQ,
27511 IX86_BUILTIN_VPCOMEQB,
27512 IX86_BUILTIN_VPCOMNEB,
27513 IX86_BUILTIN_VPCOMLTB,
27514 IX86_BUILTIN_VPCOMLEB,
27515 IX86_BUILTIN_VPCOMGTB,
27516 IX86_BUILTIN_VPCOMGEB,
27517 IX86_BUILTIN_VPCOMFALSEB,
27518 IX86_BUILTIN_VPCOMTRUEB,
27520 IX86_BUILTIN_VPCOMEQW,
27521 IX86_BUILTIN_VPCOMNEW,
27522 IX86_BUILTIN_VPCOMLTW,
27523 IX86_BUILTIN_VPCOMLEW,
27524 IX86_BUILTIN_VPCOMGTW,
27525 IX86_BUILTIN_VPCOMGEW,
27526 IX86_BUILTIN_VPCOMFALSEW,
27527 IX86_BUILTIN_VPCOMTRUEW,
27529 IX86_BUILTIN_VPCOMEQD,
27530 IX86_BUILTIN_VPCOMNED,
27531 IX86_BUILTIN_VPCOMLTD,
27532 IX86_BUILTIN_VPCOMLED,
27533 IX86_BUILTIN_VPCOMGTD,
27534 IX86_BUILTIN_VPCOMGED,
27535 IX86_BUILTIN_VPCOMFALSED,
27536 IX86_BUILTIN_VPCOMTRUED,
27538 IX86_BUILTIN_VPCOMEQQ,
27539 IX86_BUILTIN_VPCOMNEQ,
27540 IX86_BUILTIN_VPCOMLTQ,
27541 IX86_BUILTIN_VPCOMLEQ,
27542 IX86_BUILTIN_VPCOMGTQ,
27543 IX86_BUILTIN_VPCOMGEQ,
27544 IX86_BUILTIN_VPCOMFALSEQ,
27545 IX86_BUILTIN_VPCOMTRUEQ,
27547 /* LWP instructions. */
27548 IX86_BUILTIN_LLWPCB,
27549 IX86_BUILTIN_SLWPCB,
27550 IX86_BUILTIN_LWPVAL32,
27551 IX86_BUILTIN_LWPVAL64,
27552 IX86_BUILTIN_LWPINS32,
27553 IX86_BUILTIN_LWPINS64,
27555 IX86_BUILTIN_CLZS,
27557 /* RTM */
27558 IX86_BUILTIN_XBEGIN,
27559 IX86_BUILTIN_XEND,
27560 IX86_BUILTIN_XABORT,
27561 IX86_BUILTIN_XTEST,
27563 /* BMI instructions. */
27564 IX86_BUILTIN_BEXTR32,
27565 IX86_BUILTIN_BEXTR64,
27566 IX86_BUILTIN_CTZS,
27568 /* TBM instructions. */
27569 IX86_BUILTIN_BEXTRI32,
27570 IX86_BUILTIN_BEXTRI64,
27572 /* BMI2 instructions. */
27573 IX86_BUILTIN_BZHI32,
27574 IX86_BUILTIN_BZHI64,
27575 IX86_BUILTIN_PDEP32,
27576 IX86_BUILTIN_PDEP64,
27577 IX86_BUILTIN_PEXT32,
27578 IX86_BUILTIN_PEXT64,
27580 /* ADX instructions. */
27581 IX86_BUILTIN_ADDCARRYX32,
27582 IX86_BUILTIN_ADDCARRYX64,
27584 /* FSGSBASE instructions. */
27585 IX86_BUILTIN_RDFSBASE32,
27586 IX86_BUILTIN_RDFSBASE64,
27587 IX86_BUILTIN_RDGSBASE32,
27588 IX86_BUILTIN_RDGSBASE64,
27589 IX86_BUILTIN_WRFSBASE32,
27590 IX86_BUILTIN_WRFSBASE64,
27591 IX86_BUILTIN_WRGSBASE32,
27592 IX86_BUILTIN_WRGSBASE64,
27594 /* RDRND instructions. */
27595 IX86_BUILTIN_RDRAND16_STEP,
27596 IX86_BUILTIN_RDRAND32_STEP,
27597 IX86_BUILTIN_RDRAND64_STEP,
27599 /* RDSEED instructions. */
27600 IX86_BUILTIN_RDSEED16_STEP,
27601 IX86_BUILTIN_RDSEED32_STEP,
27602 IX86_BUILTIN_RDSEED64_STEP,
27604 /* F16C instructions. */
27605 IX86_BUILTIN_CVTPH2PS,
27606 IX86_BUILTIN_CVTPH2PS256,
27607 IX86_BUILTIN_CVTPS2PH,
27608 IX86_BUILTIN_CVTPS2PH256,
27610 /* CFString built-in for darwin */
27611 IX86_BUILTIN_CFSTRING,
27613 /* Builtins to get CPU type and supported features. */
27614 IX86_BUILTIN_CPU_INIT,
27615 IX86_BUILTIN_CPU_IS,
27616 IX86_BUILTIN_CPU_SUPPORTS,
27618 IX86_BUILTIN_MAX
27619 };
27621 /* Table for the ix86 builtin decls. */
27622 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
27624 /* Table of all of the builtin functions that are possible with different ISAs
27625 but are waiting to be built until a function is declared to use that
27626 ISA. */
27627 struct builtin_isa {
27628 const char *name; /* function name */
27629 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
27630 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
27631 bool const_p; /* true if the declaration is constant */
27632 bool set_and_not_built_p;
27633 };
27635 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
27638 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
27639 of which isa_flags to use in the ix86_builtins_isa array. Stores the
27640 function decl in the ix86_builtins array. Returns the function decl or
27641 NULL_TREE if the builtin was not added.
27643 If the front end has a special hook for builtin functions, delay adding
27644 builtin functions that aren't in the current ISA until the ISA is changed
27645 with function specific optimization. Doing so can save about 300K for the
27646 default compiler. When the builtin is expanded, check at that time whether
27647 it is valid.
27649 If the front end doesn't have a special hook, record all builtins, even if
27650 they aren't part of the current ISA, in case the user uses
27651 function specific options for a different ISA, so that we don't get scope
27652 errors if a builtin is added in the middle of a function scope. */
27654 static inline tree
27655 def_builtin (HOST_WIDE_INT mask, const char *name,
27656 enum ix86_builtin_func_type tcode,
27657 enum ix86_builtins code)
27658 {
27659 tree decl = NULL_TREE;
27661 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
27662 {
27663 ix86_builtins_isa[(int) code].isa = mask;
27665 mask &= ~OPTION_MASK_ISA_64BIT;
27666 if (mask == 0
27667 || (mask & ix86_isa_flags) != 0
27668 || (lang_hooks.builtin_function
27669 == lang_hooks.builtin_function_ext_scope))
27671 {
27672 tree type = ix86_get_builtin_func_type (tcode);
27673 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
27674 NULL, NULL_TREE);
27675 ix86_builtins[(int) code] = decl;
27676 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
27677 }
27678 else
27679 {
27680 ix86_builtins[(int) code] = NULL_TREE;
27681 ix86_builtins_isa[(int) code].tcode = tcode;
27682 ix86_builtins_isa[(int) code].name = name;
27683 ix86_builtins_isa[(int) code].const_p = false;
27684 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
27685 }
27686 }
27688 return decl;
27689 }
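/* Illustration only (a sketch, not taken from this hunk): a typical
   registration call made further down in this file, e.g. from
   ix86_init_mmx_sse_builtins, looks roughly like

     def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
                  VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);

   When the mask is already enabled in ix86_isa_flags the decl is built
   immediately; otherwise the name/type/mask are parked in ix86_builtins_isa
   and the decl is created on demand by ix86_add_new_builtins below.  */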
27691 /* Like def_builtin, but also marks the function decl "const". */
27693 static inline tree
27694 def_builtin_const (HOST_WIDE_INT mask, const char *name,
27695 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
27696 {
27697 tree decl = def_builtin (mask, name, tcode, code);
27698 if (decl)
27699 TREE_READONLY (decl) = 1;
27700 else
27701 ix86_builtins_isa[(int) code].const_p = true;
27703 return decl;
27704 }
27706 /* Add any new builtin functions for a given ISA that may not have been
27707 declared. This saves a bit of space compared to adding all of the
27708 declarations to the tree, even if we didn't use them. */
27710 static void
27711 ix86_add_new_builtins (HOST_WIDE_INT isa)
27712 {
27713 int i;
27715 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
27716 {
27717 if ((ix86_builtins_isa[i].isa & isa) != 0
27718 && ix86_builtins_isa[i].set_and_not_built_p)
27719 {
27720 tree decl, type;
27722 /* Don't define the builtin again. */
27723 ix86_builtins_isa[i].set_and_not_built_p = false;
27725 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
27726 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
27727 type, i, BUILT_IN_MD, NULL,
27728 NULL_TREE);
27730 ix86_builtins[i] = decl;
27731 if (ix86_builtins_isa[i].const_p)
27732 TREE_READONLY (decl) = 1;
27733 }
27734 }
27735 }
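/* Illustration only (a sketch under the usual target-attribute flow, not
   taken from this hunk): the deferred decls parked above are materialized
   when the ISA set changes for a function, for example

     __attribute__((target ("avx2")))
     __m256i add8 (__m256i a, __m256i b) { return _mm256_add_epi32 (a, b); }

   compiled with a plain -msse2 command line re-computes ix86_isa_flags for
   that function, and the option-handling code calls ix86_add_new_builtins
   with the new flags so the AVX2 builtins recorded in ix86_builtins_isa get
   real decls before the intrinsic is expanded.  */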
27737 /* Bits for builtin_description.flag. */
27739 /* Set when we don't support the comparison natively, and should
27740 swap_comparison in order to support it. */
27741 #define BUILTIN_DESC_SWAP_OPERANDS 1
27743 struct builtin_description
27744 {
27745 const HOST_WIDE_INT mask;
27746 const enum insn_code icode;
27747 const char *const name;
27748 const enum ix86_builtins code;
27749 const enum rtx_code comparison;
27750 const int flag;
27751 };
27753 static const struct builtin_description bdesc_comi[] =
27754 {
27755 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
27756 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
27757 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
27758 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
27759 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
27760 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
27761 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
27762 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
27763 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
27764 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
27765 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
27766 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
27767 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
27768 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
27769 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
27770 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
27771 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
27772 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
27773 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
27774 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
27775 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
27776 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
27777 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
27778 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
27779 };
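/* Illustration only (a sketch, assuming the usual vector typedef): each
   bdesc_comi row becomes an int-valued builtin, used at the source level as

     typedef float v4sf __attribute__ ((vector_size (16)));
     int ordered_eq (v4sf a, v4sf b)
     {
       return __builtin_ia32_comieq (a, b);
     }

   The rtx_code column (UNEQ, UNLT, GT, ...) tells the expander which
   condition to test on the flags produced by the comi/ucomi pattern.  */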
27781 static const struct builtin_description bdesc_pcmpestr[] =
27782 {
27783 /* SSE4.2 */
27784 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
27785 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
27786 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
27787 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
27788 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
27789 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
27790 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
27791 };
27793 static const struct builtin_description bdesc_pcmpistr[] =
27794 {
27795 /* SSE4.2 */
27796 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
27797 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
27798 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
27799 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
27800 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
27801 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
27802 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
27803 };
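/* Illustration only (a sketch with an assumed control byte): the pcmpistr/
   pcmpestr rows map onto the SSE4.2 string-compare intrinsics, with the
   immediate control byte as the last operand, e.g.

     typedef char v16qi __attribute__ ((vector_size (16)));
     int first_match (v16qi a, v16qi b)
     {
       return __builtin_ia32_pcmpistri128 (a, b, 0x0c);
     }

   The CC*mode rows expose the individual flag results (the ...a/c/o/s/z
   variants) as separate builtins.  */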
27805 /* Special builtins with variable number of arguments. */
27806 static const struct builtin_description bdesc_special_args[] =
27807 {
27808 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
27809 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
27810 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
27812 /* MMX */
27813 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
27815 /* 3DNow! */
27816 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
27818 /* FXSR, XSAVE and XSAVEOPT */
27819 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxsave", IX86_BUILTIN_FXSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID },
27820 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxrstor", IX86_BUILTIN_FXRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID },
27821 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xsave", IX86_BUILTIN_XSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27822 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xrstor", IX86_BUILTIN_XRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27823 { OPTION_MASK_ISA_XSAVEOPT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt", IX86_BUILTIN_XSAVEOPT, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27825 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxsave64", IX86_BUILTIN_FXSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID },
27826 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxrstor64", IX86_BUILTIN_FXRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID },
27827 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsave64", IX86_BUILTIN_XSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27828 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xrstor64", IX86_BUILTIN_XRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27829 { OPTION_MASK_ISA_XSAVEOPT | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt64", IX86_BUILTIN_XSAVEOPT64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27831 /* SSE */
27832 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storeups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
27833 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
27834 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
27836 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
27837 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
27838 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
27839 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
27841 /* SSE or 3DNow!A */
27842 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
27843 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
27845 /* SSE2 */
27846 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
27847 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
27848 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storeupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
27849 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedquv16qi, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
27850 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
27851 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
27852 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
27853 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
27854 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
27855 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddquv16qi, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
27857 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
27858 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
27860 /* SSE3 */
27861 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
27863 /* SSE4.1 */
27864 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
27866 /* SSE4A */
27867 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
27868 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
27870 /* AVX */
27871 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
27872 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
27874 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
27875 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
27876 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
27877 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
27878 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
27880 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
27881 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
27882 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
27883 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
27884 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddquv32qi, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
27885 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedquv32qi, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
27886 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
27888 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
27889 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
27890 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
27892 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
27893 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
27894 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
27895 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
27896 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
27897 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
27898 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
27899 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
27901 /* AVX2 */
27902 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
27903 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
27904 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
27905 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
27906 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
27907 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
27908 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
27909 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
27910 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
27912 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
27913 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
27914 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
27915 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
27916 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
27917 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
27919 /* FSGSBASE */
27920 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
27921 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
27922 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
27923 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
27924 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
27925 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
27926 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
27927 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
27929 /* RTM */
27930 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
27931 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
27932 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
27933 };
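/* Illustration only (a sketch, assuming the expander dispatches on the
   signature code as the table layout suggests): the (int) *_FTYPE_* field
   tells the special-args expander how many operands a row has and which one
   is the memory operand.  For example the __builtin_ia32_loadups row
   (V4SF_FTYPE_PCFLOAT) lets

     typedef float v4sf __attribute__ ((vector_size (16)));
     v4sf load4 (const float *p) { return __builtin_ia32_loadups (p); }

   expand through CODE_FOR_sse_loadups into a single unaligned load.  */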
27935 /* Builtins with variable number of arguments. */
27936 static const struct builtin_description bdesc_args[] =
27937 {
27938 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
27939 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
27940 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
27941 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
27942 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
27943 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
27944 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
27946 /* MMX */
27947 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27948 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27949 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27950 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27951 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27952 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27954 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27955 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27956 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27957 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27958 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27959 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27960 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27961 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27963 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27964 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27966 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27967 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27968 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27969 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27971 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27972 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27973 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27974 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27975 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27976 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27978 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27979 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27980 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27981 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27982 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI},
27983 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI},
27985 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
27986 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
27987 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
27989 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
27991 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27992 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27993 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27994 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27995 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27996 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27998 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27999 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
28000 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
28001 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
28002 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
28003 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
28005 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
28006 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
28007 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
28008 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
28010 /* 3DNow! */
28011 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
28012 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
28013 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
28014 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
28016 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28017 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28018 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28019 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
28020 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
28021 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
28022 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28023 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28024 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28025 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28026 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28027 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28028 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28029 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28030 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28032 /* 3DNow!A */
28033 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
28034 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
28035 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
28036 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
28037 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28038 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28040 /* SSE */
28041 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
28042 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28043 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28044 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28045 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28046 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28047 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
28048 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
28049 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
28050 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
28051 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
28052 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
28054 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28056 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28057 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28058 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28059 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28060 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28061 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28062 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28063 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
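/* cmpps only encodes the eq/lt/le/unord predicates and their negations,
   so the GT/GE builtins below are expanded as LT/LE compares with the
   operands exchanged (the *_SWAP function types).  */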
28065 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
28066 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
28067 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
28068 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
28069 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
28070 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
28071 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
28072 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
28073 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
28074 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
28075 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
28076 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
28077 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
28078 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
28079 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
28080 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
28081 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
28082 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
28083 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
28084 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
28086 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28087 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28088 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28089 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28091 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28092 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28093 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28094 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28096 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28098 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28099 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28100 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28101 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28102 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28104 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
28105 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
28106 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
28108 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
28110 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
28111 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
28112 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
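/* Entries with a null name are not registered from this table; their
   user-visible names are created elsewhere in this file.  */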
28114 { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
28115 { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
28117 /* SSE MMX or 3DNow!A */
28118 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28119 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28120 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28122 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28123 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28124 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28125 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28127 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
28128 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
28130 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
28132 /* SSE2 */
28133 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28135 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
28136 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
28137 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
28138 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
28139 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
28141 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
28142 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
28143 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
28144 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
28145 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
28147 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
28149 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
28150 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
28151 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
28152 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
28154 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_fix_notruncv4sfv4si, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
28155 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
28156 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
28158 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28159 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28160 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28161 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28162 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28163 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28164 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28165 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28167 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
28168 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
28169 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
28170 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
28171 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
28172 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
28173 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
28174 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
28175 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
28176 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
28177 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
28178 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
28179 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
28180 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
28181 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
28182 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
28183 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
28184 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
28185 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
28186 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
28188 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28189 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28190 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28191 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28193 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28194 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28195 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28196 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28198 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28200 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28201 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28202 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28204 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
28206 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28207 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28208 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28209 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28210 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28211 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28212 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28213 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28215 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28216 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28217 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28218 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28219 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28220 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28221 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28222 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28224 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28225 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28227 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28228 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28229 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28230 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28232 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28233 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28235 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28236 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28237 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28238 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28239 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28240 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28242 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28243 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28244 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28245 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28247 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28248 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28249 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28250 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28251 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28252 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28253 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28254 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28256 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
28257 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
28258 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
28260 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28261 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
28263 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
28264 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_even_v4si, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
28266 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
28268 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
28269 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
28270 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
28271 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
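/* The whole-register byte shifts below use the V1TImode patterns; the
   _CONVERT suffix tells the expander to convert between the V2DI type
   of the builtin and the mode of the pattern.  */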
28273 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
28274 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
28275 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
28276 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
28277 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
28278 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
28279 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
28281 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
28282 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
28283 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
28284 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
28285 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
28286 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
28287 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
28289 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
28290 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
28291 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
28292 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
28294 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
28295 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
28296 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
28298 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
28300 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
28302 /* SSE2 MMX */
28303 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
28304 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
28306 /* SSE3 */
28307 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28308 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28310 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28311 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28312 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28313 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28314 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28315 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28317 /* SSSE3 */
28318 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
28319 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
28320 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
28321 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
28322 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
28323 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
28325 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28326 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28327 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28328 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28329 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28330 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28331 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28332 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28333 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28334 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28335 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28336 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28337 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
28338 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
28339 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28340 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28341 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28342 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28343 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28344 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28345 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28346 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28347 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28348 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28350 /* SSSE3. */
28351 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
28352 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
28354 /* SSE4.1 */
28355 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28356 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28357 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
28358 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
28359 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28360 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28361 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28362 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
28363 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
28364 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
28366 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
28367 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
28368 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
28369 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
28370 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
28371 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
28372 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
28373 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
28374 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
28375 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
28376 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
28377 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
28378 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
28380 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
28381 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28382 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28383 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28384 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28385 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28386 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28387 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28388 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28389 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28390 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
28391 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28393 /* SSE4.1 */
28394 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
28395 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
28396 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28397 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
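/* For the floor/ceil/trunc/rint variants the comparison field is reused
   to carry the ROUND_* constant, which supplies the rounding-mode
   immediate of the round{ps,pd} patterns.  */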
28399 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
28400 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
28401 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
28402 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
28404 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
28405 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
28407 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
28408 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
28410 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
28411 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
28412 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
28413 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
28415 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
28416 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
28418 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28419 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
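/* For PTEST the comparison code selects which flag the builtin returns:
   EQ tests ZF (ptestz), LTU tests CF (ptestc) and GTU tests
   ZF = CF = 0 (ptestnzc).  */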
28421 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
28422 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
28423 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
28425 /* SSE4.2 */
28426 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28427 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
28428 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
28429 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28430 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28432 /* SSE4A */
28433 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
28434 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
28435 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
28436 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28438 /* AES */
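/* These and the PCLMUL entry below also have null names; the
   corresponding __builtin_ia32_aes* and __builtin_ia32_pclmulqdq128
   builtins are registered separately under their own AES and PCLMUL
   ISA masks.  */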
28439 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
28440 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
28442 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28443 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28444 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28445 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28447 /* PCLMUL */
28448 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
28450 /* AVX */
28451 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28452 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28453 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28454 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28455 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28456 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28457 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28458 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28459 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28460 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28461 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28462 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28463 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28464 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28465 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28466 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28467 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28468 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28469 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28470 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28471 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28472 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28473 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28474 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28475 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28476 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28478 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
28479 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
28480 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
28481 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
28483 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28484 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28485 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
28486 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
28487 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28488 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28489 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28490 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28491 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28492 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28493 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28494 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28495 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28496 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
28497 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
28498 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
28499 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
28500 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
28501 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
28502 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_fix_notruncv8sfv8si, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
28503 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
28504 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
28505 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
28506 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
28507 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28508 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28509 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
28510 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
28511 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
28512 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
28513 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
28514 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
28515 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
28516 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
28518 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28519 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28520 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
28522 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
28523 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28524 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28525 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28526 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28528 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28530 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
28531 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
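/* In the floor/ceil/trunc/rint entries below the comparison-code field is
   reused to hold the rounding-control constant (ROUND_FLOOR, ROUND_CEIL,
   ROUND_TRUNC or ROUND_MXCSR), and the *_ROUND ftypes mark that extra
   implicit operand.  */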
28533 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
28534 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
28535 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
28536 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
28538 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
28539 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
28541 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
28542 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
28544 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
28545 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
28546 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
28547 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
28549 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
28550 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
28552 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28553 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
28555 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28556 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28557 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28558 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28560 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
28561 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
28562 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
28563 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
28564 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
28565 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
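/* For the vtestp[sd] and ptest entries below the comparison code selects
   which condition of the flags result is returned: EQ for the ...z (ZF set)
   forms, LTU for the ...c (CF set) forms and GTU for the ...nzc (neither
   flag set) forms.  */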
28567 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
28568 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
28569 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
28570 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
28571 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
28572 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
28573 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
28574 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
28575 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
28576 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
28577 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
28578 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
28579 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
28580 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
28581 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
28583 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
28584 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
28586 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28587 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28589 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
28591 /* AVX2 */
28592 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
28593 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
28594 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
28595 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
28596 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
28597 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
28598 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
28599 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
28600 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28601 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28602 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28603 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28604 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28605 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28606 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28607 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28608 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
28609 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28610 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28611 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28612 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28613 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
28614 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
28615 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28616 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28617 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28618 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28619 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28620 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28621 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28622 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28623 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28624 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28625 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28626 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28627 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28628 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28629 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
28630 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
28631 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28632 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28633 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28634 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28635 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28636 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28637 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28638 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28639 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28640 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28641 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28642 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28643 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
28644 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
28645 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
28646 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
28647 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
28648 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
28649 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
28650 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
28651 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
28652 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
28653 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
28654 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
28655 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
28656 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_even_v8si, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
28657 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28658 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28659 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28660 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28661 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28662 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_even_v8si, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
28663 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28664 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
28665 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28666 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
28667 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
28668 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
28669 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28670 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28671 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28672 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
28673 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
28674 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
28675 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
28676 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
28677 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
28678 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
28679 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
28680 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
28681 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
28682 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
28683 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
28684 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
28685 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
28686 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
28687 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
28688 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
28689 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
28690 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28691 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28692 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28693 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28694 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28695 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28696 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28697 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28698 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28699 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28700 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28701 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28702 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28703 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28704 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28705 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28706 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28707 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28708 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
28709 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
28710 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
28711 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
28712 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
28713 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
28714 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
28715 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
28716 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
28717 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
28718 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
28719 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
28720 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
28721 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28722 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
28723 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
28724 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
28725 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
28726 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
28727 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
28728 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28729 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28730 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28731 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28732 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28733 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28734 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28735 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28736 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28737 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28739 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
28741 /* BMI */
28742 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28743 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28744 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
28746 /* TBM */
28747 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28748 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28750 /* F16C */
28751 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
28752 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
28753 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
28754 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
28756 /* BMI2 */
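/* Informal BMI2 semantics, for reference: bzhi (x, n) clears the bits of X
   from position N upwards, e.g. bzhi (0xff, 4) == 0x0f; pdep (x, mask)
   scatters the low-order bits of X into the bit positions selected by MASK,
   e.g. pdep (0x5, 0x1a) == 0x12; pext (x, mask) is the inverse gather,
   e.g. pext (0x12, 0x1a) == 0x5.  */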
28757 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28758 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28759 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28760 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28761 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28762 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28763 };
28765 /* FMA4 and XOP. */
28766 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
28767 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
28768 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
28769 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
28770 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
28771 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
28772 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
28773 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
28774 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
28775 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
28776 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
28777 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
28778 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
28779 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
28780 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
28781 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
28782 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
28783 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
28784 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
28785 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
28786 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
28787 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
28788 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
28789 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
28790 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
28791 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
28792 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
28793 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
28794 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
28795 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
28796 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
28797 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
28798 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
28799 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
28800 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
28801 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
28802 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
28803 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
28804 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
28805 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
28806 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
28807 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
28808 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
28809 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
28810 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
28811 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
28812 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
28813 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
28814 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
28815 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
28816 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
28817 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
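/* The MULTI_ARG_* names above are shorthand for the ix86 FTYPE enumerators
   used by bdesc_multi_arg below: the digit gives the argument count, the
   letters name the vector modes involved (a trailing 2 marks the 256-bit
   form), and the IMM/CMP/TF variants take an extra immediate, comparison
   code or PCOM true/false code.  */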
28819 static const struct builtin_description bdesc_multi_arg[] =
28820 {
28821 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
28822 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
28823 UNKNOWN, (int)MULTI_ARG_3_SF },
28824 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
28825 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
28826 UNKNOWN, (int)MULTI_ARG_3_DF },
28828 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
28829 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
28830 UNKNOWN, (int)MULTI_ARG_3_SF },
28831 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
28832 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
28833 UNKNOWN, (int)MULTI_ARG_3_DF },
28835 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
28836 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
28837 UNKNOWN, (int)MULTI_ARG_3_SF },
28838 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
28839 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
28840 UNKNOWN, (int)MULTI_ARG_3_DF },
28841 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
28842 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
28843 UNKNOWN, (int)MULTI_ARG_3_SF2 },
28844 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
28845 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
28846 UNKNOWN, (int)MULTI_ARG_3_DF2 },
28848 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
28849 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
28850 UNKNOWN, (int)MULTI_ARG_3_SF },
28851 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
28852 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
28853 UNKNOWN, (int)MULTI_ARG_3_DF },
28854 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
28855 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
28856 UNKNOWN, (int)MULTI_ARG_3_SF2 },
28857 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
28858 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
28859 UNKNOWN, (int)MULTI_ARG_3_DF2 },
28861 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
28862 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
28863 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
28864 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
28865 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi",IX86_BUILTIN_VPCMOV_V16QI,UNKNOWN, (int)MULTI_ARG_3_QI },
28866 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
28867 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
28869 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
28870 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
28871 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
28872 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
28873 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
28874 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
28875 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
28877 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
28879 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
28880 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
28881 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
28882 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
28883 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
28884 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
28885 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
28886 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
28887 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
28888 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
28889 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
28890 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
28892 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
28893 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
28894 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
28895 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
28896 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
28897 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
28898 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
28899 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
28900 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
28901 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
28902 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
28903 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
28904 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
28905 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
28906 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
28907 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
28909 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
28910 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
28911 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
28912 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
28913 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
28914 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
28916 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
28917 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
28918 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
28919 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
28920 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
28921 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
28922 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
28923 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
28924 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
28925 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
28926 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
28927 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
28928 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
28929 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
28930 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
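/* In the vpcom* entries below the rtx comparison code (EQ/NE/LT/LE/GT/GE,
   or LTU/LEU/GTU/GEU for the unsigned forms) selects the condition that the
   xop_maskcmp* patterns encode as the VPCOM immediate.  */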
28932 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
28933 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
28934 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
28935 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
28936 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
28937 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
28938 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
28940 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
28941 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
28942 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
28943 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
28944 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
28945 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
28946 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
28948 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
28949 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
28950 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
28951 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
28952 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
28953 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
28954 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
28956 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
28957 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
28958 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
28959 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
28960 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
28961 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
28962 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
28964 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
28965 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
28966 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
28967 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
28968 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
28969 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
28970 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
28972 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
28973 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
28974 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
28975 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
28976 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
28977 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
28978 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
28980 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
28981 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
28982 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
28983 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
28984 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
28985 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
28986 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
28988 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
28989 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28990 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28991 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
28992 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
28993 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
28994 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
28996 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28997 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28998 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28999 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
29000 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
29001 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
29002 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
29003 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
29005 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
29006 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
29007 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
29008 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
29009 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
29010 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
29011 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
29012 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
29014 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
29015 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
29016 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
29017 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
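/* Illustrative sketch (hedged; not part of this table): the XOP comparison
   builtins above are what the xopintrin.h wrappers expand to, e.g. roughly

     __m128i
     my_comlt_epi16 (__m128i a, __m128i b)
     {
       return (__m128i) __builtin_ia32_vpcomltw ((__v8hi) a, (__v8hi) b);
     }

   Each result lane is all-ones when the signed comparison holds and zero
   otherwise; the "u"-suffixed variants use the unsigned insn patterns.  */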
29021 /* TM vector builtins. */
29023 /* Reuse the existing x86-specific `struct builtin_description' because
29024 we're lazy. Add casts to make them fit. */
29025 static const struct builtin_description bdesc_tm[] =
29027 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
29028 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
29029 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
29030 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
29031 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
29032 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
29033 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
29035 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
29036 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
29037 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
29038 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
29039 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
29040 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
29041 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
29043 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
29044 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
29045 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
29046 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
29047 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
29048 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
29049 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
29051 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
29052 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
29053 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
29056 /* TM callbacks. */
29058 /* Return the builtin decl needed to load a vector of TYPE. */
29060 static tree
29061 ix86_builtin_tm_load (tree type)
29063 if (TREE_CODE (type) == VECTOR_TYPE)
29065 switch (tree_low_cst (TYPE_SIZE (type), 1))
29067 case 64:
29068 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
29069 case 128:
29070 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
29071 case 256:
29072 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
29075 return NULL_TREE;
29078 /* Return the builtin decl needed to store a vector of TYPE. */
29080 static tree
29081 ix86_builtin_tm_store (tree type)
29083 if (TREE_CODE (type) == VECTOR_TYPE)
29085 switch (tree_low_cst (TYPE_SIZE (type), 1))
29087 case 64:
29088 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
29089 case 128:
29090 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
29091 case 256:
29092 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
29095 return NULL_TREE;
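/* Illustrative sketch (hypothetical caller, not part of this file) of how
   the two helpers above pick a decl by the vector size in bits:

     tree v4sf = build_vector_type (float_type_node, 4);   -- 128 bits
     tree load_fn  = ix86_builtin_tm_load (v4sf);
     tree store_fn = ix86_builtin_tm_store (v4sf);

   These return the decls registered for __builtin__ITM_RM128 and
   __builtin__ITM_WM128 respectively; non-vector or odd-sized types fall
   through and yield NULL_TREE, leaving the generic TM lowering in charge.  */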
29098 /* Initialize the transactional memory vector load/store builtins. */
29100 static void
29101 ix86_init_tm_builtins (void)
29103 enum ix86_builtin_func_type ftype;
29104 const struct builtin_description *d;
29105 size_t i;
29106 tree decl;
29107 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
29108 tree attrs_log, attrs_type_log;
29110 if (!flag_tm)
29111 return;
29113 /* If there are no builtins defined, we must be compiling in a
29114 language without trans-mem support. */
29115 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
29116 return;
29118 /* Use whatever attributes a normal TM load has. */
29119 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
29120 attrs_load = DECL_ATTRIBUTES (decl);
29121 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
29122 /* Use whatever attributes a normal TM store has. */
29123 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
29124 attrs_store = DECL_ATTRIBUTES (decl);
29125 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
29126 /* Use whatever attributes a normal TM log has. */
29127 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
29128 attrs_log = DECL_ATTRIBUTES (decl);
29129 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
29131 for (i = 0, d = bdesc_tm;
29132 i < ARRAY_SIZE (bdesc_tm);
29133 i++, d++)
29135 if ((d->mask & ix86_isa_flags) != 0
29136 || (lang_hooks.builtin_function
29137 == lang_hooks.builtin_function_ext_scope))
29139 tree type, attrs, attrs_type;
29140 enum built_in_function code = (enum built_in_function) d->code;
29142 ftype = (enum ix86_builtin_func_type) d->flag;
29143 type = ix86_get_builtin_func_type (ftype);
29145 if (BUILTIN_TM_LOAD_P (code))
29147 attrs = attrs_load;
29148 attrs_type = attrs_type_load;
29150 else if (BUILTIN_TM_STORE_P (code))
29152 attrs = attrs_store;
29153 attrs_type = attrs_type_store;
29155 else
29157 attrs = attrs_log;
29158 attrs_type = attrs_type_log;
29160 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
29161 /* The builtin without the prefix for
29162 calling it directly. */
29163 d->name + strlen ("__builtin_"),
29164 attrs);
29165 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
29166 set the TYPE_ATTRIBUTES. */
29167 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
29169 set_builtin_decl (code, decl, false);
29174 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
29175 in the current target ISA, so that the user can compile particular modules
29176 with target-specific options that differ from the command-line
29177 options. */
29178 static void
29179 ix86_init_mmx_sse_builtins (void)
29181 const struct builtin_description * d;
29182 enum ix86_builtin_func_type ftype;
29183 size_t i;
29185 /* Add all special builtins with variable number of operands. */
29186 for (i = 0, d = bdesc_special_args;
29187 i < ARRAY_SIZE (bdesc_special_args);
29188 i++, d++)
29190 if (d->name == 0)
29191 continue;
29193 ftype = (enum ix86_builtin_func_type) d->flag;
29194 def_builtin (d->mask, d->name, ftype, d->code);
29197 /* Add all builtins with variable number of operands. */
29198 for (i = 0, d = bdesc_args;
29199 i < ARRAY_SIZE (bdesc_args);
29200 i++, d++)
29202 if (d->name == 0)
29203 continue;
29205 ftype = (enum ix86_builtin_func_type) d->flag;
29206 def_builtin_const (d->mask, d->name, ftype, d->code);
29209 /* pcmpestr[im] insns. */
29210 for (i = 0, d = bdesc_pcmpestr;
29211 i < ARRAY_SIZE (bdesc_pcmpestr);
29212 i++, d++)
29214 if (d->code == IX86_BUILTIN_PCMPESTRM128)
29215 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
29216 else
29217 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
29218 def_builtin_const (d->mask, d->name, ftype, d->code);
29221 /* pcmpistr[im] insns. */
29222 for (i = 0, d = bdesc_pcmpistr;
29223 i < ARRAY_SIZE (bdesc_pcmpistr);
29224 i++, d++)
29226 if (d->code == IX86_BUILTIN_PCMPISTRM128)
29227 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
29228 else
29229 ftype = INT_FTYPE_V16QI_V16QI_INT;
29230 def_builtin_const (d->mask, d->name, ftype, d->code);
29233 /* comi/ucomi insns. */
29234 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
29236 if (d->mask == OPTION_MASK_ISA_SSE2)
29237 ftype = INT_FTYPE_V2DF_V2DF;
29238 else
29239 ftype = INT_FTYPE_V4SF_V4SF;
29240 def_builtin_const (d->mask, d->name, ftype, d->code);
29243 /* SSE */
29244 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
29245 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
29246 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
29247 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
29249 /* SSE or 3DNow!A */
29250 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
29251 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
29252 IX86_BUILTIN_MASKMOVQ);
29254 /* SSE2 */
29255 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
29256 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
29258 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
29259 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
29260 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
29261 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
29263 /* SSE3. */
29264 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
29265 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
29266 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
29267 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
29269 /* AES */
29270 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
29271 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
29272 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
29273 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
29274 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
29275 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
29276 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
29277 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
29278 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
29279 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
29280 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
29281 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
29283 /* PCLMUL */
29284 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
29285 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
29287 /* RDRND */
29288 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
29289 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
29290 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
29291 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
29292 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
29293 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
29294 IX86_BUILTIN_RDRAND64_STEP);
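/* Illustrative use (hypothetical user code, not part of this file) of the
   RDRND step builtins registered above:

     unsigned int r;
     while (!__builtin_ia32_rdrand32_step (&r))
       ;   -- retry until the hardware RNG produces a value

   The builtin stores the random number through the pointer and returns
   nonzero on success, which is what the immintrin.h _rdrand32_step
   wrapper relies on.  */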
29296 /* AVX2 */
29297 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
29298 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
29299 IX86_BUILTIN_GATHERSIV2DF);
29301 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
29302 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
29303 IX86_BUILTIN_GATHERSIV4DF);
29305 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
29306 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
29307 IX86_BUILTIN_GATHERDIV2DF);
29309 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
29310 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
29311 IX86_BUILTIN_GATHERDIV4DF);
29313 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
29314 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
29315 IX86_BUILTIN_GATHERSIV4SF);
29317 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
29318 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
29319 IX86_BUILTIN_GATHERSIV8SF);
29321 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
29322 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
29323 IX86_BUILTIN_GATHERDIV4SF);
29325 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
29326 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
29327 IX86_BUILTIN_GATHERDIV8SF);
29329 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
29330 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
29331 IX86_BUILTIN_GATHERSIV2DI);
29333 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
29334 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
29335 IX86_BUILTIN_GATHERSIV4DI);
29337 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
29338 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
29339 IX86_BUILTIN_GATHERDIV2DI);
29341 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
29342 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
29343 IX86_BUILTIN_GATHERDIV4DI);
29345 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
29346 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
29347 IX86_BUILTIN_GATHERSIV4SI);
29349 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
29350 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
29351 IX86_BUILTIN_GATHERSIV8SI);
29353 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
29354 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
29355 IX86_BUILTIN_GATHERDIV4SI);
29357 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
29358 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
29359 IX86_BUILTIN_GATHERDIV8SI);
29361 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df",
29362 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
29363 IX86_BUILTIN_GATHERALTSIV4DF);
29365 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256",
29366 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
29367 IX86_BUILTIN_GATHERALTDIV8SF);
29369 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di",
29370 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
29371 IX86_BUILTIN_GATHERALTSIV4DI);
29373 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256",
29374 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
29375 IX86_BUILTIN_GATHERALTDIV8SI);
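/* Illustrative mapping (hedged; see avx2intrin.h for the exact wrappers,
   none of this is part of this file): the gather builtins above take
   (source, base pointer, index vector, mask, scale), e.g. roughly

     __m128 r = (__m128) __builtin_ia32_gathersiv4sf ((__v4sf) src, base,
                                                      (__v4si) idx,
                                                      (__v4sf) mask, 4);

   with 4 being the element scale; the "alt" variants above are meant for
   the vectorizer via the builtin_gather target hook.  */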
29377 /* RTM. */
29378 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
29379 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
29381 /* MMX access to the vec_init patterns. */
29382 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
29383 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
29385 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
29386 V4HI_FTYPE_HI_HI_HI_HI,
29387 IX86_BUILTIN_VEC_INIT_V4HI);
29389 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
29390 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
29391 IX86_BUILTIN_VEC_INIT_V8QI);
29393 /* Access to the vec_extract patterns. */
29394 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
29395 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
29396 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
29397 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
29398 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
29399 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
29400 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
29401 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
29402 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
29403 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
29405 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
29406 "__builtin_ia32_vec_ext_v4hi",
29407 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
29409 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
29410 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
29412 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
29413 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
29415 /* Access to the vec_set patterns. */
29416 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
29417 "__builtin_ia32_vec_set_v2di",
29418 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
29420 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
29421 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
29423 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
29424 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
29426 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
29427 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
29429 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
29430 "__builtin_ia32_vec_set_v4hi",
29431 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
29433 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
29434 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
29436 /* RDSEED */
29437 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
29438 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
29439 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
29440 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
29441 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
29442 "__builtin_ia32_rdseed_di_step",
29443 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
29445 /* ADCX */
29446 def_builtin (0, "__builtin_ia32_addcarryx_u32",
29447 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
29448 def_builtin (OPTION_MASK_ISA_64BIT,
29449 "__builtin_ia32_addcarryx_u64",
29450 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
29451 IX86_BUILTIN_ADDCARRYX64);
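/* Illustrative use (hypothetical user code, not part of this file) of the
   add-with-carry builtins above, chaining a 64-bit addition out of two
   32-bit limbs:

     unsigned int lo, hi;
     unsigned char c = __builtin_ia32_addcarryx_u32 (0, a0, b0, &lo);
     (void) __builtin_ia32_addcarryx_u32 (c, a1, b1, &hi);

   The return value is the carry out of each step and feeds the carry-in
   of the next one.  */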
29453 /* Add FMA4 multi-argument instructions.  */
29454 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
29456 if (d->name == 0)
29457 continue;
29459 ftype = (enum ix86_builtin_func_type) d->flag;
29460 def_builtin_const (d->mask, d->name, ftype, d->code);
29464 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
29465 to return a pointer to VERSION_DECL if the outcome of the expression
29466 formed by PREDICATE_CHAIN is true. This function will be called during
29467 version dispatch to decide which function version to execute. It returns
29468 the basic block at the end, to which more conditions can be added. */
29470 static basic_block
29471 add_condition_to_bb (tree function_decl, tree version_decl,
29472 tree predicate_chain, basic_block new_bb)
29474 gimple return_stmt;
29475 tree convert_expr, result_var;
29476 gimple convert_stmt;
29477 gimple call_cond_stmt;
29478 gimple if_else_stmt;
29480 basic_block bb1, bb2, bb3;
29481 edge e12, e23;
29483 tree cond_var, and_expr_var = NULL_TREE;
29484 gimple_seq gseq;
29486 tree predicate_decl, predicate_arg;
29488 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
29490 gcc_assert (new_bb != NULL);
29491 gseq = bb_seq (new_bb);
29494 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
29495 build_fold_addr_expr (version_decl));
29496 result_var = create_tmp_var (ptr_type_node, NULL);
29497 convert_stmt = gimple_build_assign (result_var, convert_expr);
29498 return_stmt = gimple_build_return (result_var);
29500 if (predicate_chain == NULL_TREE)
29502 gimple_seq_add_stmt (&gseq, convert_stmt);
29503 gimple_seq_add_stmt (&gseq, return_stmt);
29504 set_bb_seq (new_bb, gseq);
29505 gimple_set_bb (convert_stmt, new_bb);
29506 gimple_set_bb (return_stmt, new_bb);
29507 pop_cfun ();
29508 return new_bb;
29511 while (predicate_chain != NULL)
29513 cond_var = create_tmp_var (integer_type_node, NULL);
29514 predicate_decl = TREE_PURPOSE (predicate_chain);
29515 predicate_arg = TREE_VALUE (predicate_chain);
29516 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
29517 gimple_call_set_lhs (call_cond_stmt, cond_var);
29519 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
29520 gimple_set_bb (call_cond_stmt, new_bb);
29521 gimple_seq_add_stmt (&gseq, call_cond_stmt);
29523 predicate_chain = TREE_CHAIN (predicate_chain);
29525 if (and_expr_var == NULL)
29526 and_expr_var = cond_var;
29527 else
29529 gimple assign_stmt;
29530 /* Use MIN_EXPR to check whether any integer is zero:
29531 and_expr_var = min_expr <cond_var, and_expr_var> */
29532 assign_stmt = gimple_build_assign (and_expr_var,
29533 build2 (MIN_EXPR, integer_type_node,
29534 cond_var, and_expr_var));
29536 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
29537 gimple_set_bb (assign_stmt, new_bb);
29538 gimple_seq_add_stmt (&gseq, assign_stmt);
29542 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
29543 integer_zero_node,
29544 NULL_TREE, NULL_TREE);
29545 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
29546 gimple_set_bb (if_else_stmt, new_bb);
29547 gimple_seq_add_stmt (&gseq, if_else_stmt);
29549 gimple_seq_add_stmt (&gseq, convert_stmt);
29550 gimple_seq_add_stmt (&gseq, return_stmt);
29551 set_bb_seq (new_bb, gseq);
29553 bb1 = new_bb;
29554 e12 = split_block (bb1, if_else_stmt);
29555 bb2 = e12->dest;
29556 e12->flags &= ~EDGE_FALLTHRU;
29557 e12->flags |= EDGE_TRUE_VALUE;
29559 e23 = split_block (bb2, return_stmt);
29561 gimple_set_bb (convert_stmt, bb2);
29562 gimple_set_bb (return_stmt, bb2);
29564 bb3 = e23->dest;
29565 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
29567 remove_edge (e23);
29568 make_edge (bb2, EXIT_BLOCK_PTR, 0);
29570 pop_cfun ();
29572 return bb3;
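/* Informal sketch (illustration only, not generated text) of what one call
   to add_condition_to_bb appends for a version V guarded by predicates
   P1 and P2:

     c1 = P1 (arg1);
     c2 = P2 (arg2);
     c  = MIN_EXPR <c2, c1>;      -- zero iff any predicate failed
     if (c > 0)
       return (void *) &V;
     -- control continues in the block returned, where the test for the
     -- next version is emitted.  */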
29575 /* This parses the arguments of the target attribute in DECL and determines
29576 the right builtin to use to match the platform specification.
29577 It returns the priority value for this version decl. If PREDICATE_LIST
29578 is not NULL, it stores the list of cpu features that need to be checked
29579 before dispatching this function. */
29581 static unsigned int
29582 get_builtin_code_for_version (tree decl, tree *predicate_list)
29584 tree attrs;
29585 struct cl_target_option cur_target;
29586 tree target_node;
29587 struct cl_target_option *new_target;
29588 const char *arg_str = NULL;
29589 const char *attrs_str = NULL;
29590 char *tok_str = NULL;
29591 char *token;
29593 /* Priority of i386 features, greater value is higher priority. This is
29594 used to decide the order in which function dispatch must happen. For
29595 instance, a version specialized for SSE4.2 should be checked for dispatch
29596 before a version for SSE3, as SSE4.2 implies SSE3. */
29597 enum feature_priority
29599 P_ZERO = 0,
29600 P_MMX,
29601 P_SSE,
29602 P_SSE2,
29603 P_SSE3,
29604 P_SSSE3,
29605 P_PROC_SSSE3,
29606 P_SSE4_a,
29607 P_PROC_SSE4_a,
29608 P_SSE4_1,
29609 P_SSE4_2,
29610 P_PROC_SSE4_2,
29611 P_POPCNT,
29612 P_AVX,
29613 P_AVX2,
29614 P_FMA,
29615 P_PROC_FMA
29618 enum feature_priority priority = P_ZERO;
29620 /* These are the target attribute strings for which a dispatcher is
29621 available, from fold_builtin_cpu. */
29623 static struct _feature_list
29625 const char *const name;
29626 const enum feature_priority priority;
29628 const feature_list[] =
29630 {"mmx", P_MMX},
29631 {"sse", P_SSE},
29632 {"sse2", P_SSE2},
29633 {"sse3", P_SSE3},
29634 {"ssse3", P_SSSE3},
29635 {"sse4.1", P_SSE4_1},
29636 {"sse4.2", P_SSE4_2},
29637 {"popcnt", P_POPCNT},
29638 {"avx", P_AVX},
29639 {"avx2", P_AVX2}
29643 static unsigned int NUM_FEATURES
29644 = sizeof (feature_list) / sizeof (struct _feature_list);
29646 unsigned int i;
29648 tree predicate_chain = NULL_TREE;
29649 tree predicate_decl, predicate_arg;
29651 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29652 gcc_assert (attrs != NULL);
29654 attrs = TREE_VALUE (TREE_VALUE (attrs));
29656 gcc_assert (TREE_CODE (attrs) == STRING_CST);
29657 attrs_str = TREE_STRING_POINTER (attrs);
29659 /* Return priority zero for default function. */
29660 if (strcmp (attrs_str, "default") == 0)
29661 return 0;
29663 /* Handle arch= if specified. For priority, set it to be 1 more than
29664 the best instruction set the processor can handle. For instance, if
29665 there is a version for atom and a version for ssse3 (the highest ISA
29666 priority for atom), the atom version must be checked for dispatch
29667 before the ssse3 version. */
29668 if (strstr (attrs_str, "arch=") != NULL)
29670 cl_target_option_save (&cur_target, &global_options);
29671 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
29672 &global_options_set);
29674 gcc_assert (target_node);
29675 new_target = TREE_TARGET_OPTION (target_node);
29676 gcc_assert (new_target);
29678 if (new_target->arch_specified && new_target->arch > 0)
29680 switch (new_target->arch)
29682 case PROCESSOR_CORE2:
29683 arg_str = "core2";
29684 priority = P_PROC_SSSE3;
29685 break;
29686 case PROCESSOR_COREI7:
29687 arg_str = "corei7";
29688 priority = P_PROC_SSE4_2;
29689 break;
29690 case PROCESSOR_ATOM:
29691 arg_str = "atom";
29692 priority = P_PROC_SSSE3;
29693 break;
29694 case PROCESSOR_AMDFAM10:
29695 arg_str = "amdfam10h";
29696 priority = P_PROC_SSE4_a;
29697 break;
29698 case PROCESSOR_BDVER1:
29699 arg_str = "bdver1";
29700 priority = P_PROC_FMA;
29701 break;
29702 case PROCESSOR_BDVER2:
29703 arg_str = "bdver2";
29704 priority = P_PROC_FMA;
29705 break;
29709 cl_target_option_restore (&global_options, &cur_target);
29711 if (predicate_list && arg_str == NULL)
29713 error_at (DECL_SOURCE_LOCATION (decl),
29714 "No dispatcher found for the versioning attributes");
29715 return 0;
29718 if (predicate_list)
29720 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
29721 /* For a C string literal the length includes the trailing NULL. */
29722 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
29723 predicate_chain = tree_cons (predicate_decl, predicate_arg,
29724 predicate_chain);
29728 /* Process feature name. */
29729 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
29730 strcpy (tok_str, attrs_str);
29731 token = strtok (tok_str, ",");
29732 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
29734 while (token != NULL)
29736 /* Do not process "arch=" */
29737 if (strncmp (token, "arch=", 5) == 0)
29739 token = strtok (NULL, ",");
29740 continue;
29742 for (i = 0; i < NUM_FEATURES; ++i)
29744 if (strcmp (token, feature_list[i].name) == 0)
29746 if (predicate_list)
29748 predicate_arg = build_string_literal (
29749 strlen (feature_list[i].name) + 1,
29750 feature_list[i].name);
29751 predicate_chain = tree_cons (predicate_decl, predicate_arg,
29752 predicate_chain);
29754 /* Find the maximum priority feature. */
29755 if (feature_list[i].priority > priority)
29756 priority = feature_list[i].priority;
29758 break;
29761 if (predicate_list && i == NUM_FEATURES)
29763 error_at (DECL_SOURCE_LOCATION (decl),
29764 "No dispatcher found for %s", token);
29765 return 0;
29767 token = strtok (NULL, ",");
29769 free (tok_str);
29771 if (predicate_list && predicate_chain == NULL_TREE)
29773 error_at (DECL_SOURCE_LOCATION (decl),
29774 "No dispatcher found for the versioning attributes : %s",
29775 attrs_str);
29776 return 0;
29778 else if (predicate_list)
29780 predicate_chain = nreverse (predicate_chain);
29781 *predicate_list = predicate_chain;
29784 return priority;
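/* Worked example (illustrative, not part of this file) for the routine
   above:

     __attribute__ ((target ("arch=corei7"))) int f (void);
       -> arg_str "corei7", priority P_PROC_SSE4_2,
          predicate chain { __builtin_cpu_is ("corei7") }

     __attribute__ ((target ("avx,popcnt"))) int f (void);
       -> priority P_AVX (the higher of the two features),
          predicate chain { __builtin_cpu_supports ("avx"),
                            __builtin_cpu_supports ("popcnt") }  */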
29787 /* This compares the priority of target features in function DECL1
29788 and DECL2. It returns positive value if DECL1 is higher priority,
29789 negative value if DECL2 is higher priority and 0 if they are the
29790 same. */
29792 static int
29793 ix86_compare_version_priority (tree decl1, tree decl2)
29795 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
29796 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
29798 return (int)priority1 - (int)priority2;
29801 /* V1 and V2 point to function versions with different priorities
29802 based on the target ISA. This function compares their priorities. */
29804 static int
29805 feature_compare (const void *v1, const void *v2)
29807 typedef struct _function_version_info
29809 tree version_decl;
29810 tree predicate_chain;
29811 unsigned int dispatch_priority;
29812 } function_version_info;
29814 const function_version_info c1 = *(const function_version_info *)v1;
29815 const function_version_info c2 = *(const function_version_info *)v2;
29816 return (c2.dispatch_priority - c1.dispatch_priority);
29819 /* This function generates the dispatch function for
29820 multi-versioned functions. DISPATCH_DECL is the function which will
29821 contain the dispatch logic. FNDECLS are the function choices for
29822 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
29823 in DISPATCH_DECL in which the dispatch code is generated. */
29825 static int
29826 dispatch_function_versions (tree dispatch_decl,
29827 void *fndecls_p,
29828 basic_block *empty_bb)
29830 tree default_decl;
29831 gimple ifunc_cpu_init_stmt;
29832 gimple_seq gseq;
29833 int ix;
29834 tree ele;
29835 vec<tree> *fndecls;
29836 unsigned int num_versions = 0;
29837 unsigned int actual_versions = 0;
29838 unsigned int i;
29840 struct _function_version_info
29842 tree version_decl;
29843 tree predicate_chain;
29844 unsigned int dispatch_priority;
29845 }*function_version_info;
29847 gcc_assert (dispatch_decl != NULL
29848 && fndecls_p != NULL
29849 && empty_bb != NULL);
29851 /* fndecls_p is actually a vector.  */
29852 fndecls = static_cast<vec<tree> *> (fndecls_p);
29854 /* At least one more version other than the default. */
29855 num_versions = fndecls->length ();
29856 gcc_assert (num_versions >= 2);
29858 function_version_info = (struct _function_version_info *)
29859 XNEWVEC (struct _function_version_info, (num_versions - 1));
29861 /* The first version in the vector is the default decl. */
29862 default_decl = (*fndecls)[0];
29864 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
29866 gseq = bb_seq (*empty_bb);
29867 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
29868 constructors, so explicitly call __builtin_cpu_init here. */
29869 ifunc_cpu_init_stmt = gimple_build_call_vec (
29870 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
29871 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
29872 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
29873 set_bb_seq (*empty_bb, gseq);
29875 pop_cfun ();
29878 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
29880 tree version_decl = ele;
29881 tree predicate_chain = NULL_TREE;
29882 unsigned int priority;
29883 /* Get attribute string, parse it and find the right predicate decl.
29884 The predicate function could be a lengthy combination of many
29885 features, like arch-type and various isa-variants. */
29886 priority = get_builtin_code_for_version (version_decl,
29887 &predicate_chain);
29889 if (predicate_chain == NULL_TREE)
29890 continue;
29892 function_version_info [actual_versions].version_decl = version_decl;
29893 function_version_info [actual_versions].predicate_chain
29894 = predicate_chain;
29895 function_version_info [actual_versions].dispatch_priority = priority;
29896 actual_versions++;
29899 /* Sort the versions according to descending order of dispatch priority. The
29900 priority is based on the ISA. This is not a perfect solution. There
29901 could still be ambiguity. If more than one function version is suitable
29902 to execute, which one should be dispatched? In the future, allow the user
29903 to specify a dispatch priority next to the version. */
29904 qsort (function_version_info, actual_versions,
29905 sizeof (struct _function_version_info), feature_compare);
29907 for (i = 0; i < actual_versions; ++i)
29908 *empty_bb = add_condition_to_bb (dispatch_decl,
29909 function_version_info[i].version_decl,
29910 function_version_info[i].predicate_chain,
29911 *empty_bb);
29913 /* Dispatch the default version at the end.  */
29914 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
29915 NULL, *empty_bb);
29917 free (function_version_info);
29918 return 0;
29921 /* Comparator function to be used in qsort routine to sort attribute
29922 specification strings to "target". */
29924 static int
29925 attr_strcmp (const void *v1, const void *v2)
29927 const char *c1 = *(char *const*)v1;
29928 const char *c2 = *(char *const*)v2;
29929 return strcmp (c1, c2);
29932 /* ARGLIST is the argument to target attribute. This function tokenizes
29933 the comma separated arguments, sorts them and returns a string which
29934 is a unique identifier for the comma separated arguments. It also
29935 replaces non-identifier characters "=,-" with "_". */
29937 static char *
29938 sorted_attr_string (tree arglist)
29940 tree arg;
29941 size_t str_len_sum = 0;
29942 char **args = NULL;
29943 char *attr_str, *ret_str;
29944 char *attr = NULL;
29945 unsigned int argnum = 1;
29946 unsigned int i;
29948 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
29950 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
29951 size_t len = strlen (str);
29952 str_len_sum += len + 1;
29953 if (arg != arglist)
29954 argnum++;
29955 for (i = 0; i < strlen (str); i++)
29956 if (str[i] == ',')
29957 argnum++;
29960 attr_str = XNEWVEC (char, str_len_sum);
29961 str_len_sum = 0;
29962 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
29964 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
29965 size_t len = strlen (str);
29966 memcpy (attr_str + str_len_sum, str, len);
29967 attr_str[str_len_sum + len] = TREE_CHAIN (arg) ? ',' : '\0';
29968 str_len_sum += len + 1;
29971 /* Replace "=,-" with "_". */
29972 for (i = 0; i < strlen (attr_str); i++)
29973 if (attr_str[i] == '=' || attr_str[i]== '-')
29974 attr_str[i] = '_';
29976 if (argnum == 1)
29977 return attr_str;
29979 args = XNEWVEC (char *, argnum);
29981 i = 0;
29982 attr = strtok (attr_str, ",");
29983 while (attr != NULL)
29985 args[i] = attr;
29986 i++;
29987 attr = strtok (NULL, ",");
29990 qsort (args, argnum, sizeof (char *), attr_strcmp);
29992 ret_str = XNEWVEC (char, str_len_sum);
29993 str_len_sum = 0;
29994 for (i = 0; i < argnum; i++)
29996 size_t len = strlen (args[i]);
29997 memcpy (ret_str + str_len_sum, args[i], len);
29998 ret_str[str_len_sum + len] = i < argnum - 1 ? '_' : '\0';
29999 str_len_sum += len + 1;
30002 XDELETEVEC (args);
30003 XDELETEVEC (attr_str);
30004 return ret_str;
30007 /* This function changes the assembler name for functions that are
30008 versions. If DECL is a function version and has a "target"
30009 attribute, it appends the attribute string to its assembler name. */
30011 static tree
30012 ix86_mangle_function_version_assembler_name (tree decl, tree id)
30014 tree version_attr;
30015 const char *orig_name, *version_string;
30016 char *attr_str, *assembler_name;
30018 if (DECL_DECLARED_INLINE_P (decl)
30019 && lookup_attribute ("gnu_inline",
30020 DECL_ATTRIBUTES (decl)))
30021 error_at (DECL_SOURCE_LOCATION (decl),
30022 "Function versions cannot be marked as gnu_inline,"
30023 " bodies have to be generated");
30025 if (DECL_VIRTUAL_P (decl)
30026 || DECL_VINDEX (decl))
30027 sorry ("Virtual function multiversioning not supported");
30029 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
30031 /* The target attribute string cannot be NULL. */
30032 gcc_assert (version_attr != NULL_TREE);
30034 orig_name = IDENTIFIER_POINTER (id);
30035 version_string
30036 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
30038 if (strcmp (version_string, "default") == 0)
30039 return id;
30041 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
30042 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
30044 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
30046 /* Allow assembler name to be modified if already set. */
30047 if (DECL_ASSEMBLER_NAME_SET_P (decl))
30048 SET_DECL_RTL (decl, NULL);
30050 tree ret = get_identifier (assembler_name);
30051 XDELETEVEC (attr_str);
30052 XDELETEVEC (assembler_name);
30053 return ret;
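/* Example (illustrative, not part of this file): for a version declared as

     __attribute__ ((target ("avx,arch=corei7"))) int foo (void);

   sorted_attr_string turns the attribute string into "arch_corei7_avx"
   ('=' becomes '_', tokens are sorted and joined with '_'), so the
   assembler name produced above is "foo.arch_corei7_avx"; the "default"
   version keeps its original assembler name.  */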
30056 /* This function returns true if FN1 and FN2 are versions of the same function,
30057 that is, the target strings of the function decls are different. This assumes
30058 that FN1 and FN2 have the same signature. */
30060 static bool
30061 ix86_function_versions (tree fn1, tree fn2)
30063 tree attr1, attr2;
30064 char *target1, *target2;
30065 bool result;
30067 if (TREE_CODE (fn1) != FUNCTION_DECL
30068 || TREE_CODE (fn2) != FUNCTION_DECL)
30069 return false;
30071 attr1 = lookup_attribute ("target", DECL_ATTRIBUTES (fn1));
30072 attr2 = lookup_attribute ("target", DECL_ATTRIBUTES (fn2));
30074 /* At least one function decl should have the target attribute specified. */
30075 if (attr1 == NULL_TREE && attr2 == NULL_TREE)
30076 return false;
30078 /* Diagnose missing target attribute if one of the decls is already
30079 multi-versioned. */
30080 if (attr1 == NULL_TREE || attr2 == NULL_TREE)
30082 if (DECL_FUNCTION_VERSIONED (fn1) || DECL_FUNCTION_VERSIONED (fn2))
30084 if (attr2 != NULL_TREE)
30086 tree tem = fn1;
30087 fn1 = fn2;
30088 fn2 = tem;
30089 attr1 = attr2;
30091 error_at (DECL_SOURCE_LOCATION (fn2),
30092 "missing %<target%> attribute for multi-versioned %D",
30093 fn2);
30094 inform (DECL_SOURCE_LOCATION (fn1),
30095 "previous declaration of %D", fn1);
30096 /* Prevent diagnosing of the same error multiple times. */
30097 DECL_ATTRIBUTES (fn2)
30098 = tree_cons (get_identifier ("target"),
30099 copy_node (TREE_VALUE (attr1)),
30100 DECL_ATTRIBUTES (fn2));
30102 return false;
30105 target1 = sorted_attr_string (TREE_VALUE (attr1));
30106 target2 = sorted_attr_string (TREE_VALUE (attr2));
30108 /* The sorted target strings must be different for fn1 and fn2
30109 to be versions. */
30110 if (strcmp (target1, target2) == 0)
30111 result = false;
30112 else
30113 result = true;
30115 XDELETEVEC (target1);
30116 XDELETEVEC (target2);
30118 return result;
30121 static tree
30122 ix86_mangle_decl_assembler_name (tree decl, tree id)
30124 /* For function version, add the target suffix to the assembler name. */
30125 if (TREE_CODE (decl) == FUNCTION_DECL
30126 && DECL_FUNCTION_VERSIONED (decl))
30127 id = ix86_mangle_function_version_assembler_name (decl, id);
30128 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
30129 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
30130 #endif
30132 return id;
30135 /* Return a new name by appending SUFFIX to the DECL name. If make_unique
30136 is true, append the full path name of the source file. */
30138 static char *
30139 make_name (tree decl, const char *suffix, bool make_unique)
30141 char *global_var_name;
30142 int name_len;
30143 const char *name;
30144 const char *unique_name = NULL;
30146 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
30148 /* Get a unique name that can be used globally without any chances
30149 of collision at link time. */
30150 if (make_unique)
30151 unique_name = IDENTIFIER_POINTER (get_file_function_name ("\0"));
30153 name_len = strlen (name) + strlen (suffix) + 2;
30155 if (make_unique)
30156 name_len += strlen (unique_name) + 1;
30157 global_var_name = XNEWVEC (char, name_len);
30159 /* Use '.' to concatenate names as it is demangler friendly. */
30160 if (make_unique)
30161 snprintf (global_var_name, name_len, "%s.%s.%s", name, unique_name,
30162 suffix);
30163 else
30164 snprintf (global_var_name, name_len, "%s.%s", name, suffix);
30166 return global_var_name;
30169 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
30171 /* Make a dispatcher declaration for the multi-versioned function DECL.
30172 Calls to DECL function will be replaced with calls to the dispatcher
30173 by the front-end. Return the decl created. */
30175 static tree
30176 make_dispatcher_decl (const tree decl)
30178 tree func_decl;
30179 char *func_name;
30180 tree fn_type, func_type;
30181 bool is_uniq = false;
30183 if (TREE_PUBLIC (decl) == 0)
30184 is_uniq = true;
30186 func_name = make_name (decl, "ifunc", is_uniq);
30188 fn_type = TREE_TYPE (decl);
30189 func_type = build_function_type (TREE_TYPE (fn_type),
30190 TYPE_ARG_TYPES (fn_type));
30192 func_decl = build_fn_decl (func_name, func_type);
30193 XDELETEVEC (func_name);
30194 TREE_USED (func_decl) = 1;
30195 DECL_CONTEXT (func_decl) = NULL_TREE;
30196 DECL_INITIAL (func_decl) = error_mark_node;
30197 DECL_ARTIFICIAL (func_decl) = 1;
30198 /* Mark this func as external, the resolver will flip it again if
30199 it gets generated. */
30200 DECL_EXTERNAL (func_decl) = 1;
30201 /* IFUNCs have to be externally visible, so make the decl public. */
30202 TREE_PUBLIC (func_decl) = 1;
30204 return func_decl;
30207 #endif
30209 /* Returns true if DECL is multi-versioned and is the default function,
30210 that is, it is not tagged with a target-specific optimization. */
30212 static bool
30213 is_function_default_version (const tree decl)
30215 if (TREE_CODE (decl) != FUNCTION_DECL
30216 || !DECL_FUNCTION_VERSIONED (decl))
30217 return false;
30218 tree attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
30219 gcc_assert (attr);
30220 attr = TREE_VALUE (TREE_VALUE (attr));
30221 return (TREE_CODE (attr) == STRING_CST
30222 && strcmp (TREE_STRING_POINTER (attr), "default") == 0);
30225 /* Make a dispatcher declaration for the multi-versioned function DECL.
30226 Calls to DECL function will be replaced with calls to the dispatcher
30227 by the front-end. Returns the decl of the dispatcher function. */
30229 static tree
30230 ix86_get_function_versions_dispatcher (void *decl)
30232 tree fn = (tree) decl;
30233 struct cgraph_node *node = NULL;
30234 struct cgraph_node *default_node = NULL;
30235 struct cgraph_function_version_info *node_v = NULL;
30236 struct cgraph_function_version_info *first_v = NULL;
30238 tree dispatch_decl = NULL;
30240 struct cgraph_function_version_info *default_version_info = NULL;
30242 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
30244 node = cgraph_get_node (fn);
30245 gcc_assert (node != NULL);
30247 node_v = get_cgraph_node_version (node);
30248 gcc_assert (node_v != NULL);
30250 if (node_v->dispatcher_resolver != NULL)
30251 return node_v->dispatcher_resolver;
30253 /* Find the default version and make it the first node. */
30254 first_v = node_v;
30255 /* Go to the beginning of the chain. */
30256 while (first_v->prev != NULL)
30257 first_v = first_v->prev;
30258 default_version_info = first_v;
30259 while (default_version_info != NULL)
30261 if (is_function_default_version
30262 (default_version_info->this_node->decl))
30263 break;
30264 default_version_info = default_version_info->next;
30267 /* If there is no default node, just return NULL. */
30268 if (default_version_info == NULL)
30269 return NULL;
30271 /* Make default info the first node. */
30272 if (first_v != default_version_info)
30274 default_version_info->prev->next = default_version_info->next;
30275 if (default_version_info->next)
30276 default_version_info->next->prev = default_version_info->prev;
30277 first_v->prev = default_version_info;
30278 default_version_info->next = first_v;
30279 default_version_info->prev = NULL;
30282 default_node = default_version_info->this_node;
30284 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
30285 if (targetm.has_ifunc_p ())
30287 struct cgraph_function_version_info *it_v = NULL;
30288 struct cgraph_node *dispatcher_node = NULL;
30289 struct cgraph_function_version_info *dispatcher_version_info = NULL;
30291 /* Right now, the dispatching is done via ifunc. */
30292 dispatch_decl = make_dispatcher_decl (default_node->decl);
30294 dispatcher_node = cgraph_get_create_node (dispatch_decl);
30295 gcc_assert (dispatcher_node != NULL);
30296 dispatcher_node->dispatcher_function = 1;
30297 dispatcher_version_info
30298 = insert_new_cgraph_node_version (dispatcher_node);
30299 dispatcher_version_info->next = default_version_info;
30300 dispatcher_node->definition = 1;
30302 /* Set the dispatcher for all the versions. */
30303 it_v = default_version_info;
30304 while (it_v != NULL)
30306 it_v->dispatcher_resolver = dispatch_decl;
30307 it_v = it_v->next;
30310 else
30311 #endif
30313 error_at (DECL_SOURCE_LOCATION (default_node->decl),
30314 "multiversioning needs ifunc which is not supported "
30315 "on this target");
30318 return dispatch_decl;
30321 /* Makes a function attribute of the form NAME(ARG_NAME) and chains
30322 it to CHAIN. */
30324 static tree
30325 make_attribute (const char *name, const char *arg_name, tree chain)
30327 tree attr_name;
30328 tree attr_arg_name;
30329 tree attr_args;
30330 tree attr;
30332 attr_name = get_identifier (name);
30333 attr_arg_name = build_string (strlen (arg_name), arg_name);
30334 attr_args = tree_cons (NULL_TREE, attr_arg_name, NULL_TREE);
30335 attr = tree_cons (attr_name, attr_args, chain);
30336 return attr;
30339 /* Make the resolver function decl to dispatch the versions of
30340 a multi-versioned function, DEFAULT_DECL. Create an
30341 empty basic block in the resolver and store the pointer in
30342 EMPTY_BB. Return the decl of the resolver function. */
30344 static tree
30345 make_resolver_func (const tree default_decl,
30346 const tree dispatch_decl,
30347 basic_block *empty_bb)
30349 char *resolver_name;
30350 tree decl, type, decl_name, t;
30351 bool is_uniq = false;
30353 /* IFUNC's have to be globally visible. So, if the default_decl is
30354 not, then the name of the IFUNC should be made unique. */
30355 if (TREE_PUBLIC (default_decl) == 0)
30356 is_uniq = true;
30358 /* Append the filename to the resolver function if the versions are
30359 not externally visible. This is because the resolver function has
30360 to be externally visible for the loader to find it. So, appending
30361 the filename will prevent conflicts with a resolver function from
30362 another module which is based on the same version name. */
30363 resolver_name = make_name (default_decl, "resolver", is_uniq);
30365 /* The resolver function should return a (void *). */
30366 type = build_function_type_list (ptr_type_node, NULL_TREE);
30368 decl = build_fn_decl (resolver_name, type);
30369 decl_name = get_identifier (resolver_name);
30370 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
30372 DECL_NAME (decl) = decl_name;
30373 TREE_USED (decl) = 1;
30374 DECL_ARTIFICIAL (decl) = 1;
30375 DECL_IGNORED_P (decl) = 0;
30376 /* IFUNC resolvers have to be externally visible. */
30377 TREE_PUBLIC (decl) = 1;
30378 DECL_UNINLINABLE (decl) = 1;
30380 /* Resolver is not external, body is generated. */
30381 DECL_EXTERNAL (decl) = 0;
30382 DECL_EXTERNAL (dispatch_decl) = 0;
30384 DECL_CONTEXT (decl) = NULL_TREE;
30385 DECL_INITIAL (decl) = make_node (BLOCK);
30386 DECL_STATIC_CONSTRUCTOR (decl) = 0;
30388 if (DECL_COMDAT_GROUP (default_decl)
30389 || TREE_PUBLIC (default_decl))
30391 /* In this case, each translation unit with a call to this
30392 versioned function will put out a resolver. Ensure it
30393 is comdat to keep just one copy. */
30394 DECL_COMDAT (decl) = 1;
30395 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
30397 /* Build result decl and add to function_decl. */
30398 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
30399 DECL_ARTIFICIAL (t) = 1;
30400 DECL_IGNORED_P (t) = 1;
30401 DECL_RESULT (decl) = t;
30403 gimplify_function_tree (decl);
30404 push_cfun (DECL_STRUCT_FUNCTION (decl));
30405 *empty_bb = init_lowered_empty_function (decl, false);
30407 cgraph_add_new_function (decl, true);
30408 cgraph_call_function_insertion_hooks (cgraph_get_create_node (decl));
30410 pop_cfun ();
30412 gcc_assert (dispatch_decl != NULL);
30413 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
30414 DECL_ATTRIBUTES (dispatch_decl)
30415 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
30417 /* Create the alias for dispatch to resolver here. */
30418 /*cgraph_create_function_alias (dispatch_decl, decl);*/
30419 cgraph_same_body_alias (NULL, dispatch_decl, decl);
30420 XDELETEVEC (resolver_name);
30421 return decl;
30424 /* Generate the dispatching code body to dispatch multi-versioned function
30425 DECL. The target hook is called to process the "target" attributes and
30426 provide the code to dispatch the right function at run-time. NODE points
30427 to the dispatcher decl whose body will be created. */
30429 static tree
30430 ix86_generate_version_dispatcher_body (void *node_p)
30432 tree resolver_decl;
30433 basic_block empty_bb;
30434 vec<tree> fn_ver_vec = vNULL;
30435 tree default_ver_decl;
30436 struct cgraph_node *versn;
30437 struct cgraph_node *node;
30439 struct cgraph_function_version_info *node_version_info = NULL;
30440 struct cgraph_function_version_info *versn_info = NULL;
30442 node = (cgraph_node *)node_p;
30444 node_version_info = get_cgraph_node_version (node);
30445 gcc_assert (node->dispatcher_function
30446 && node_version_info != NULL);
30448 if (node_version_info->dispatcher_resolver)
30449 return node_version_info->dispatcher_resolver;
30451 /* The first version in the chain corresponds to the default version. */
30452 default_ver_decl = node_version_info->next->this_node->decl;
30454 /* node is going to be an alias, so remove the finalized bit. */
30455 node->definition = false;
30457 resolver_decl = make_resolver_func (default_ver_decl,
30458 node->decl, &empty_bb);
30460 node_version_info->dispatcher_resolver = resolver_decl;
30462 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
30464 fn_ver_vec.create (2);
30466 for (versn_info = node_version_info->next; versn_info;
30467 versn_info = versn_info->next)
30469 versn = versn_info->this_node;
30470 /* Check for virtual functions here again, as by this time it should
30471 have been determined if this function needs a vtable index or
30472 not. This happens for methods in derived classes that override
30473 virtual methods in base classes but are not explicitly marked as
30474 virtual. */
30475 if (DECL_VINDEX (versn->decl))
30476 sorry ("Virtual function multiversioning not supported");
30478 fn_ver_vec.safe_push (versn->decl);
30481 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
30482 fn_ver_vec.release ();
30483 rebuild_cgraph_edges ();
30484 pop_cfun ();
30485 return resolver_decl;
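/* End-to-end illustration (hypothetical C++ user code, not part of this
   file) of what the dispatcher machinery above serves:

     __attribute__ ((target ("default"))) int foo (void) { return 0; }
     __attribute__ ((target ("avx2")))    int foo (void) { return 2; }

   The front end obtains an IFUNC dispatcher for foo via
   ix86_get_function_versions_dispatcher, and the resolver body built here
   calls __builtin_cpu_init and then tests __builtin_cpu_supports ("avx2")
   before falling back to the default version.  */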
30487 /* This builds the processor_model struct type defined in
30488 libgcc/config/i386/cpuinfo.c */
30490 static tree
30491 build_processor_model_struct (void)
30493 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
30494 "__cpu_features"};
30495 tree field = NULL_TREE, field_chain = NULL_TREE;
30496 int i;
30497 tree type = make_node (RECORD_TYPE);
30499 /* The first 3 fields are unsigned int. */
30500 for (i = 0; i < 3; ++i)
30502 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
30503 get_identifier (field_name[i]), unsigned_type_node);
30504 if (field_chain != NULL_TREE)
30505 DECL_CHAIN (field) = field_chain;
30506 field_chain = field;
30509 /* The last field is an array of unsigned integers of size one. */
30510 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
30511 get_identifier (field_name[3]),
30512 build_array_type (unsigned_type_node,
30513 build_index_type (size_one_node)));
30514 if (field_chain != NULL_TREE)
30515 DECL_CHAIN (field) = field_chain;
30516 field_chain = field;
30518 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
30519 return type;
30522 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
30524 static tree
30525 make_var_decl (tree type, const char *name)
30527 tree new_decl;
30529 new_decl = build_decl (UNKNOWN_LOCATION,
30530 VAR_DECL,
30531 get_identifier(name),
30532 type);
30534 DECL_EXTERNAL (new_decl) = 1;
30535 TREE_STATIC (new_decl) = 1;
30536 TREE_PUBLIC (new_decl) = 1;
30537 DECL_INITIAL (new_decl) = 0;
30538 DECL_ARTIFICIAL (new_decl) = 0;
30539 DECL_PRESERVE_P (new_decl) = 1;
30541 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
30542 assemble_variable (new_decl, 0, 0, 0);
30544 return new_decl;
30547 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
30548 into an integer defined in libgcc/config/i386/cpuinfo.c. */
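/* Sketch of the folding done below (illustrative, using the enum names
   defined in this function):

     __builtin_cpu_is ("amd")
       ==>  (int) (__cpu_model.__cpu_vendor == M_AMD)
     __builtin_cpu_supports ("sse4.2")
       ==>  (int) (__cpu_model.__cpu_features[0] & (1 << F_SSE4_2))  */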
30550 static tree
30551 fold_builtin_cpu (tree fndecl, tree *args)
30553 unsigned int i;
30554 enum ix86_builtins fn_code = (enum ix86_builtins)
30555 DECL_FUNCTION_CODE (fndecl);
30556 tree param_string_cst = NULL;
30558 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
30559 enum processor_features
30561 F_CMOV = 0,
30562 F_MMX,
30563 F_POPCNT,
30564 F_SSE,
30565 F_SSE2,
30566 F_SSE3,
30567 F_SSSE3,
30568 F_SSE4_1,
30569 F_SSE4_2,
30570 F_AVX,
30571 F_AVX2,
30572 F_MAX
30575 /* These are the values for vendor types and cpu types and subtypes
30576 in cpuinfo.c. Cpu types and subtypes should have the corresponding
30577 start value subtracted from them. */
30578 enum processor_model
30580 M_INTEL = 1,
30581 M_AMD,
30582 M_CPU_TYPE_START,
30583 M_INTEL_ATOM,
30584 M_INTEL_CORE2,
30585 M_INTEL_COREI7,
30586 M_AMDFAM10H,
30587 M_AMDFAM15H,
30588 M_INTEL_SLM,
30589 M_CPU_SUBTYPE_START,
30590 M_INTEL_COREI7_NEHALEM,
30591 M_INTEL_COREI7_WESTMERE,
30592 M_INTEL_COREI7_SANDYBRIDGE,
30593 M_AMDFAM10H_BARCELONA,
30594 M_AMDFAM10H_SHANGHAI,
30595 M_AMDFAM10H_ISTANBUL,
30596 M_AMDFAM15H_BDVER1,
30597 M_AMDFAM15H_BDVER2,
30598 M_AMDFAM15H_BDVER3
30601 static struct _arch_names_table
30603 const char *const name;
30604 const enum processor_model model;
30606 const arch_names_table[] =
30608 {"amd", M_AMD},
30609 {"intel", M_INTEL},
30610 {"atom", M_INTEL_ATOM},
30611 {"slm", M_INTEL_SLM},
30612 {"core2", M_INTEL_CORE2},
30613 {"corei7", M_INTEL_COREI7},
30614 {"nehalem", M_INTEL_COREI7_NEHALEM},
30615 {"westmere", M_INTEL_COREI7_WESTMERE},
30616 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
30617 {"amdfam10h", M_AMDFAM10H},
30618 {"barcelona", M_AMDFAM10H_BARCELONA},
30619 {"shanghai", M_AMDFAM10H_SHANGHAI},
30620 {"istanbul", M_AMDFAM10H_ISTANBUL},
30621 {"amdfam15h", M_AMDFAM15H},
30622 {"bdver1", M_AMDFAM15H_BDVER1},
30623 {"bdver2", M_AMDFAM15H_BDVER2},
30624 {"bdver3", M_AMDFAM15H_BDVER3},
30627 static struct _isa_names_table
30629 const char *const name;
30630 const enum processor_features feature;
30632 const isa_names_table[] =
30634 {"cmov", F_CMOV},
30635 {"mmx", F_MMX},
30636 {"popcnt", F_POPCNT},
30637 {"sse", F_SSE},
30638 {"sse2", F_SSE2},
30639 {"sse3", F_SSE3},
30640 {"ssse3", F_SSSE3},
30641 {"sse4.1", F_SSE4_1},
30642 {"sse4.2", F_SSE4_2},
30643 {"avx", F_AVX},
30644 {"avx2", F_AVX2}
30647 tree __processor_model_type = build_processor_model_struct ();
30648 tree __cpu_model_var = make_var_decl (__processor_model_type,
30649 "__cpu_model");
30652 varpool_add_new_variable (__cpu_model_var);
30654 gcc_assert ((args != NULL) && (*args != NULL));
30656 param_string_cst = *args;
30657 while (param_string_cst
30658 && TREE_CODE (param_string_cst) != STRING_CST)
30660 /* *args must be an expr that can contain other EXPRS leading to a
30661 STRING_CST. */
30662 if (!EXPR_P (param_string_cst))
30664 error ("Parameter to builtin must be a string constant or literal");
30665 return integer_zero_node;
30667 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
30670 gcc_assert (param_string_cst);
30672 if (fn_code == IX86_BUILTIN_CPU_IS)
30674 tree ref;
30675 tree field;
30676 tree final;
30678 unsigned int field_val = 0;
30679 unsigned int NUM_ARCH_NAMES
30680 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
30682 for (i = 0; i < NUM_ARCH_NAMES; i++)
30683 if (strcmp (arch_names_table[i].name,
30684 TREE_STRING_POINTER (param_string_cst)) == 0)
30685 break;
30687 if (i == NUM_ARCH_NAMES)
30689 error ("Parameter to builtin not valid: %s",
30690 TREE_STRING_POINTER (param_string_cst));
30691 return integer_zero_node;
30694 field = TYPE_FIELDS (__processor_model_type);
30695 field_val = arch_names_table[i].model;
30697 /* CPU types are stored in the next field. */
30698 if (field_val > M_CPU_TYPE_START
30699 && field_val < M_CPU_SUBTYPE_START)
30701 field = DECL_CHAIN (field);
30702 field_val -= M_CPU_TYPE_START;
30705 /* CPU subtypes are stored in the next field. */
30706 if (field_val > M_CPU_SUBTYPE_START)
30708 field = DECL_CHAIN (DECL_CHAIN (field));
30709 field_val -= M_CPU_SUBTYPE_START;
30712 /* Get the appropriate field in __cpu_model. */
30713 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
30714 field, NULL_TREE);
30716 /* Check the value. */
30717 final = build2 (EQ_EXPR, unsigned_type_node, ref,
30718 build_int_cstu (unsigned_type_node, field_val));
30719 return build1 (CONVERT_EXPR, integer_type_node, final);
30721 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
30723 tree ref;
30724 tree array_elt;
30725 tree field;
30726 tree final;
30728 unsigned int field_val = 0;
30729 unsigned int NUM_ISA_NAMES
30730 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
30732 for (i = 0; i < NUM_ISA_NAMES; i++)
30733 if (strcmp (isa_names_table[i].name,
30734 TREE_STRING_POINTER (param_string_cst)) == 0)
30735 break;
30737 if (i == NUM_ISA_NAMES)
30739 error ("Parameter to builtin not valid: %s",
30740 TREE_STRING_POINTER (param_string_cst));
30741 return integer_zero_node;
30744 field = TYPE_FIELDS (__processor_model_type);
30745 /* Get the last field, which is __cpu_features. */
30746 while (DECL_CHAIN (field))
30747 field = DECL_CHAIN (field);
30749 /* Get the appropriate field: __cpu_model.__cpu_features */
30750 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
30751 field, NULL_TREE);
30753 /* Access the 0th element of __cpu_features array. */
30754 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
30755 integer_zero_node, NULL_TREE, NULL_TREE);
30757 field_val = (1 << isa_names_table[i].feature);
30758 /* Return __cpu_model.__cpu_features[0] & field_val */
30759 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
30760 build_int_cstu (unsigned_type_node, field_val));
30761 return build1 (CONVERT_EXPR, integer_type_node, final);
30763 gcc_unreachable ();
30766 static tree
30767 ix86_fold_builtin (tree fndecl, int n_args,
30768 tree *args, bool ignore ATTRIBUTE_UNUSED)
30770 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
30772 enum ix86_builtins fn_code = (enum ix86_builtins)
30773 DECL_FUNCTION_CODE (fndecl);
30774 if (fn_code == IX86_BUILTIN_CPU_IS
30775 || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
30777 gcc_assert (n_args == 1);
30778 return fold_builtin_cpu (fndecl, args);
30782 #ifdef SUBTARGET_FOLD_BUILTIN
30783 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
30784 #endif
30786 return NULL_TREE;
30789 /* Make builtins to detect cpu type and features supported. NAME is
30790 the builtin name, CODE is the builtin code, and FTYPE is the function
30791 type of the builtin. */
30793 static void
30794 make_cpu_type_builtin (const char* name, int code,
30795 enum ix86_builtin_func_type ftype, bool is_const)
30797 tree decl;
30798 tree type;
30800 type = ix86_get_builtin_func_type (ftype);
30801 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
30802 NULL, NULL_TREE);
30803 gcc_assert (decl != NULL_TREE);
30804 ix86_builtins[(int) code] = decl;
30805 TREE_READONLY (decl) = is_const;
30808 /* Make builtins to get CPU type and features supported. The created
30809 builtins are:
30811 __builtin_cpu_init (), to detect cpu type and features,
30812 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
30813 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>. */
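/* A usage sketch (user code, not part of this file; the helper names are
   made up):

     if (__builtin_cpu_supports ("avx2"))
       do_avx2_version ();
     else if (__builtin_cpu_is ("corei7"))
       do_corei7_version ();
     else
       do_generic_version ();

   The data these builtins test is filled in by __builtin_cpu_init, which
   libgcc normally runs from a constructor before user code executes.  */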
30816 static void
30817 ix86_init_platform_type_builtins (void)
30819 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
30820 INT_FTYPE_VOID, false);
30821 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
30822 INT_FTYPE_PCCHAR, true);
30823 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
30824 INT_FTYPE_PCCHAR, true);
30827 /* Internal method for ix86_init_builtins. */
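/* Illustrative only (assumed user-level spelling of the builtins created
   below): with these in place, 64-bit code can switch calling conventions
   per function and still use varargs, e.g.

     __attribute__ ((ms_abi)) void f (int n, ...)
     {
       __builtin_ms_va_list ap;
       __builtin_ms_va_start (ap, n);
       /* ... consume arguments ... */
       __builtin_ms_va_end (ap);
     }  */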
30829 static void
30830 ix86_init_builtins_va_builtins_abi (void)
30832 tree ms_va_ref, sysv_va_ref;
30833 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
30834 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
30835 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
30836 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
30838 if (!TARGET_64BIT)
30839 return;
30840 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
30841 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
30842 ms_va_ref = build_reference_type (ms_va_list_type_node);
30843 sysv_va_ref =
30844 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
30846 fnvoid_va_end_ms =
30847 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
30848 fnvoid_va_start_ms =
30849 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
30850 fnvoid_va_end_sysv =
30851 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
30852 fnvoid_va_start_sysv =
30853 build_varargs_function_type_list (void_type_node, sysv_va_ref,
30854 NULL_TREE);
30855 fnvoid_va_copy_ms =
30856 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
30857 NULL_TREE);
30858 fnvoid_va_copy_sysv =
30859 build_function_type_list (void_type_node, sysv_va_ref,
30860 sysv_va_ref, NULL_TREE);
30862 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
30863 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
30864 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
30865 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
30866 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
30867 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
30868 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
30869 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
30870 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
30871 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
30872 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
30873 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
30876 static void
30877 ix86_init_builtin_types (void)
30879 tree float128_type_node, float80_type_node;
30881 /* The __float80 type. */
30882 float80_type_node = long_double_type_node;
30883 if (TYPE_MODE (float80_type_node) != XFmode)
30885 /* The __float80 type. */
30886 float80_type_node = make_node (REAL_TYPE);
30888 TYPE_PRECISION (float80_type_node) = 80;
30889 layout_type (float80_type_node);
30891 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
30893 /* The __float128 type. */
30894 float128_type_node = make_node (REAL_TYPE);
30895 TYPE_PRECISION (float128_type_node) = 128;
30896 layout_type (float128_type_node);
30897 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
30899 /* This macro is built by i386-builtin-types.awk. */
30900 DEFINE_BUILTIN_PRIMITIVE_TYPES;
30903 static void
30904 ix86_init_builtins (void)
30906 tree t;
30908 ix86_init_builtin_types ();
30910 /* Builtins to get CPU type and features. */
30911 ix86_init_platform_type_builtins ();
30913 /* TFmode support builtins. */
30914 def_builtin_const (0, "__builtin_infq",
30915 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
30916 def_builtin_const (0, "__builtin_huge_valq",
30917 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
30919 /* We will expand them to a normal call if SSE isn't available since
30920 they are used by libgcc. */
30921 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
30922 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
30923 BUILT_IN_MD, "__fabstf2", NULL_TREE);
30924 TREE_READONLY (t) = 1;
30925 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
30927 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
30928 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
30929 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
30930 TREE_READONLY (t) = 1;
30931 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
30933 ix86_init_tm_builtins ();
30934 ix86_init_mmx_sse_builtins ();
30936 if (TARGET_LP64)
30937 ix86_init_builtins_va_builtins_abi ();
30939 #ifdef SUBTARGET_INIT_BUILTINS
30940 SUBTARGET_INIT_BUILTINS;
30941 #endif
30944 /* Return the ix86 builtin for CODE. */
30946 static tree
30947 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
30949 if (code >= IX86_BUILTIN_MAX)
30950 return error_mark_node;
30952 return ix86_builtins[code];
30955 /* Errors in the source file can cause expand_expr to return const0_rtx
30956 where we expect a vector. To avoid crashing, use one of the vector
30957 clear instructions. */
30958 static rtx
30959 safe_vector_operand (rtx x, enum machine_mode mode)
30961 if (x == const0_rtx)
30962 x = CONST0_RTX (mode);
30963 return x;
30966 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
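/* Shape handled here (a sketch; the builtin name is just an example):

     v = __builtin_ia32_paddw128 (a, b)
       ==>  emit GEN_FCN (icode) (target, op0, op1)
            i.e. one two-operand vector insn, no immediates.

   The SImode-to-TImode special case below is for builtins whose insn wants
   a TImode operand but whose prototype passes a plain int.  */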
30968 static rtx
30969 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
30971 rtx pat;
30972 tree arg0 = CALL_EXPR_ARG (exp, 0);
30973 tree arg1 = CALL_EXPR_ARG (exp, 1);
30974 rtx op0 = expand_normal (arg0);
30975 rtx op1 = expand_normal (arg1);
30976 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30977 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
30978 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
30980 if (VECTOR_MODE_P (mode0))
30981 op0 = safe_vector_operand (op0, mode0);
30982 if (VECTOR_MODE_P (mode1))
30983 op1 = safe_vector_operand (op1, mode1);
30985 if (optimize || !target
30986 || GET_MODE (target) != tmode
30987 || !insn_data[icode].operand[0].predicate (target, tmode))
30988 target = gen_reg_rtx (tmode);
30990 if (GET_MODE (op1) == SImode && mode1 == TImode)
30992 rtx x = gen_reg_rtx (V4SImode);
30993 emit_insn (gen_sse2_loadd (x, op1));
30994 op1 = gen_lowpart (TImode, x);
30997 if (!insn_data[icode].operand[1].predicate (op0, mode0))
30998 op0 = copy_to_mode_reg (mode0, op0);
30999 if (!insn_data[icode].operand[2].predicate (op1, mode1))
31000 op1 = copy_to_mode_reg (mode1, op1);
31002 pat = GEN_FCN (icode) (target, op0, op1);
31003 if (! pat)
31004 return 0;
31006 emit_insn (pat);
31008 return target;
31011 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
31013 static rtx
31014 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
31015 enum ix86_builtin_func_type m_type,
31016 enum rtx_code sub_code)
31018 rtx pat;
31019 int i;
31020 int nargs;
31021 bool comparison_p = false;
31022 bool tf_p = false;
31023 bool last_arg_constant = false;
31024 int num_memory = 0;
31025 struct {
31026 rtx op;
31027 enum machine_mode mode;
31028 } args[4];
31030 enum machine_mode tmode = insn_data[icode].operand[0].mode;
31032 switch (m_type)
31034 case MULTI_ARG_4_DF2_DI_I:
31035 case MULTI_ARG_4_DF2_DI_I1:
31036 case MULTI_ARG_4_SF2_SI_I:
31037 case MULTI_ARG_4_SF2_SI_I1:
31038 nargs = 4;
31039 last_arg_constant = true;
31040 break;
31042 case MULTI_ARG_3_SF:
31043 case MULTI_ARG_3_DF:
31044 case MULTI_ARG_3_SF2:
31045 case MULTI_ARG_3_DF2:
31046 case MULTI_ARG_3_DI:
31047 case MULTI_ARG_3_SI:
31048 case MULTI_ARG_3_SI_DI:
31049 case MULTI_ARG_3_HI:
31050 case MULTI_ARG_3_HI_SI:
31051 case MULTI_ARG_3_QI:
31052 case MULTI_ARG_3_DI2:
31053 case MULTI_ARG_3_SI2:
31054 case MULTI_ARG_3_HI2:
31055 case MULTI_ARG_3_QI2:
31056 nargs = 3;
31057 break;
31059 case MULTI_ARG_2_SF:
31060 case MULTI_ARG_2_DF:
31061 case MULTI_ARG_2_DI:
31062 case MULTI_ARG_2_SI:
31063 case MULTI_ARG_2_HI:
31064 case MULTI_ARG_2_QI:
31065 nargs = 2;
31066 break;
31068 case MULTI_ARG_2_DI_IMM:
31069 case MULTI_ARG_2_SI_IMM:
31070 case MULTI_ARG_2_HI_IMM:
31071 case MULTI_ARG_2_QI_IMM:
31072 nargs = 2;
31073 last_arg_constant = true;
31074 break;
31076 case MULTI_ARG_1_SF:
31077 case MULTI_ARG_1_DF:
31078 case MULTI_ARG_1_SF2:
31079 case MULTI_ARG_1_DF2:
31080 case MULTI_ARG_1_DI:
31081 case MULTI_ARG_1_SI:
31082 case MULTI_ARG_1_HI:
31083 case MULTI_ARG_1_QI:
31084 case MULTI_ARG_1_SI_DI:
31085 case MULTI_ARG_1_HI_DI:
31086 case MULTI_ARG_1_HI_SI:
31087 case MULTI_ARG_1_QI_DI:
31088 case MULTI_ARG_1_QI_SI:
31089 case MULTI_ARG_1_QI_HI:
31090 nargs = 1;
31091 break;
31093 case MULTI_ARG_2_DI_CMP:
31094 case MULTI_ARG_2_SI_CMP:
31095 case MULTI_ARG_2_HI_CMP:
31096 case MULTI_ARG_2_QI_CMP:
31097 nargs = 2;
31098 comparison_p = true;
31099 break;
31101 case MULTI_ARG_2_SF_TF:
31102 case MULTI_ARG_2_DF_TF:
31103 case MULTI_ARG_2_DI_TF:
31104 case MULTI_ARG_2_SI_TF:
31105 case MULTI_ARG_2_HI_TF:
31106 case MULTI_ARG_2_QI_TF:
31107 nargs = 2;
31108 tf_p = true;
31109 break;
31111 default:
31112 gcc_unreachable ();
31115 if (optimize || !target
31116 || GET_MODE (target) != tmode
31117 || !insn_data[icode].operand[0].predicate (target, tmode))
31118 target = gen_reg_rtx (tmode);
31120 gcc_assert (nargs <= 4);
31122 for (i = 0; i < nargs; i++)
31124 tree arg = CALL_EXPR_ARG (exp, i);
31125 rtx op = expand_normal (arg);
31126 int adjust = (comparison_p) ? 1 : 0;
31127 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
31129 if (last_arg_constant && i == nargs - 1)
31131 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
31133 enum insn_code new_icode = icode;
31134 switch (icode)
31136 case CODE_FOR_xop_vpermil2v2df3:
31137 case CODE_FOR_xop_vpermil2v4sf3:
31138 case CODE_FOR_xop_vpermil2v4df3:
31139 case CODE_FOR_xop_vpermil2v8sf3:
31140 error ("the last argument must be a 2-bit immediate");
31141 return gen_reg_rtx (tmode);
31142 case CODE_FOR_xop_rotlv2di3:
31143 new_icode = CODE_FOR_rotlv2di3;
31144 goto xop_rotl;
31145 case CODE_FOR_xop_rotlv4si3:
31146 new_icode = CODE_FOR_rotlv4si3;
31147 goto xop_rotl;
31148 case CODE_FOR_xop_rotlv8hi3:
31149 new_icode = CODE_FOR_rotlv8hi3;
31150 goto xop_rotl;
31151 case CODE_FOR_xop_rotlv16qi3:
31152 new_icode = CODE_FOR_rotlv16qi3;
31153 xop_rotl:
31154 if (CONST_INT_P (op))
31156 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
31157 op = GEN_INT (INTVAL (op) & mask);
31158 gcc_checking_assert
31159 (insn_data[icode].operand[i + 1].predicate (op, mode));
31161 else
31163 gcc_checking_assert
31164 (nargs == 2
31165 && insn_data[new_icode].operand[0].mode == tmode
31166 && insn_data[new_icode].operand[1].mode == tmode
31167 && insn_data[new_icode].operand[2].mode == mode
31168 && insn_data[new_icode].operand[0].predicate
31169 == insn_data[icode].operand[0].predicate
31170 && insn_data[new_icode].operand[1].predicate
31171 == insn_data[icode].operand[1].predicate);
31172 icode = new_icode;
31173 goto non_constant;
31175 break;
31176 default:
31177 gcc_unreachable ();
31181 else
31183 non_constant:
31184 if (VECTOR_MODE_P (mode))
31185 op = safe_vector_operand (op, mode);
31187 /* If we aren't optimizing, only allow one memory operand to be
31188 generated. */
31189 if (memory_operand (op, mode))
31190 num_memory++;
31192 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
31194 if (optimize
31195 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
31196 || num_memory > 1)
31197 op = force_reg (mode, op);
31200 args[i].op = op;
31201 args[i].mode = mode;
31204 switch (nargs)
31206 case 1:
31207 pat = GEN_FCN (icode) (target, args[0].op);
31208 break;
31210 case 2:
31211 if (tf_p)
31212 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
31213 GEN_INT ((int)sub_code));
31214 else if (! comparison_p)
31215 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
31216 else
31218 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
31219 args[0].op,
31220 args[1].op);
31222 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
31224 break;
31226 case 3:
31227 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
31228 break;
31230 case 4:
31231 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
31232 break;
31234 default:
31235 gcc_unreachable ();
31238 if (! pat)
31239 return 0;
31241 emit_insn (pat);
31242 return target;
31245 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
31246 insns with vec_merge. */
31248 static rtx
31249 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
31250 rtx target)
31252 rtx pat;
31253 tree arg0 = CALL_EXPR_ARG (exp, 0);
31254 rtx op1, op0 = expand_normal (arg0);
31255 enum machine_mode tmode = insn_data[icode].operand[0].mode;
31256 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
31258 if (optimize || !target
31259 || GET_MODE (target) != tmode
31260 || !insn_data[icode].operand[0].predicate (target, tmode))
31261 target = gen_reg_rtx (tmode);
31263 if (VECTOR_MODE_P (mode0))
31264 op0 = safe_vector_operand (op0, mode0);
31266 if ((optimize && !register_operand (op0, mode0))
31267 || !insn_data[icode].operand[1].predicate (op0, mode0))
31268 op0 = copy_to_mode_reg (mode0, op0);
31270 op1 = op0;
31271 if (!insn_data[icode].operand[2].predicate (op1, mode0))
31272 op1 = copy_to_mode_reg (mode0, op1);
31274 pat = GEN_FCN (icode) (target, op0, op1);
31275 if (! pat)
31276 return 0;
31277 emit_insn (pat);
31278 return target;
31281 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
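/* Example of the SWAP case (assumed from the builtin tables): SSE has
   cmpltps but no cmpgtps, so __builtin_ia32_cmpgtps is described with an
   LT comparison plus the ..._SWAP function type and is expanded here as
   cmpltps with the two operands exchanged.  */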
31283 static rtx
31284 ix86_expand_sse_compare (const struct builtin_description *d,
31285 tree exp, rtx target, bool swap)
31287 rtx pat;
31288 tree arg0 = CALL_EXPR_ARG (exp, 0);
31289 tree arg1 = CALL_EXPR_ARG (exp, 1);
31290 rtx op0 = expand_normal (arg0);
31291 rtx op1 = expand_normal (arg1);
31292 rtx op2;
31293 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
31294 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
31295 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
31296 enum rtx_code comparison = d->comparison;
31298 if (VECTOR_MODE_P (mode0))
31299 op0 = safe_vector_operand (op0, mode0);
31300 if (VECTOR_MODE_P (mode1))
31301 op1 = safe_vector_operand (op1, mode1);
31303 /* Swap operands if we have a comparison that isn't available in
31304 hardware. */
31305 if (swap)
31307 rtx tmp = gen_reg_rtx (mode1);
31308 emit_move_insn (tmp, op1);
31309 op1 = op0;
31310 op0 = tmp;
31313 if (optimize || !target
31314 || GET_MODE (target) != tmode
31315 || !insn_data[d->icode].operand[0].predicate (target, tmode))
31316 target = gen_reg_rtx (tmode);
31318 if ((optimize && !register_operand (op0, mode0))
31319 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
31320 op0 = copy_to_mode_reg (mode0, op0);
31321 if ((optimize && !register_operand (op1, mode1))
31322 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
31323 op1 = copy_to_mode_reg (mode1, op1);
31325 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
31326 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
31327 if (! pat)
31328 return 0;
31329 emit_insn (pat);
31330 return target;
31333 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
31335 static rtx
31336 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
31337 rtx target)
31339 rtx pat;
31340 tree arg0 = CALL_EXPR_ARG (exp, 0);
31341 tree arg1 = CALL_EXPR_ARG (exp, 1);
31342 rtx op0 = expand_normal (arg0);
31343 rtx op1 = expand_normal (arg1);
31344 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
31345 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
31346 enum rtx_code comparison = d->comparison;
31348 if (VECTOR_MODE_P (mode0))
31349 op0 = safe_vector_operand (op0, mode0);
31350 if (VECTOR_MODE_P (mode1))
31351 op1 = safe_vector_operand (op1, mode1);
31353 /* Swap operands if we have a comparison that isn't available in
31354 hardware. */
31355 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
31357 rtx tmp = op1;
31358 op1 = op0;
31359 op0 = tmp;
31362 target = gen_reg_rtx (SImode);
31363 emit_move_insn (target, const0_rtx);
31364 target = gen_rtx_SUBREG (QImode, target, 0);
31366 if ((optimize && !register_operand (op0, mode0))
31367 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31368 op0 = copy_to_mode_reg (mode0, op0);
31369 if ((optimize && !register_operand (op1, mode1))
31370 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
31371 op1 = copy_to_mode_reg (mode1, op1);
31373 pat = GEN_FCN (d->icode) (op0, op1);
31374 if (! pat)
31375 return 0;
31376 emit_insn (pat);
31377 emit_insn (gen_rtx_SET (VOIDmode,
31378 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31379 gen_rtx_fmt_ee (comparison, QImode,
31380 SET_DEST (pat),
31381 const0_rtx)));
31383 return SUBREG_REG (target);
31386 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
31388 static rtx
31389 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
31390 rtx target)
31392 rtx pat;
31393 tree arg0 = CALL_EXPR_ARG (exp, 0);
31394 rtx op1, op0 = expand_normal (arg0);
31395 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
31396 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
31398 if (optimize || target == 0
31399 || GET_MODE (target) != tmode
31400 || !insn_data[d->icode].operand[0].predicate (target, tmode))
31401 target = gen_reg_rtx (tmode);
31403 if (VECTOR_MODE_P (mode0))
31404 op0 = safe_vector_operand (op0, mode0);
31406 if ((optimize && !register_operand (op0, mode0))
31407 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31408 op0 = copy_to_mode_reg (mode0, op0);
31410 op1 = GEN_INT (d->comparison);
31412 pat = GEN_FCN (d->icode) (target, op0, op1);
31413 if (! pat)
31414 return 0;
31415 emit_insn (pat);
31416 return target;
31419 static rtx
31420 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
31421 tree exp, rtx target)
31423 rtx pat;
31424 tree arg0 = CALL_EXPR_ARG (exp, 0);
31425 tree arg1 = CALL_EXPR_ARG (exp, 1);
31426 rtx op0 = expand_normal (arg0);
31427 rtx op1 = expand_normal (arg1);
31428 rtx op2;
31429 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
31430 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
31431 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
31433 if (optimize || target == 0
31434 || GET_MODE (target) != tmode
31435 || !insn_data[d->icode].operand[0].predicate (target, tmode))
31436 target = gen_reg_rtx (tmode);
31438 op0 = safe_vector_operand (op0, mode0);
31439 op1 = safe_vector_operand (op1, mode1);
31441 if ((optimize && !register_operand (op0, mode0))
31442 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31443 op0 = copy_to_mode_reg (mode0, op0);
31444 if ((optimize && !register_operand (op1, mode1))
31445 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
31446 op1 = copy_to_mode_reg (mode1, op1);
31448 op2 = GEN_INT (d->comparison);
31450 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
31451 if (! pat)
31452 return 0;
31453 emit_insn (pat);
31454 return target;
31457 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
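/* Orientation (assumed mapping to the SSE4.1 intrinsics): e.g.
   _mm_testz_si128 -> __builtin_ia32_ptestz128 with d->comparison == EQ,
   so the returned int means "ZF was set by PTEST"; the testc/testnzc
   variants use the other flag conditions in the same way.  */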
31459 static rtx
31460 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
31461 rtx target)
31463 rtx pat;
31464 tree arg0 = CALL_EXPR_ARG (exp, 0);
31465 tree arg1 = CALL_EXPR_ARG (exp, 1);
31466 rtx op0 = expand_normal (arg0);
31467 rtx op1 = expand_normal (arg1);
31468 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
31469 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
31470 enum rtx_code comparison = d->comparison;
31472 if (VECTOR_MODE_P (mode0))
31473 op0 = safe_vector_operand (op0, mode0);
31474 if (VECTOR_MODE_P (mode1))
31475 op1 = safe_vector_operand (op1, mode1);
31477 target = gen_reg_rtx (SImode);
31478 emit_move_insn (target, const0_rtx);
31479 target = gen_rtx_SUBREG (QImode, target, 0);
31481 if ((optimize && !register_operand (op0, mode0))
31482 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31483 op0 = copy_to_mode_reg (mode0, op0);
31484 if ((optimize && !register_operand (op1, mode1))
31485 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
31486 op1 = copy_to_mode_reg (mode1, op1);
31488 pat = GEN_FCN (d->icode) (op0, op1);
31489 if (! pat)
31490 return 0;
31491 emit_insn (pat);
31492 emit_insn (gen_rtx_SET (VOIDmode,
31493 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31494 gen_rtx_fmt_ee (comparison, QImode,
31495 SET_DEST (pat),
31496 const0_rtx)));
31498 return SUBREG_REG (target);
31501 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
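/* Orientation (assumed mapping to the SSE4.2 string intrinsics):
   _mm_cmpestri / _mm_cmpestrm use the index and mask results handled by
   the first two branches below, while _mm_cmpestra and friends are the
   d->flag case, where the interesting result is a bit of FLAGS_REG.  */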
31503 static rtx
31504 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
31505 tree exp, rtx target)
31507 rtx pat;
31508 tree arg0 = CALL_EXPR_ARG (exp, 0);
31509 tree arg1 = CALL_EXPR_ARG (exp, 1);
31510 tree arg2 = CALL_EXPR_ARG (exp, 2);
31511 tree arg3 = CALL_EXPR_ARG (exp, 3);
31512 tree arg4 = CALL_EXPR_ARG (exp, 4);
31513 rtx scratch0, scratch1;
31514 rtx op0 = expand_normal (arg0);
31515 rtx op1 = expand_normal (arg1);
31516 rtx op2 = expand_normal (arg2);
31517 rtx op3 = expand_normal (arg3);
31518 rtx op4 = expand_normal (arg4);
31519 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
31521 tmode0 = insn_data[d->icode].operand[0].mode;
31522 tmode1 = insn_data[d->icode].operand[1].mode;
31523 modev2 = insn_data[d->icode].operand[2].mode;
31524 modei3 = insn_data[d->icode].operand[3].mode;
31525 modev4 = insn_data[d->icode].operand[4].mode;
31526 modei5 = insn_data[d->icode].operand[5].mode;
31527 modeimm = insn_data[d->icode].operand[6].mode;
31529 if (VECTOR_MODE_P (modev2))
31530 op0 = safe_vector_operand (op0, modev2);
31531 if (VECTOR_MODE_P (modev4))
31532 op2 = safe_vector_operand (op2, modev4);
31534 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
31535 op0 = copy_to_mode_reg (modev2, op0);
31536 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
31537 op1 = copy_to_mode_reg (modei3, op1);
31538 if ((optimize && !register_operand (op2, modev4))
31539 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
31540 op2 = copy_to_mode_reg (modev4, op2);
31541 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
31542 op3 = copy_to_mode_reg (modei5, op3);
31544 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
31546 error ("the fifth argument must be an 8-bit immediate");
31547 return const0_rtx;
31550 if (d->code == IX86_BUILTIN_PCMPESTRI128)
31552 if (optimize || !target
31553 || GET_MODE (target) != tmode0
31554 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
31555 target = gen_reg_rtx (tmode0);
31557 scratch1 = gen_reg_rtx (tmode1);
31559 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
31561 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
31563 if (optimize || !target
31564 || GET_MODE (target) != tmode1
31565 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
31566 target = gen_reg_rtx (tmode1);
31568 scratch0 = gen_reg_rtx (tmode0);
31570 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
31572 else
31574 gcc_assert (d->flag);
31576 scratch0 = gen_reg_rtx (tmode0);
31577 scratch1 = gen_reg_rtx (tmode1);
31579 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
31582 if (! pat)
31583 return 0;
31585 emit_insn (pat);
31587 if (d->flag)
31589 target = gen_reg_rtx (SImode);
31590 emit_move_insn (target, const0_rtx);
31591 target = gen_rtx_SUBREG (QImode, target, 0);
31593 emit_insn
31594 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31595 gen_rtx_fmt_ee (EQ, QImode,
31596 gen_rtx_REG ((enum machine_mode) d->flag,
31597 FLAGS_REG),
31598 const0_rtx)));
31599 return SUBREG_REG (target);
31601 else
31602 return target;
31606 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
31608 static rtx
31609 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
31610 tree exp, rtx target)
31612 rtx pat;
31613 tree arg0 = CALL_EXPR_ARG (exp, 0);
31614 tree arg1 = CALL_EXPR_ARG (exp, 1);
31615 tree arg2 = CALL_EXPR_ARG (exp, 2);
31616 rtx scratch0, scratch1;
31617 rtx op0 = expand_normal (arg0);
31618 rtx op1 = expand_normal (arg1);
31619 rtx op2 = expand_normal (arg2);
31620 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
31622 tmode0 = insn_data[d->icode].operand[0].mode;
31623 tmode1 = insn_data[d->icode].operand[1].mode;
31624 modev2 = insn_data[d->icode].operand[2].mode;
31625 modev3 = insn_data[d->icode].operand[3].mode;
31626 modeimm = insn_data[d->icode].operand[4].mode;
31628 if (VECTOR_MODE_P (modev2))
31629 op0 = safe_vector_operand (op0, modev2);
31630 if (VECTOR_MODE_P (modev3))
31631 op1 = safe_vector_operand (op1, modev3);
31633 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
31634 op0 = copy_to_mode_reg (modev2, op0);
31635 if ((optimize && !register_operand (op1, modev3))
31636 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
31637 op1 = copy_to_mode_reg (modev3, op1);
31639 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
31641 error ("the third argument must be an 8-bit immediate");
31642 return const0_rtx;
31645 if (d->code == IX86_BUILTIN_PCMPISTRI128)
31647 if (optimize || !target
31648 || GET_MODE (target) != tmode0
31649 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
31650 target = gen_reg_rtx (tmode0);
31652 scratch1 = gen_reg_rtx (tmode1);
31654 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
31656 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
31658 if (optimize || !target
31659 || GET_MODE (target) != tmode1
31660 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
31661 target = gen_reg_rtx (tmode1);
31663 scratch0 = gen_reg_rtx (tmode0);
31665 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
31667 else
31669 gcc_assert (d->flag);
31671 scratch0 = gen_reg_rtx (tmode0);
31672 scratch1 = gen_reg_rtx (tmode1);
31674 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
31677 if (! pat)
31678 return 0;
31680 emit_insn (pat);
31682 if (d->flag)
31684 target = gen_reg_rtx (SImode);
31685 emit_move_insn (target, const0_rtx);
31686 target = gen_rtx_SUBREG (QImode, target, 0);
31688 emit_insn
31689 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31690 gen_rtx_fmt_ee (EQ, QImode,
31691 gen_rtx_REG ((enum machine_mode) d->flag,
31692 FLAGS_REG),
31693 const0_rtx)));
31694 return SUBREG_REG (target);
31696 else
31697 return target;
31700 /* Subroutine of ix86_expand_builtin to take care of insns with
31701 variable number of operands. */
31703 static rtx
31704 ix86_expand_args_builtin (const struct builtin_description *d,
31705 tree exp, rtx target)
31707 rtx pat, real_target;
31708 unsigned int i, nargs;
31709 unsigned int nargs_constant = 0;
31710 int num_memory = 0;
31711 struct
31713 rtx op;
31714 enum machine_mode mode;
31715 } args[4];
31716 bool last_arg_count = false;
31717 enum insn_code icode = d->icode;
31718 const struct insn_data_d *insn_p = &insn_data[icode];
31719 enum machine_mode tmode = insn_p->operand[0].mode;
31720 enum machine_mode rmode = VOIDmode;
31721 bool swap = false;
31722 enum rtx_code comparison = d->comparison;
31724 switch ((enum ix86_builtin_func_type) d->flag)
31726 case V2DF_FTYPE_V2DF_ROUND:
31727 case V4DF_FTYPE_V4DF_ROUND:
31728 case V4SF_FTYPE_V4SF_ROUND:
31729 case V8SF_FTYPE_V8SF_ROUND:
31730 case V4SI_FTYPE_V4SF_ROUND:
31731 case V8SI_FTYPE_V8SF_ROUND:
31732 return ix86_expand_sse_round (d, exp, target);
31733 case V4SI_FTYPE_V2DF_V2DF_ROUND:
31734 case V8SI_FTYPE_V4DF_V4DF_ROUND:
31735 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
31736 case INT_FTYPE_V8SF_V8SF_PTEST:
31737 case INT_FTYPE_V4DI_V4DI_PTEST:
31738 case INT_FTYPE_V4DF_V4DF_PTEST:
31739 case INT_FTYPE_V4SF_V4SF_PTEST:
31740 case INT_FTYPE_V2DI_V2DI_PTEST:
31741 case INT_FTYPE_V2DF_V2DF_PTEST:
31742 return ix86_expand_sse_ptest (d, exp, target);
31743 case FLOAT128_FTYPE_FLOAT128:
31744 case FLOAT_FTYPE_FLOAT:
31745 case INT_FTYPE_INT:
31746 case UINT64_FTYPE_INT:
31747 case UINT16_FTYPE_UINT16:
31748 case INT64_FTYPE_INT64:
31749 case INT64_FTYPE_V4SF:
31750 case INT64_FTYPE_V2DF:
31751 case INT_FTYPE_V16QI:
31752 case INT_FTYPE_V8QI:
31753 case INT_FTYPE_V8SF:
31754 case INT_FTYPE_V4DF:
31755 case INT_FTYPE_V4SF:
31756 case INT_FTYPE_V2DF:
31757 case INT_FTYPE_V32QI:
31758 case V16QI_FTYPE_V16QI:
31759 case V8SI_FTYPE_V8SF:
31760 case V8SI_FTYPE_V4SI:
31761 case V8HI_FTYPE_V8HI:
31762 case V8HI_FTYPE_V16QI:
31763 case V8QI_FTYPE_V8QI:
31764 case V8SF_FTYPE_V8SF:
31765 case V8SF_FTYPE_V8SI:
31766 case V8SF_FTYPE_V4SF:
31767 case V8SF_FTYPE_V8HI:
31768 case V4SI_FTYPE_V4SI:
31769 case V4SI_FTYPE_V16QI:
31770 case V4SI_FTYPE_V4SF:
31771 case V4SI_FTYPE_V8SI:
31772 case V4SI_FTYPE_V8HI:
31773 case V4SI_FTYPE_V4DF:
31774 case V4SI_FTYPE_V2DF:
31775 case V4HI_FTYPE_V4HI:
31776 case V4DF_FTYPE_V4DF:
31777 case V4DF_FTYPE_V4SI:
31778 case V4DF_FTYPE_V4SF:
31779 case V4DF_FTYPE_V2DF:
31780 case V4SF_FTYPE_V4SF:
31781 case V4SF_FTYPE_V4SI:
31782 case V4SF_FTYPE_V8SF:
31783 case V4SF_FTYPE_V4DF:
31784 case V4SF_FTYPE_V8HI:
31785 case V4SF_FTYPE_V2DF:
31786 case V2DI_FTYPE_V2DI:
31787 case V2DI_FTYPE_V16QI:
31788 case V2DI_FTYPE_V8HI:
31789 case V2DI_FTYPE_V4SI:
31790 case V2DF_FTYPE_V2DF:
31791 case V2DF_FTYPE_V4SI:
31792 case V2DF_FTYPE_V4DF:
31793 case V2DF_FTYPE_V4SF:
31794 case V2DF_FTYPE_V2SI:
31795 case V2SI_FTYPE_V2SI:
31796 case V2SI_FTYPE_V4SF:
31797 case V2SI_FTYPE_V2SF:
31798 case V2SI_FTYPE_V2DF:
31799 case V2SF_FTYPE_V2SF:
31800 case V2SF_FTYPE_V2SI:
31801 case V32QI_FTYPE_V32QI:
31802 case V32QI_FTYPE_V16QI:
31803 case V16HI_FTYPE_V16HI:
31804 case V16HI_FTYPE_V8HI:
31805 case V8SI_FTYPE_V8SI:
31806 case V16HI_FTYPE_V16QI:
31807 case V8SI_FTYPE_V16QI:
31808 case V4DI_FTYPE_V16QI:
31809 case V8SI_FTYPE_V8HI:
31810 case V4DI_FTYPE_V8HI:
31811 case V4DI_FTYPE_V4SI:
31812 case V4DI_FTYPE_V2DI:
31813 nargs = 1;
31814 break;
31815 case V4SF_FTYPE_V4SF_VEC_MERGE:
31816 case V2DF_FTYPE_V2DF_VEC_MERGE:
31817 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
31818 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
31819 case V16QI_FTYPE_V16QI_V16QI:
31820 case V16QI_FTYPE_V8HI_V8HI:
31821 case V8QI_FTYPE_V8QI_V8QI:
31822 case V8QI_FTYPE_V4HI_V4HI:
31823 case V8HI_FTYPE_V8HI_V8HI:
31824 case V8HI_FTYPE_V16QI_V16QI:
31825 case V8HI_FTYPE_V4SI_V4SI:
31826 case V8SF_FTYPE_V8SF_V8SF:
31827 case V8SF_FTYPE_V8SF_V8SI:
31828 case V4SI_FTYPE_V4SI_V4SI:
31829 case V4SI_FTYPE_V8HI_V8HI:
31830 case V4SI_FTYPE_V4SF_V4SF:
31831 case V4SI_FTYPE_V2DF_V2DF:
31832 case V4HI_FTYPE_V4HI_V4HI:
31833 case V4HI_FTYPE_V8QI_V8QI:
31834 case V4HI_FTYPE_V2SI_V2SI:
31835 case V4DF_FTYPE_V4DF_V4DF:
31836 case V4DF_FTYPE_V4DF_V4DI:
31837 case V4SF_FTYPE_V4SF_V4SF:
31838 case V4SF_FTYPE_V4SF_V4SI:
31839 case V4SF_FTYPE_V4SF_V2SI:
31840 case V4SF_FTYPE_V4SF_V2DF:
31841 case V4SF_FTYPE_V4SF_DI:
31842 case V4SF_FTYPE_V4SF_SI:
31843 case V2DI_FTYPE_V2DI_V2DI:
31844 case V2DI_FTYPE_V16QI_V16QI:
31845 case V2DI_FTYPE_V4SI_V4SI:
31846 case V2UDI_FTYPE_V4USI_V4USI:
31847 case V2DI_FTYPE_V2DI_V16QI:
31848 case V2DI_FTYPE_V2DF_V2DF:
31849 case V2SI_FTYPE_V2SI_V2SI:
31850 case V2SI_FTYPE_V4HI_V4HI:
31851 case V2SI_FTYPE_V2SF_V2SF:
31852 case V2DF_FTYPE_V2DF_V2DF:
31853 case V2DF_FTYPE_V2DF_V4SF:
31854 case V2DF_FTYPE_V2DF_V2DI:
31855 case V2DF_FTYPE_V2DF_DI:
31856 case V2DF_FTYPE_V2DF_SI:
31857 case V2SF_FTYPE_V2SF_V2SF:
31858 case V1DI_FTYPE_V1DI_V1DI:
31859 case V1DI_FTYPE_V8QI_V8QI:
31860 case V1DI_FTYPE_V2SI_V2SI:
31861 case V32QI_FTYPE_V16HI_V16HI:
31862 case V16HI_FTYPE_V8SI_V8SI:
31863 case V32QI_FTYPE_V32QI_V32QI:
31864 case V16HI_FTYPE_V32QI_V32QI:
31865 case V16HI_FTYPE_V16HI_V16HI:
31866 case V8SI_FTYPE_V4DF_V4DF:
31867 case V8SI_FTYPE_V8SI_V8SI:
31868 case V8SI_FTYPE_V16HI_V16HI:
31869 case V4DI_FTYPE_V4DI_V4DI:
31870 case V4DI_FTYPE_V8SI_V8SI:
31871 case V4UDI_FTYPE_V8USI_V8USI:
31872 if (comparison == UNKNOWN)
31873 return ix86_expand_binop_builtin (icode, exp, target);
31874 nargs = 2;
31875 break;
31876 case V4SF_FTYPE_V4SF_V4SF_SWAP:
31877 case V2DF_FTYPE_V2DF_V2DF_SWAP:
31878 gcc_assert (comparison != UNKNOWN);
31879 nargs = 2;
31880 swap = true;
31881 break;
31882 case V16HI_FTYPE_V16HI_V8HI_COUNT:
31883 case V16HI_FTYPE_V16HI_SI_COUNT:
31884 case V8SI_FTYPE_V8SI_V4SI_COUNT:
31885 case V8SI_FTYPE_V8SI_SI_COUNT:
31886 case V4DI_FTYPE_V4DI_V2DI_COUNT:
31887 case V4DI_FTYPE_V4DI_INT_COUNT:
31888 case V8HI_FTYPE_V8HI_V8HI_COUNT:
31889 case V8HI_FTYPE_V8HI_SI_COUNT:
31890 case V4SI_FTYPE_V4SI_V4SI_COUNT:
31891 case V4SI_FTYPE_V4SI_SI_COUNT:
31892 case V4HI_FTYPE_V4HI_V4HI_COUNT:
31893 case V4HI_FTYPE_V4HI_SI_COUNT:
31894 case V2DI_FTYPE_V2DI_V2DI_COUNT:
31895 case V2DI_FTYPE_V2DI_SI_COUNT:
31896 case V2SI_FTYPE_V2SI_V2SI_COUNT:
31897 case V2SI_FTYPE_V2SI_SI_COUNT:
31898 case V1DI_FTYPE_V1DI_V1DI_COUNT:
31899 case V1DI_FTYPE_V1DI_SI_COUNT:
31900 nargs = 2;
31901 last_arg_count = true;
31902 break;
31903 case UINT64_FTYPE_UINT64_UINT64:
31904 case UINT_FTYPE_UINT_UINT:
31905 case UINT_FTYPE_UINT_USHORT:
31906 case UINT_FTYPE_UINT_UCHAR:
31907 case UINT16_FTYPE_UINT16_INT:
31908 case UINT8_FTYPE_UINT8_INT:
31909 nargs = 2;
31910 break;
31911 case V2DI_FTYPE_V2DI_INT_CONVERT:
31912 nargs = 2;
31913 rmode = V1TImode;
31914 nargs_constant = 1;
31915 break;
31916 case V4DI_FTYPE_V4DI_INT_CONVERT:
31917 nargs = 2;
31918 rmode = V2TImode;
31919 nargs_constant = 1;
31920 break;
31921 case V8HI_FTYPE_V8HI_INT:
31922 case V8HI_FTYPE_V8SF_INT:
31923 case V8HI_FTYPE_V4SF_INT:
31924 case V8SF_FTYPE_V8SF_INT:
31925 case V4SI_FTYPE_V4SI_INT:
31926 case V4SI_FTYPE_V8SI_INT:
31927 case V4HI_FTYPE_V4HI_INT:
31928 case V4DF_FTYPE_V4DF_INT:
31929 case V4SF_FTYPE_V4SF_INT:
31930 case V4SF_FTYPE_V8SF_INT:
31931 case V2DI_FTYPE_V2DI_INT:
31932 case V2DF_FTYPE_V2DF_INT:
31933 case V2DF_FTYPE_V4DF_INT:
31934 case V16HI_FTYPE_V16HI_INT:
31935 case V8SI_FTYPE_V8SI_INT:
31936 case V4DI_FTYPE_V4DI_INT:
31937 case V2DI_FTYPE_V4DI_INT:
31938 nargs = 2;
31939 nargs_constant = 1;
31940 break;
31941 case V16QI_FTYPE_V16QI_V16QI_V16QI:
31942 case V8SF_FTYPE_V8SF_V8SF_V8SF:
31943 case V4DF_FTYPE_V4DF_V4DF_V4DF:
31944 case V4SF_FTYPE_V4SF_V4SF_V4SF:
31945 case V2DF_FTYPE_V2DF_V2DF_V2DF:
31946 case V32QI_FTYPE_V32QI_V32QI_V32QI:
31947 nargs = 3;
31948 break;
31949 case V32QI_FTYPE_V32QI_V32QI_INT:
31950 case V16HI_FTYPE_V16HI_V16HI_INT:
31951 case V16QI_FTYPE_V16QI_V16QI_INT:
31952 case V4DI_FTYPE_V4DI_V4DI_INT:
31953 case V8HI_FTYPE_V8HI_V8HI_INT:
31954 case V8SI_FTYPE_V8SI_V8SI_INT:
31955 case V8SI_FTYPE_V8SI_V4SI_INT:
31956 case V8SF_FTYPE_V8SF_V8SF_INT:
31957 case V8SF_FTYPE_V8SF_V4SF_INT:
31958 case V4SI_FTYPE_V4SI_V4SI_INT:
31959 case V4DF_FTYPE_V4DF_V4DF_INT:
31960 case V4DF_FTYPE_V4DF_V2DF_INT:
31961 case V4SF_FTYPE_V4SF_V4SF_INT:
31962 case V2DI_FTYPE_V2DI_V2DI_INT:
31963 case V4DI_FTYPE_V4DI_V2DI_INT:
31964 case V2DF_FTYPE_V2DF_V2DF_INT:
31965 nargs = 3;
31966 nargs_constant = 1;
31967 break;
31968 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
31969 nargs = 3;
31970 rmode = V4DImode;
31971 nargs_constant = 1;
31972 break;
31973 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
31974 nargs = 3;
31975 rmode = V2DImode;
31976 nargs_constant = 1;
31977 break;
31978 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
31979 nargs = 3;
31980 rmode = DImode;
31981 nargs_constant = 1;
31982 break;
31983 case V2DI_FTYPE_V2DI_UINT_UINT:
31984 nargs = 3;
31985 nargs_constant = 2;
31986 break;
31987 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
31988 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
31989 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
31990 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
31991 nargs = 4;
31992 nargs_constant = 1;
31993 break;
31994 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
31995 nargs = 4;
31996 nargs_constant = 2;
31997 break;
31998 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
31999 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
32000 nargs = 4;
32001 break;
32002 default:
32003 gcc_unreachable ();
32006 gcc_assert (nargs <= ARRAY_SIZE (args));
32008 if (comparison != UNKNOWN)
32010 gcc_assert (nargs == 2);
32011 return ix86_expand_sse_compare (d, exp, target, swap);
32014 if (rmode == VOIDmode || rmode == tmode)
32016 if (optimize
32017 || target == 0
32018 || GET_MODE (target) != tmode
32019 || !insn_p->operand[0].predicate (target, tmode))
32020 target = gen_reg_rtx (tmode);
32021 real_target = target;
32023 else
32025 real_target = gen_reg_rtx (tmode);
32026 target = simplify_gen_subreg (rmode, real_target, tmode, 0);
32029 for (i = 0; i < nargs; i++)
32031 tree arg = CALL_EXPR_ARG (exp, i);
32032 rtx op = expand_normal (arg);
32033 enum machine_mode mode = insn_p->operand[i + 1].mode;
32034 bool match = insn_p->operand[i + 1].predicate (op, mode);
32036 if (last_arg_count && (i + 1) == nargs)
32038 /* SIMD shift insns take either an 8-bit immediate or a
32039 register as the count, but the builtin functions take an int as
32040 the count. If the count doesn't match, we put it in a register. */
32041 if (!match)
32043 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
32044 if (!insn_p->operand[i + 1].predicate (op, mode))
32045 op = copy_to_reg (op);
32048 else if ((nargs - i) <= nargs_constant)
32050 if (!match)
32051 switch (icode)
32053 case CODE_FOR_avx2_inserti128:
32054 case CODE_FOR_avx2_extracti128:
32055 error ("the last argument must be an 1-bit immediate");
32056 return const0_rtx;
32058 case CODE_FOR_sse4_1_roundsd:
32059 case CODE_FOR_sse4_1_roundss:
32061 case CODE_FOR_sse4_1_roundpd:
32062 case CODE_FOR_sse4_1_roundps:
32063 case CODE_FOR_avx_roundpd256:
32064 case CODE_FOR_avx_roundps256:
32066 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
32067 case CODE_FOR_sse4_1_roundps_sfix:
32068 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
32069 case CODE_FOR_avx_roundps_sfix256:
32071 case CODE_FOR_sse4_1_blendps:
32072 case CODE_FOR_avx_blendpd256:
32073 case CODE_FOR_avx_vpermilv4df:
32074 error ("the last argument must be a 4-bit immediate");
32075 return const0_rtx;
32077 case CODE_FOR_sse4_1_blendpd:
32078 case CODE_FOR_avx_vpermilv2df:
32079 case CODE_FOR_xop_vpermil2v2df3:
32080 case CODE_FOR_xop_vpermil2v4sf3:
32081 case CODE_FOR_xop_vpermil2v4df3:
32082 case CODE_FOR_xop_vpermil2v8sf3:
32083 error ("the last argument must be a 2-bit immediate");
32084 return const0_rtx;
32086 case CODE_FOR_avx_vextractf128v4df:
32087 case CODE_FOR_avx_vextractf128v8sf:
32088 case CODE_FOR_avx_vextractf128v8si:
32089 case CODE_FOR_avx_vinsertf128v4df:
32090 case CODE_FOR_avx_vinsertf128v8sf:
32091 case CODE_FOR_avx_vinsertf128v8si:
32092 error ("the last argument must be a 1-bit immediate");
32093 return const0_rtx;
32095 case CODE_FOR_avx_vmcmpv2df3:
32096 case CODE_FOR_avx_vmcmpv4sf3:
32097 case CODE_FOR_avx_cmpv2df3:
32098 case CODE_FOR_avx_cmpv4sf3:
32099 case CODE_FOR_avx_cmpv4df3:
32100 case CODE_FOR_avx_cmpv8sf3:
32101 error ("the last argument must be a 5-bit immediate");
32102 return const0_rtx;
32104 default:
32105 switch (nargs_constant)
32107 case 2:
32108 if ((nargs - i) == nargs_constant)
32110 error ("the next to last argument must be an 8-bit immediate");
32111 break;
32113 case 1:
32114 error ("the last argument must be an 8-bit immediate");
32115 break;
32116 default:
32117 gcc_unreachable ();
32119 return const0_rtx;
32122 else
32124 if (VECTOR_MODE_P (mode))
32125 op = safe_vector_operand (op, mode);
32127 /* If we aren't optimizing, only allow one memory operand to
32128 be generated. */
32129 if (memory_operand (op, mode))
32130 num_memory++;
32132 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
32134 if (optimize || !match || num_memory > 1)
32135 op = copy_to_mode_reg (mode, op);
32137 else
32139 op = copy_to_reg (op);
32140 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
32144 args[i].op = op;
32145 args[i].mode = mode;
32148 switch (nargs)
32150 case 1:
32151 pat = GEN_FCN (icode) (real_target, args[0].op);
32152 break;
32153 case 2:
32154 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
32155 break;
32156 case 3:
32157 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
32158 args[2].op);
32159 break;
32160 case 4:
32161 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
32162 args[2].op, args[3].op);
32163 break;
32164 default:
32165 gcc_unreachable ();
32168 if (! pat)
32169 return 0;
32171 emit_insn (pat);
32172 return target;
32175 /* Subroutine of ix86_expand_builtin to take care of special insns
32176 with variable number of operands. */
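/* Orientation (assumed examples): the "load" class covers builtins such
   as __builtin_ia32_loadups (V4SF_FTYPE_PCFLOAT, behind _mm_loadu_ps),
   where a pointer argument becomes a MEM source; the "store" class covers
   e.g. __builtin_ia32_movntps (VOID_FTYPE_PFLOAT_V4SF, behind
   _mm_stream_ps), where the MEM built from the pointer is the target and
   the builtin's own value is ignored.  */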
32178 static rtx
32179 ix86_expand_special_args_builtin (const struct builtin_description *d,
32180 tree exp, rtx target)
32182 tree arg;
32183 rtx pat, op;
32184 unsigned int i, nargs, arg_adjust, memory;
32185 struct
32187 rtx op;
32188 enum machine_mode mode;
32189 } args[3];
32190 enum insn_code icode = d->icode;
32191 bool last_arg_constant = false;
32192 const struct insn_data_d *insn_p = &insn_data[icode];
32193 enum machine_mode tmode = insn_p->operand[0].mode;
32194 enum { load, store } klass;
32196 switch ((enum ix86_builtin_func_type) d->flag)
32198 case VOID_FTYPE_VOID:
32199 emit_insn (GEN_FCN (icode) (target));
32200 return 0;
32201 case VOID_FTYPE_UINT64:
32202 case VOID_FTYPE_UNSIGNED:
32203 nargs = 0;
32204 klass = store;
32205 memory = 0;
32206 break;
32208 case INT_FTYPE_VOID:
32209 case UINT64_FTYPE_VOID:
32210 case UNSIGNED_FTYPE_VOID:
32211 nargs = 0;
32212 klass = load;
32213 memory = 0;
32214 break;
32215 case UINT64_FTYPE_PUNSIGNED:
32216 case V2DI_FTYPE_PV2DI:
32217 case V4DI_FTYPE_PV4DI:
32218 case V32QI_FTYPE_PCCHAR:
32219 case V16QI_FTYPE_PCCHAR:
32220 case V8SF_FTYPE_PCV4SF:
32221 case V8SF_FTYPE_PCFLOAT:
32222 case V4SF_FTYPE_PCFLOAT:
32223 case V4DF_FTYPE_PCV2DF:
32224 case V4DF_FTYPE_PCDOUBLE:
32225 case V2DF_FTYPE_PCDOUBLE:
32226 case VOID_FTYPE_PVOID:
32227 nargs = 1;
32228 klass = load;
32229 memory = 0;
32230 break;
32231 case VOID_FTYPE_PV2SF_V4SF:
32232 case VOID_FTYPE_PV4DI_V4DI:
32233 case VOID_FTYPE_PV2DI_V2DI:
32234 case VOID_FTYPE_PCHAR_V32QI:
32235 case VOID_FTYPE_PCHAR_V16QI:
32236 case VOID_FTYPE_PFLOAT_V8SF:
32237 case VOID_FTYPE_PFLOAT_V4SF:
32238 case VOID_FTYPE_PDOUBLE_V4DF:
32239 case VOID_FTYPE_PDOUBLE_V2DF:
32240 case VOID_FTYPE_PLONGLONG_LONGLONG:
32241 case VOID_FTYPE_PULONGLONG_ULONGLONG:
32242 case VOID_FTYPE_PINT_INT:
32243 nargs = 1;
32244 klass = store;
32245 /* Reserve memory operand for target. */
32246 memory = ARRAY_SIZE (args);
32247 break;
32248 case V4SF_FTYPE_V4SF_PCV2SF:
32249 case V2DF_FTYPE_V2DF_PCDOUBLE:
32250 nargs = 2;
32251 klass = load;
32252 memory = 1;
32253 break;
32254 case V8SF_FTYPE_PCV8SF_V8SI:
32255 case V4DF_FTYPE_PCV4DF_V4DI:
32256 case V4SF_FTYPE_PCV4SF_V4SI:
32257 case V2DF_FTYPE_PCV2DF_V2DI:
32258 case V8SI_FTYPE_PCV8SI_V8SI:
32259 case V4DI_FTYPE_PCV4DI_V4DI:
32260 case V4SI_FTYPE_PCV4SI_V4SI:
32261 case V2DI_FTYPE_PCV2DI_V2DI:
32262 nargs = 2;
32263 klass = load;
32264 memory = 0;
32265 break;
32266 case VOID_FTYPE_PV8SF_V8SI_V8SF:
32267 case VOID_FTYPE_PV4DF_V4DI_V4DF:
32268 case VOID_FTYPE_PV4SF_V4SI_V4SF:
32269 case VOID_FTYPE_PV2DF_V2DI_V2DF:
32270 case VOID_FTYPE_PV8SI_V8SI_V8SI:
32271 case VOID_FTYPE_PV4DI_V4DI_V4DI:
32272 case VOID_FTYPE_PV4SI_V4SI_V4SI:
32273 case VOID_FTYPE_PV2DI_V2DI_V2DI:
32274 nargs = 2;
32275 klass = store;
32276 /* Reserve memory operand for target. */
32277 memory = ARRAY_SIZE (args);
32278 break;
32279 case VOID_FTYPE_UINT_UINT_UINT:
32280 case VOID_FTYPE_UINT64_UINT_UINT:
32281 case UCHAR_FTYPE_UINT_UINT_UINT:
32282 case UCHAR_FTYPE_UINT64_UINT_UINT:
32283 nargs = 3;
32284 klass = load;
32285 memory = ARRAY_SIZE (args);
32286 last_arg_constant = true;
32287 break;
32288 default:
32289 gcc_unreachable ();
32292 gcc_assert (nargs <= ARRAY_SIZE (args));
32294 if (klass == store)
32296 arg = CALL_EXPR_ARG (exp, 0);
32297 op = expand_normal (arg);
32298 gcc_assert (target == 0);
32299 if (memory)
32301 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
32302 target = gen_rtx_MEM (tmode, op);
32304 else
32305 target = force_reg (tmode, op);
32306 arg_adjust = 1;
32308 else
32310 arg_adjust = 0;
32311 if (optimize
32312 || target == 0
32313 || !register_operand (target, tmode)
32314 || GET_MODE (target) != tmode)
32315 target = gen_reg_rtx (tmode);
32318 for (i = 0; i < nargs; i++)
32320 enum machine_mode mode = insn_p->operand[i + 1].mode;
32321 bool match;
32323 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
32324 op = expand_normal (arg);
32325 match = insn_p->operand[i + 1].predicate (op, mode);
32327 if (last_arg_constant && (i + 1) == nargs)
32329 if (!match)
32331 if (icode == CODE_FOR_lwp_lwpvalsi3
32332 || icode == CODE_FOR_lwp_lwpinssi3
32333 || icode == CODE_FOR_lwp_lwpvaldi3
32334 || icode == CODE_FOR_lwp_lwpinsdi3)
32335 error ("the last argument must be a 32-bit immediate");
32336 else
32337 error ("the last argument must be an 8-bit immediate");
32338 return const0_rtx;
32341 else
32343 if (i == memory)
32345 /* This must be the memory operand. */
32346 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
32347 op = gen_rtx_MEM (mode, op);
32348 gcc_assert (GET_MODE (op) == mode
32349 || GET_MODE (op) == VOIDmode);
32351 else
32353 /* This must be a register. */
32354 if (VECTOR_MODE_P (mode))
32355 op = safe_vector_operand (op, mode);
32357 gcc_assert (GET_MODE (op) == mode
32358 || GET_MODE (op) == VOIDmode);
32359 op = copy_to_mode_reg (mode, op);
32363 args[i].op = op;
32364 args[i].mode = mode;
32367 switch (nargs)
32369 case 0:
32370 pat = GEN_FCN (icode) (target);
32371 break;
32372 case 1:
32373 pat = GEN_FCN (icode) (target, args[0].op);
32374 break;
32375 case 2:
32376 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
32377 break;
32378 case 3:
32379 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
32380 break;
32381 default:
32382 gcc_unreachable ();
32385 if (! pat)
32386 return 0;
32387 emit_insn (pat);
32388 return klass == store ? 0 : target;
32391 /* Return the integer constant in ARG. Constrain it to be in the range
32392 of the subparts of VEC_TYPE; issue an error if not. */
32394 static int
32395 get_element_number (tree vec_type, tree arg)
32397 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
32399 if (!host_integerp (arg, 1)
32400 || (elt = tree_low_cst (arg, 1), elt > max))
32402 error ("selector must be an integer constant in the range 0..%wi", max);
32403 return 0;
32406 return elt;
32409 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
32410 ix86_expand_vector_init. We DO have language-level syntax for this, in
32411 the form of (type){ init-list }. Except that since we can't place emms
32412 instructions from inside the compiler, we can't allow the use of MMX
32413 registers unless the user explicitly asks for it. So we do *not* define
32414 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
32415 we have builtins invoked by mmintrin.h that give us license to emit
32416 these sorts of instructions. */
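/* A hedged example of the user-level entry point (assumed spelling from
   mmintrin.h): something like

     __m64 v = _mm_set_pi32 (hi, lo);

   bottoms out in one of these vec_init builtins rather than in a generic
   vector constructor, precisely so that MMX register use stays explicit.  */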
32418 static rtx
32419 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
32421 enum machine_mode tmode = TYPE_MODE (type);
32422 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
32423 int i, n_elt = GET_MODE_NUNITS (tmode);
32424 rtvec v = rtvec_alloc (n_elt);
32426 gcc_assert (VECTOR_MODE_P (tmode));
32427 gcc_assert (call_expr_nargs (exp) == n_elt);
32429 for (i = 0; i < n_elt; ++i)
32431 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
32432 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
32435 if (!target || !register_operand (target, tmode))
32436 target = gen_reg_rtx (tmode);
32438 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
32439 return target;
32442 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
32443 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
32444 had a language-level syntax for referencing vector elements. */
32446 static rtx
32447 ix86_expand_vec_ext_builtin (tree exp, rtx target)
32449 enum machine_mode tmode, mode0;
32450 tree arg0, arg1;
32451 int elt;
32452 rtx op0;
32454 arg0 = CALL_EXPR_ARG (exp, 0);
32455 arg1 = CALL_EXPR_ARG (exp, 1);
32457 op0 = expand_normal (arg0);
32458 elt = get_element_number (TREE_TYPE (arg0), arg1);
32460 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
32461 mode0 = TYPE_MODE (TREE_TYPE (arg0));
32462 gcc_assert (VECTOR_MODE_P (mode0));
32464 op0 = force_reg (mode0, op0);
32466 if (optimize || !target || !register_operand (target, tmode))
32467 target = gen_reg_rtx (tmode);
32469 ix86_expand_vector_extract (true, target, op0, elt);
32471 return target;
32474 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
32475 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
32476 a language-level syntax for referencing vector elements. */
32478 static rtx
32479 ix86_expand_vec_set_builtin (tree exp)
32481 enum machine_mode tmode, mode1;
32482 tree arg0, arg1, arg2;
32483 int elt;
32484 rtx op0, op1, target;
32486 arg0 = CALL_EXPR_ARG (exp, 0);
32487 arg1 = CALL_EXPR_ARG (exp, 1);
32488 arg2 = CALL_EXPR_ARG (exp, 2);
32490 tmode = TYPE_MODE (TREE_TYPE (arg0));
32491 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
32492 gcc_assert (VECTOR_MODE_P (tmode));
32494 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
32495 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
32496 elt = get_element_number (TREE_TYPE (arg0), arg2);
32498 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
32499 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
32501 op0 = force_reg (tmode, op0);
32502 op1 = force_reg (mode1, op1);
32504 /* OP0 is the source of these builtin functions and shouldn't be
32505 modified. Create a copy, use it and return it as target. */
32506 target = gen_reg_rtx (tmode);
32507 emit_move_insn (target, op0);
32508 ix86_expand_vector_set (true, target, op1, elt);
32510 return target;
32513 /* Expand an expression EXP that calls a built-in function,
32514 with result going to TARGET if that's convenient
32515 (and in mode MODE if that's convenient).
32516 SUBTARGET may be used as the target for computing one of EXP's operands.
32517 IGNORE is nonzero if the value is to be ignored. */
32519 static rtx
32520 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
32521 enum machine_mode mode, int ignore)
32523 const struct builtin_description *d;
32524 size_t i;
32525 enum insn_code icode;
32526 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
32527 tree arg0, arg1, arg2, arg3, arg4;
32528 rtx op0, op1, op2, op3, op4, pat, insn;
32529 enum machine_mode mode0, mode1, mode2, mode3, mode4;
32530 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
32532 /* For CPU builtins that can be folded, fold first and expand the fold. */
32533 switch (fcode)
32535 case IX86_BUILTIN_CPU_INIT:
32537 /* Make it call __cpu_indicator_init in libgcc. */
32538 tree call_expr, fndecl, type;
32539 type = build_function_type_list (integer_type_node, NULL_TREE);
32540 fndecl = build_fn_decl ("__cpu_indicator_init", type);
32541 call_expr = build_call_expr (fndecl, 0);
32542 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
32544 case IX86_BUILTIN_CPU_IS:
32545 case IX86_BUILTIN_CPU_SUPPORTS:
32547 tree arg0 = CALL_EXPR_ARG (exp, 0);
32548 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
32549 gcc_assert (fold_expr != NULL_TREE);
32550 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
32554 /* Determine whether the builtin function is available under the current ISA.
32555 Originally the builtin was not created if it wasn't applicable to the
32556 current ISA based on the command-line switches. With function-specific
32557 options, we need to check in the context of the function making the call
32558 whether it is supported. */
32559 if (ix86_builtins_isa[fcode].isa
32560 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
32562 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
32563 NULL, (enum fpmath_unit) 0, false);
32565 if (!opts)
32566 error ("%qE needs unknown isa option", fndecl);
32567 else
32569 gcc_assert (opts != NULL);
32570 error ("%qE needs isa option %s", fndecl, opts);
32571 free (opts);
32573 return const0_rtx;
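/* For illustration (an assumed example, not from this file): compiling

     __m256i add (__m256i x, __m256i y)
     {
       return __builtin_ia32_paddd256 (x, y);
     }

   without -mavx2 and without a target("avx2") attribute takes the path
   above and reports something like
     error: '__builtin_ia32_paddd256' needs isa option -mavx2
   where the exact option string comes from ix86_target_string.  */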
32576 switch (fcode)
32578 case IX86_BUILTIN_MASKMOVQ:
32579 case IX86_BUILTIN_MASKMOVDQU:
32580 icode = (fcode == IX86_BUILTIN_MASKMOVQ
32581 ? CODE_FOR_mmx_maskmovq
32582 : CODE_FOR_sse2_maskmovdqu);
32583 /* Note the arg order is different from the operand order. */
32584 arg1 = CALL_EXPR_ARG (exp, 0);
32585 arg2 = CALL_EXPR_ARG (exp, 1);
32586 arg0 = CALL_EXPR_ARG (exp, 2);
32587 op0 = expand_normal (arg0);
32588 op1 = expand_normal (arg1);
32589 op2 = expand_normal (arg2);
32590 mode0 = insn_data[icode].operand[0].mode;
32591 mode1 = insn_data[icode].operand[1].mode;
32592 mode2 = insn_data[icode].operand[2].mode;
32594 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32595 op0 = gen_rtx_MEM (mode1, op0);
32597 if (!insn_data[icode].operand[0].predicate (op0, mode0))
32598 op0 = copy_to_mode_reg (mode0, op0);
32599 if (!insn_data[icode].operand[1].predicate (op1, mode1))
32600 op1 = copy_to_mode_reg (mode1, op1);
32601 if (!insn_data[icode].operand[2].predicate (op2, mode2))
32602 op2 = copy_to_mode_reg (mode2, op2);
32603 pat = GEN_FCN (icode) (op0, op1, op2);
32604 if (! pat)
32605 return 0;
32606 emit_insn (pat);
32607 return 0;
32609 case IX86_BUILTIN_LDMXCSR:
32610 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
32611 target = assign_386_stack_local (SImode, SLOT_TEMP);
32612 emit_move_insn (target, op0);
32613 emit_insn (gen_sse_ldmxcsr (target));
32614 return 0;
32616 case IX86_BUILTIN_STMXCSR:
32617 target = assign_386_stack_local (SImode, SLOT_TEMP);
32618 emit_insn (gen_sse_stmxcsr (target));
32619 return copy_to_mode_reg (SImode, target);
32621 case IX86_BUILTIN_CLFLUSH:
32622 arg0 = CALL_EXPR_ARG (exp, 0);
32623 op0 = expand_normal (arg0);
32624 icode = CODE_FOR_sse2_clflush;
32625 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
32626 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32628 emit_insn (gen_sse2_clflush (op0));
32629 return 0;
32631 case IX86_BUILTIN_MONITOR:
32632 arg0 = CALL_EXPR_ARG (exp, 0);
32633 arg1 = CALL_EXPR_ARG (exp, 1);
32634 arg2 = CALL_EXPR_ARG (exp, 2);
32635 op0 = expand_normal (arg0);
32636 op1 = expand_normal (arg1);
32637 op2 = expand_normal (arg2);
32638 if (!REG_P (op0))
32639 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32640 if (!REG_P (op1))
32641 op1 = copy_to_mode_reg (SImode, op1);
32642 if (!REG_P (op2))
32643 op2 = copy_to_mode_reg (SImode, op2);
32644 emit_insn (ix86_gen_monitor (op0, op1, op2));
32645 return 0;
32647 case IX86_BUILTIN_MWAIT:
32648 arg0 = CALL_EXPR_ARG (exp, 0);
32649 arg1 = CALL_EXPR_ARG (exp, 1);
32650 op0 = expand_normal (arg0);
32651 op1 = expand_normal (arg1);
32652 if (!REG_P (op0))
32653 op0 = copy_to_mode_reg (SImode, op0);
32654 if (!REG_P (op1))
32655 op1 = copy_to_mode_reg (SImode, op1);
32656 emit_insn (gen_sse3_mwait (op0, op1));
32657 return 0;
32659 case IX86_BUILTIN_VEC_INIT_V2SI:
32660 case IX86_BUILTIN_VEC_INIT_V4HI:
32661 case IX86_BUILTIN_VEC_INIT_V8QI:
32662 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
32664 case IX86_BUILTIN_VEC_EXT_V2DF:
32665 case IX86_BUILTIN_VEC_EXT_V2DI:
32666 case IX86_BUILTIN_VEC_EXT_V4SF:
32667 case IX86_BUILTIN_VEC_EXT_V4SI:
32668 case IX86_BUILTIN_VEC_EXT_V8HI:
32669 case IX86_BUILTIN_VEC_EXT_V2SI:
32670 case IX86_BUILTIN_VEC_EXT_V4HI:
32671 case IX86_BUILTIN_VEC_EXT_V16QI:
32672 return ix86_expand_vec_ext_builtin (exp, target);
32674 case IX86_BUILTIN_VEC_SET_V2DI:
32675 case IX86_BUILTIN_VEC_SET_V4SF:
32676 case IX86_BUILTIN_VEC_SET_V4SI:
32677 case IX86_BUILTIN_VEC_SET_V8HI:
32678 case IX86_BUILTIN_VEC_SET_V4HI:
32679 case IX86_BUILTIN_VEC_SET_V16QI:
32680 return ix86_expand_vec_set_builtin (exp);
32682 case IX86_BUILTIN_INFQ:
32683 case IX86_BUILTIN_HUGE_VALQ:
32685 REAL_VALUE_TYPE inf;
32686 rtx tmp;
32688 real_inf (&inf);
32689 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
32691 tmp = validize_mem (force_const_mem (mode, tmp));
32693 if (target == 0)
32694 target = gen_reg_rtx (mode);
32696 emit_move_insn (target, tmp);
32697 return target;
32700 case IX86_BUILTIN_RDPMC:
32701 case IX86_BUILTIN_RDTSC:
32702 case IX86_BUILTIN_RDTSCP:
32704 op0 = gen_reg_rtx (DImode);
32705 op1 = gen_reg_rtx (DImode);
32707 if (fcode == IX86_BUILTIN_RDPMC)
32709 arg0 = CALL_EXPR_ARG (exp, 0);
32710 op2 = expand_normal (arg0);
32711 if (!register_operand (op2, SImode))
32712 op2 = copy_to_mode_reg (SImode, op2);
32714 insn = (TARGET_64BIT
32715 ? gen_rdpmc_rex64 (op0, op1, op2)
32716 : gen_rdpmc (op0, op2));
32717 emit_insn (insn);
32719 else if (fcode == IX86_BUILTIN_RDTSC)
32721 insn = (TARGET_64BIT
32722 ? gen_rdtsc_rex64 (op0, op1)
32723 : gen_rdtsc (op0));
32724 emit_insn (insn);
32726 else
32728 op2 = gen_reg_rtx (SImode);
32730 insn = (TARGET_64BIT
32731 ? gen_rdtscp_rex64 (op0, op1, op2)
32732 : gen_rdtscp (op0, op2));
32733 emit_insn (insn);
32735 arg0 = CALL_EXPR_ARG (exp, 0);
32736 op4 = expand_normal (arg0);
32737 if (!address_operand (op4, VOIDmode))
32739 op4 = convert_memory_address (Pmode, op4);
32740 op4 = copy_addr_to_reg (op4);
32742 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
32745 if (target == 0)
32747 /* mode is VOIDmode if __builtin_rd* has been called
32748 without an lhs. */
32749 if (mode == VOIDmode)
32750 return target;
32751 target = gen_reg_rtx (mode);
32754 if (TARGET_64BIT)
32756 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
32757 op1, 1, OPTAB_DIRECT);
32758 op0 = expand_simple_binop (DImode, IOR, op0, op1,
32759 op0, 1, OPTAB_DIRECT);
32762 emit_move_insn (target, op0);
32763 return target;
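/* As a worked example of the combine just above: on 64-bit targets the
   expanders hand back the counter as two registers holding the low and
   high 32 bits (op0 and op1 here), and the ASHIFT/IOR pair assembles

     result = op0 | (op1 << 32);

   so a timestamp of 0x0000001234567890 comes back as op0 = 0x34567890
   and op1 = 0x12 before being recombined.  */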
32765 case IX86_BUILTIN_FXSAVE:
32766 case IX86_BUILTIN_FXRSTOR:
32767 case IX86_BUILTIN_FXSAVE64:
32768 case IX86_BUILTIN_FXRSTOR64:
32769 switch (fcode)
32771 case IX86_BUILTIN_FXSAVE:
32772 icode = CODE_FOR_fxsave;
32773 break;
32774 case IX86_BUILTIN_FXRSTOR:
32775 icode = CODE_FOR_fxrstor;
32776 break;
32777 case IX86_BUILTIN_FXSAVE64:
32778 icode = CODE_FOR_fxsave64;
32779 break;
32780 case IX86_BUILTIN_FXRSTOR64:
32781 icode = CODE_FOR_fxrstor64;
32782 break;
32783 default:
32784 gcc_unreachable ();
32787 arg0 = CALL_EXPR_ARG (exp, 0);
32788 op0 = expand_normal (arg0);
32790 if (!address_operand (op0, VOIDmode))
32792 op0 = convert_memory_address (Pmode, op0);
32793 op0 = copy_addr_to_reg (op0);
32795 op0 = gen_rtx_MEM (BLKmode, op0);
32797 pat = GEN_FCN (icode) (op0);
32798 if (pat)
32799 emit_insn (pat);
32800 return 0;
32802 case IX86_BUILTIN_XSAVE:
32803 case IX86_BUILTIN_XRSTOR:
32804 case IX86_BUILTIN_XSAVE64:
32805 case IX86_BUILTIN_XRSTOR64:
32806 case IX86_BUILTIN_XSAVEOPT:
32807 case IX86_BUILTIN_XSAVEOPT64:
32808 arg0 = CALL_EXPR_ARG (exp, 0);
32809 arg1 = CALL_EXPR_ARG (exp, 1);
32810 op0 = expand_normal (arg0);
32811 op1 = expand_normal (arg1);
32813 if (!address_operand (op0, VOIDmode))
32815 op0 = convert_memory_address (Pmode, op0);
32816 op0 = copy_addr_to_reg (op0);
32818 op0 = gen_rtx_MEM (BLKmode, op0);
32820 op1 = force_reg (DImode, op1);
32822 if (TARGET_64BIT)
32824 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
32825 NULL, 1, OPTAB_DIRECT);
32826 switch (fcode)
32828 case IX86_BUILTIN_XSAVE:
32829 icode = CODE_FOR_xsave_rex64;
32830 break;
32831 case IX86_BUILTIN_XRSTOR:
32832 icode = CODE_FOR_xrstor_rex64;
32833 break;
32834 case IX86_BUILTIN_XSAVE64:
32835 icode = CODE_FOR_xsave64;
32836 break;
32837 case IX86_BUILTIN_XRSTOR64:
32838 icode = CODE_FOR_xrstor64;
32839 break;
32840 case IX86_BUILTIN_XSAVEOPT:
32841 icode = CODE_FOR_xsaveopt_rex64;
32842 break;
32843 case IX86_BUILTIN_XSAVEOPT64:
32844 icode = CODE_FOR_xsaveopt64;
32845 break;
32846 default:
32847 gcc_unreachable ();
32850 op2 = gen_lowpart (SImode, op2);
32851 op1 = gen_lowpart (SImode, op1);
32852 pat = GEN_FCN (icode) (op0, op1, op2);
32854 else
32856 switch (fcode)
32858 case IX86_BUILTIN_XSAVE:
32859 icode = CODE_FOR_xsave;
32860 break;
32861 case IX86_BUILTIN_XRSTOR:
32862 icode = CODE_FOR_xrstor;
32863 break;
32864 case IX86_BUILTIN_XSAVEOPT:
32865 icode = CODE_FOR_xsaveopt;
32866 break;
32867 default:
32868 gcc_unreachable ();
32870 pat = GEN_FCN (icode) (op0, op1);
32873 if (pat)
32874 emit_insn (pat);
32875 return 0;
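/* As a worked example of the mask handling above: the xsave family takes
   the requested-feature bitmap in EDX:EAX, so on 64-bit targets the
   DImode mask in op1 is split into

     eax = mask & 0xffffffff   (the SImode lowpart of op1)
     edx = mask >> 32          (op2, from the LSHIFTRT above)

   e.g. a mask of 0x0000000700000003 is passed as eax = 0x3, edx = 0x7.
   On 32-bit targets the DImode mask is handed to the pattern as a single
   operand.  */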
32877 case IX86_BUILTIN_LLWPCB:
32878 arg0 = CALL_EXPR_ARG (exp, 0);
32879 op0 = expand_normal (arg0);
32880 icode = CODE_FOR_lwp_llwpcb;
32881 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
32882 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32883 emit_insn (gen_lwp_llwpcb (op0));
32884 return 0;
32886 case IX86_BUILTIN_SLWPCB:
32887 icode = CODE_FOR_lwp_slwpcb;
32888 if (!target
32889 || !insn_data[icode].operand[0].predicate (target, Pmode))
32890 target = gen_reg_rtx (Pmode);
32891 emit_insn (gen_lwp_slwpcb (target));
32892 return target;
32894 case IX86_BUILTIN_BEXTRI32:
32895 case IX86_BUILTIN_BEXTRI64:
32896 arg0 = CALL_EXPR_ARG (exp, 0);
32897 arg1 = CALL_EXPR_ARG (exp, 1);
32898 op0 = expand_normal (arg0);
32899 op1 = expand_normal (arg1);
32900 icode = (fcode == IX86_BUILTIN_BEXTRI32
32901 ? CODE_FOR_tbm_bextri_si
32902 : CODE_FOR_tbm_bextri_di);
32903 if (!CONST_INT_P (op1))
32905 error ("last argument must be an immediate");
32906 return const0_rtx;
32908 else
32910 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
32911 unsigned char lsb_index = INTVAL (op1) & 0xFF;
32912 op1 = GEN_INT (length);
32913 op2 = GEN_INT (lsb_index);
32914 pat = GEN_FCN (icode) (target, op0, op1, op2);
32915 if (pat)
32916 emit_insn (pat);
32917 return target;
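/* For illustration of the decoding above: the TBM BEXTRI control word
   packs the bit-field start in bits [7:0] and its length in bits [15:8],
   so a call such as (builtin name shown for illustration; user code
   normally reaches it through tbmintrin.h)

     __builtin_ia32_bextri_u32 (x, 0x0804)

   extracts the 8-bit field of x starting at bit 4, i.e. length = 0x08 and
   lsb_index = 0x04 after the shifts above.  */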
32920 case IX86_BUILTIN_RDRAND16_STEP:
32921 icode = CODE_FOR_rdrandhi_1;
32922 mode0 = HImode;
32923 goto rdrand_step;
32925 case IX86_BUILTIN_RDRAND32_STEP:
32926 icode = CODE_FOR_rdrandsi_1;
32927 mode0 = SImode;
32928 goto rdrand_step;
32930 case IX86_BUILTIN_RDRAND64_STEP:
32931 icode = CODE_FOR_rdranddi_1;
32932 mode0 = DImode;
32934 rdrand_step:
32935 op0 = gen_reg_rtx (mode0);
32936 emit_insn (GEN_FCN (icode) (op0));
32938 arg0 = CALL_EXPR_ARG (exp, 0);
32939 op1 = expand_normal (arg0);
32940 if (!address_operand (op1, VOIDmode))
32942 op1 = convert_memory_address (Pmode, op1);
32943 op1 = copy_addr_to_reg (op1);
32945 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
32947 op1 = gen_reg_rtx (SImode);
32948 emit_move_insn (op1, CONST1_RTX (SImode));
32950 /* Emit an SImode conditional move. */
32951 if (mode0 == HImode)
32953 op2 = gen_reg_rtx (SImode);
32954 emit_insn (gen_zero_extendhisi2 (op2, op0));
32956 else if (mode0 == SImode)
32957 op2 = op0;
32958 else
32959 op2 = gen_rtx_SUBREG (SImode, op0, 0);
32961 if (target == 0)
32962 target = gen_reg_rtx (SImode);
32964 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
32965 const0_rtx);
32966 emit_insn (gen_rtx_SET (VOIDmode, target,
32967 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
32968 return target;
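/* For illustration, user code reaches the sequence above through wrappers
   such as (immintrin.h name shown for illustration)

     unsigned int val;
     int ok = _rdrand32_step (&val);

   The random value is stored through the pointer argument, and the
   SImode conditional move built above turns the carry flag reported by
   RDRAND into the builtin's 1 (success) / 0 (failure) return value.  */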
32970 case IX86_BUILTIN_RDSEED16_STEP:
32971 icode = CODE_FOR_rdseedhi_1;
32972 mode0 = HImode;
32973 goto rdseed_step;
32975 case IX86_BUILTIN_RDSEED32_STEP:
32976 icode = CODE_FOR_rdseedsi_1;
32977 mode0 = SImode;
32978 goto rdseed_step;
32980 case IX86_BUILTIN_RDSEED64_STEP:
32981 icode = CODE_FOR_rdseeddi_1;
32982 mode0 = DImode;
32984 rdseed_step:
32985 op0 = gen_reg_rtx (mode0);
32986 emit_insn (GEN_FCN (icode) (op0));
32988 arg0 = CALL_EXPR_ARG (exp, 0);
32989 op1 = expand_normal (arg0);
32990 if (!address_operand (op1, VOIDmode))
32992 op1 = convert_memory_address (Pmode, op1);
32993 op1 = copy_addr_to_reg (op1);
32995 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
32997 op2 = gen_reg_rtx (QImode);
32999 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
33000 const0_rtx);
33001 emit_insn (gen_rtx_SET (VOIDmode, op2, pat));
33003 if (target == 0)
33004 target = gen_reg_rtx (SImode);
33006 emit_insn (gen_zero_extendqisi2 (target, op2));
33007 return target;
33009 case IX86_BUILTIN_ADDCARRYX32:
33010 icode = TARGET_ADX ? CODE_FOR_adcxsi3 : CODE_FOR_addsi3_carry;
33011 mode0 = SImode;
33012 goto addcarryx;
33014 case IX86_BUILTIN_ADDCARRYX64:
33015 icode = TARGET_ADX ? CODE_FOR_adcxdi3 : CODE_FOR_adddi3_carry;
33016 mode0 = DImode;
33018 addcarryx:
33019 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
33020 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
33021 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
33022 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
33024 op0 = gen_reg_rtx (QImode);
33026 /* Generate CF from input operand. */
33027 op1 = expand_normal (arg0);
33028 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
33029 emit_insn (gen_addqi3_cc (op0, op1, constm1_rtx));
33031 /* Generate an ADCX instruction to compute X+Y+CF. */
33032 op2 = expand_normal (arg1);
33033 op3 = expand_normal (arg2);
33035 if (!REG_P (op2))
33036 op2 = copy_to_mode_reg (mode0, op2);
33037 if (!REG_P (op3))
33038 op3 = copy_to_mode_reg (mode0, op3);
33040 op0 = gen_reg_rtx (mode0);
33042 op4 = gen_rtx_REG (CCCmode, FLAGS_REG);
33043 pat = gen_rtx_LTU (VOIDmode, op4, const0_rtx);
33044 emit_insn (GEN_FCN (icode) (op0, op2, op3, op4, pat));
33046 /* Store the result. */
33047 op4 = expand_normal (arg3);
33048 if (!address_operand (op4, VOIDmode))
33050 op4 = convert_memory_address (Pmode, op4);
33051 op4 = copy_addr_to_reg (op4);
33053 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
33055 /* Return current CF value. */
33056 if (target == 0)
33057 target = gen_reg_rtx (QImode);
33059 PUT_MODE (pat, QImode);
33060 emit_insn (gen_rtx_SET (VOIDmode, target, pat));
33061 return target;
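/* For illustration, the sequence above implements semantics of the form
   (adxintrin.h wrapper name shown for illustration)

     unsigned int sum;
     unsigned char c_out = _addcarryx_u32 (c_in, a, b, &sum);

   i.e. sum = a + b + c_in with the carry-out returned from the builtin.
   The addqi3_cc on (c_in + -1) regenerates CF from the incoming carry
   byte, the adcx/add-with-carry pattern adds the two sources plus CF, and
   the final LTU test on CCCmode copies the resulting carry into the
   QImode return value.  */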
33063 case IX86_BUILTIN_GATHERSIV2DF:
33064 icode = CODE_FOR_avx2_gathersiv2df;
33065 goto gather_gen;
33066 case IX86_BUILTIN_GATHERSIV4DF:
33067 icode = CODE_FOR_avx2_gathersiv4df;
33068 goto gather_gen;
33069 case IX86_BUILTIN_GATHERDIV2DF:
33070 icode = CODE_FOR_avx2_gatherdiv2df;
33071 goto gather_gen;
33072 case IX86_BUILTIN_GATHERDIV4DF:
33073 icode = CODE_FOR_avx2_gatherdiv4df;
33074 goto gather_gen;
33075 case IX86_BUILTIN_GATHERSIV4SF:
33076 icode = CODE_FOR_avx2_gathersiv4sf;
33077 goto gather_gen;
33078 case IX86_BUILTIN_GATHERSIV8SF:
33079 icode = CODE_FOR_avx2_gathersiv8sf;
33080 goto gather_gen;
33081 case IX86_BUILTIN_GATHERDIV4SF:
33082 icode = CODE_FOR_avx2_gatherdiv4sf;
33083 goto gather_gen;
33084 case IX86_BUILTIN_GATHERDIV8SF:
33085 icode = CODE_FOR_avx2_gatherdiv8sf;
33086 goto gather_gen;
33087 case IX86_BUILTIN_GATHERSIV2DI:
33088 icode = CODE_FOR_avx2_gathersiv2di;
33089 goto gather_gen;
33090 case IX86_BUILTIN_GATHERSIV4DI:
33091 icode = CODE_FOR_avx2_gathersiv4di;
33092 goto gather_gen;
33093 case IX86_BUILTIN_GATHERDIV2DI:
33094 icode = CODE_FOR_avx2_gatherdiv2di;
33095 goto gather_gen;
33096 case IX86_BUILTIN_GATHERDIV4DI:
33097 icode = CODE_FOR_avx2_gatherdiv4di;
33098 goto gather_gen;
33099 case IX86_BUILTIN_GATHERSIV4SI:
33100 icode = CODE_FOR_avx2_gathersiv4si;
33101 goto gather_gen;
33102 case IX86_BUILTIN_GATHERSIV8SI:
33103 icode = CODE_FOR_avx2_gathersiv8si;
33104 goto gather_gen;
33105 case IX86_BUILTIN_GATHERDIV4SI:
33106 icode = CODE_FOR_avx2_gatherdiv4si;
33107 goto gather_gen;
33108 case IX86_BUILTIN_GATHERDIV8SI:
33109 icode = CODE_FOR_avx2_gatherdiv8si;
33110 goto gather_gen;
33111 case IX86_BUILTIN_GATHERALTSIV4DF:
33112 icode = CODE_FOR_avx2_gathersiv4df;
33113 goto gather_gen;
33114 case IX86_BUILTIN_GATHERALTDIV8SF:
33115 icode = CODE_FOR_avx2_gatherdiv8sf;
33116 goto gather_gen;
33117 case IX86_BUILTIN_GATHERALTSIV4DI:
33118 icode = CODE_FOR_avx2_gathersiv4di;
33119 goto gather_gen;
33120 case IX86_BUILTIN_GATHERALTDIV8SI:
33121 icode = CODE_FOR_avx2_gatherdiv8si;
33122 goto gather_gen;
33124 gather_gen:
33125 arg0 = CALL_EXPR_ARG (exp, 0);
33126 arg1 = CALL_EXPR_ARG (exp, 1);
33127 arg2 = CALL_EXPR_ARG (exp, 2);
33128 arg3 = CALL_EXPR_ARG (exp, 3);
33129 arg4 = CALL_EXPR_ARG (exp, 4);
33130 op0 = expand_normal (arg0);
33131 op1 = expand_normal (arg1);
33132 op2 = expand_normal (arg2);
33133 op3 = expand_normal (arg3);
33134 op4 = expand_normal (arg4);
33135 /* Note the arg order is different from the operand order. */
33136 mode0 = insn_data[icode].operand[1].mode;
33137 mode2 = insn_data[icode].operand[3].mode;
33138 mode3 = insn_data[icode].operand[4].mode;
33139 mode4 = insn_data[icode].operand[5].mode;
33141 if (target == NULL_RTX
33142 || GET_MODE (target) != insn_data[icode].operand[0].mode)
33143 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
33144 else
33145 subtarget = target;
33147 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
33148 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
33150 rtx half = gen_reg_rtx (V4SImode);
33151 if (!nonimmediate_operand (op2, V8SImode))
33152 op2 = copy_to_mode_reg (V8SImode, op2);
33153 emit_insn (gen_vec_extract_lo_v8si (half, op2));
33154 op2 = half;
33156 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
33157 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
33159 rtx (*gen) (rtx, rtx);
33160 rtx half = gen_reg_rtx (mode0);
33161 if (mode0 == V4SFmode)
33162 gen = gen_vec_extract_lo_v8sf;
33163 else
33164 gen = gen_vec_extract_lo_v8si;
33165 if (!nonimmediate_operand (op0, GET_MODE (op0)))
33166 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
33167 emit_insn (gen (half, op0));
33168 op0 = half;
33169 if (!nonimmediate_operand (op3, GET_MODE (op3)))
33170 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
33171 emit_insn (gen (half, op3));
33172 op3 = half;
33175 /* Force the memory operand to use only a base register here. But we
33176 don't want to do this for the memory operands of other builtin
33177 functions. */
33178 op1 = force_reg (Pmode, convert_to_mode (Pmode, op1, 1));
33180 if (!insn_data[icode].operand[1].predicate (op0, mode0))
33181 op0 = copy_to_mode_reg (mode0, op0);
33182 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
33183 op1 = copy_to_mode_reg (Pmode, op1);
33184 if (!insn_data[icode].operand[3].predicate (op2, mode2))
33185 op2 = copy_to_mode_reg (mode2, op2);
33186 if (!insn_data[icode].operand[4].predicate (op3, mode3))
33187 op3 = copy_to_mode_reg (mode3, op3);
33188 if (!insn_data[icode].operand[5].predicate (op4, mode4))
33190 error ("last argument must be scale 1, 2, 4, 8");
33191 return const0_rtx;
33194 /* Optimize. If mask is known to have all high bits set,
33195 replace op0 with pc_rtx to signal that the instruction
33196 overwrites the whole destination and doesn't use its
33197 previous contents. */
33198 if (optimize)
33200 if (TREE_CODE (arg3) == VECTOR_CST)
33202 unsigned int negative = 0;
33203 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
33205 tree cst = VECTOR_CST_ELT (arg3, i);
33206 if (TREE_CODE (cst) == INTEGER_CST
33207 && tree_int_cst_sign_bit (cst))
33208 negative++;
33209 else if (TREE_CODE (cst) == REAL_CST
33210 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
33211 negative++;
33213 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
33214 op0 = pc_rtx;
33216 else if (TREE_CODE (arg3) == SSA_NAME)
33218 /* Recognize also when mask is like:
33219 __v2df src = _mm_setzero_pd ();
33220 __v2df mask = _mm_cmpeq_pd (src, src);
33222 __v8sf src = _mm256_setzero_ps ();
33223 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
33224 as that is a cheaper way to load all ones into
33225 a register than having to load a constant from
33226 memory. */
33227 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
33228 if (is_gimple_call (def_stmt))
33230 tree fndecl = gimple_call_fndecl (def_stmt);
33231 if (fndecl
33232 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
33233 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
33235 case IX86_BUILTIN_CMPPD:
33236 case IX86_BUILTIN_CMPPS:
33237 case IX86_BUILTIN_CMPPD256:
33238 case IX86_BUILTIN_CMPPS256:
33239 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
33240 break;
33241 /* FALLTHRU */
33242 case IX86_BUILTIN_CMPEQPD:
33243 case IX86_BUILTIN_CMPEQPS:
33244 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
33245 && initializer_zerop (gimple_call_arg (def_stmt,
33246 1)))
33247 op0 = pc_rtx;
33248 break;
33249 default:
33250 break;
33256 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
33257 if (! pat)
33258 return const0_rtx;
33259 emit_insn (pat);
33261 if (fcode == IX86_BUILTIN_GATHERDIV8SF
33262 || fcode == IX86_BUILTIN_GATHERDIV8SI)
33264 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
33265 ? V4SFmode : V4SImode;
33266 if (target == NULL_RTX)
33267 target = gen_reg_rtx (tmode);
33268 if (tmode == V4SFmode)
33269 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
33270 else
33271 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
33273 else
33274 target = subtarget;
33276 return target;
33278 case IX86_BUILTIN_XABORT:
33279 icode = CODE_FOR_xabort;
33280 arg0 = CALL_EXPR_ARG (exp, 0);
33281 op0 = expand_normal (arg0);
33282 mode0 = insn_data[icode].operand[0].mode;
33283 if (!insn_data[icode].operand[0].predicate (op0, mode0))
33285 error ("the xabort's argument must be an 8-bit immediate");
33286 return const0_rtx;
33288 emit_insn (gen_xabort (op0));
33289 return 0;
33291 default:
33292 break;
33295 for (i = 0, d = bdesc_special_args;
33296 i < ARRAY_SIZE (bdesc_special_args);
33297 i++, d++)
33298 if (d->code == fcode)
33299 return ix86_expand_special_args_builtin (d, exp, target);
33301 for (i = 0, d = bdesc_args;
33302 i < ARRAY_SIZE (bdesc_args);
33303 i++, d++)
33304 if (d->code == fcode)
33305 switch (fcode)
33307 case IX86_BUILTIN_FABSQ:
33308 case IX86_BUILTIN_COPYSIGNQ:
33309 if (!TARGET_SSE)
33310 /* Emit a normal call if SSE isn't available. */
33311 return expand_call (exp, target, ignore);
33312 default:
33313 return ix86_expand_args_builtin (d, exp, target);
33316 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
33317 if (d->code == fcode)
33318 return ix86_expand_sse_comi (d, exp, target);
33320 for (i = 0, d = bdesc_pcmpestr;
33321 i < ARRAY_SIZE (bdesc_pcmpestr);
33322 i++, d++)
33323 if (d->code == fcode)
33324 return ix86_expand_sse_pcmpestr (d, exp, target);
33326 for (i = 0, d = bdesc_pcmpistr;
33327 i < ARRAY_SIZE (bdesc_pcmpistr);
33328 i++, d++)
33329 if (d->code == fcode)
33330 return ix86_expand_sse_pcmpistr (d, exp, target);
33332 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
33333 if (d->code == fcode)
33334 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
33335 (enum ix86_builtin_func_type)
33336 d->flag, d->comparison);
33338 gcc_unreachable ();
33341 /* Returns a function decl for a vectorized version of the builtin function
33342 with builtin function code FN and the result vector type TYPE, or NULL_TREE
33343 if it is not available. */
33345 static tree
33346 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
33347 tree type_in)
33349 enum machine_mode in_mode, out_mode;
33350 int in_n, out_n;
33351 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
33353 if (TREE_CODE (type_out) != VECTOR_TYPE
33354 || TREE_CODE (type_in) != VECTOR_TYPE
33355 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
33356 return NULL_TREE;
33358 out_mode = TYPE_MODE (TREE_TYPE (type_out));
33359 out_n = TYPE_VECTOR_SUBPARTS (type_out);
33360 in_mode = TYPE_MODE (TREE_TYPE (type_in));
33361 in_n = TYPE_VECTOR_SUBPARTS (type_in);
33363 switch (fn)
33365 case BUILT_IN_SQRT:
33366 if (out_mode == DFmode && in_mode == DFmode)
33368 if (out_n == 2 && in_n == 2)
33369 return ix86_builtins[IX86_BUILTIN_SQRTPD];
33370 else if (out_n == 4 && in_n == 4)
33371 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
33373 break;
33375 case BUILT_IN_SQRTF:
33376 if (out_mode == SFmode && in_mode == SFmode)
33378 if (out_n == 4 && in_n == 4)
33379 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
33380 else if (out_n == 8 && in_n == 8)
33381 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
33383 break;
33385 case BUILT_IN_IFLOOR:
33386 case BUILT_IN_LFLOOR:
33387 case BUILT_IN_LLFLOOR:
33388 /* The round insn does not trap on denormals. */
33389 if (flag_trapping_math || !TARGET_ROUND)
33390 break;
33392 if (out_mode == SImode && in_mode == DFmode)
33394 if (out_n == 4 && in_n == 2)
33395 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
33396 else if (out_n == 8 && in_n == 4)
33397 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
33399 break;
33401 case BUILT_IN_IFLOORF:
33402 case BUILT_IN_LFLOORF:
33403 case BUILT_IN_LLFLOORF:
33404 /* The round insn does not trap on denormals. */
33405 if (flag_trapping_math || !TARGET_ROUND)
33406 break;
33408 if (out_mode == SImode && in_mode == SFmode)
33410 if (out_n == 4 && in_n == 4)
33411 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
33412 else if (out_n == 8 && in_n == 8)
33413 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
33415 break;
33417 case BUILT_IN_ICEIL:
33418 case BUILT_IN_LCEIL:
33419 case BUILT_IN_LLCEIL:
33420 /* The round insn does not trap on denormals. */
33421 if (flag_trapping_math || !TARGET_ROUND)
33422 break;
33424 if (out_mode == SImode && in_mode == DFmode)
33426 if (out_n == 4 && in_n == 2)
33427 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
33428 else if (out_n == 8 && in_n == 4)
33429 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
33431 break;
33433 case BUILT_IN_ICEILF:
33434 case BUILT_IN_LCEILF:
33435 case BUILT_IN_LLCEILF:
33436 /* The round insn does not trap on denormals. */
33437 if (flag_trapping_math || !TARGET_ROUND)
33438 break;
33440 if (out_mode == SImode && in_mode == SFmode)
33442 if (out_n == 4 && in_n == 4)
33443 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
33444 else if (out_n == 8 && in_n == 8)
33445 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
33447 break;
33449 case BUILT_IN_IRINT:
33450 case BUILT_IN_LRINT:
33451 case BUILT_IN_LLRINT:
33452 if (out_mode == SImode && in_mode == DFmode)
33454 if (out_n == 4 && in_n == 2)
33455 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
33456 else if (out_n == 8 && in_n == 4)
33457 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
33459 break;
33461 case BUILT_IN_IRINTF:
33462 case BUILT_IN_LRINTF:
33463 case BUILT_IN_LLRINTF:
33464 if (out_mode == SImode && in_mode == SFmode)
33466 if (out_n == 4 && in_n == 4)
33467 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
33468 else if (out_n == 8 && in_n == 8)
33469 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
33471 break;
33473 case BUILT_IN_IROUND:
33474 case BUILT_IN_LROUND:
33475 case BUILT_IN_LLROUND:
33476 /* The round insn does not trap on denormals. */
33477 if (flag_trapping_math || !TARGET_ROUND)
33478 break;
33480 if (out_mode == SImode && in_mode == DFmode)
33482 if (out_n == 4 && in_n == 2)
33483 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
33484 else if (out_n == 8 && in_n == 4)
33485 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
33487 break;
33489 case BUILT_IN_IROUNDF:
33490 case BUILT_IN_LROUNDF:
33491 case BUILT_IN_LLROUNDF:
33492 /* The round insn does not trap on denormals. */
33493 if (flag_trapping_math || !TARGET_ROUND)
33494 break;
33496 if (out_mode == SImode && in_mode == SFmode)
33498 if (out_n == 4 && in_n == 4)
33499 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
33500 else if (out_n == 8 && in_n == 8)
33501 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
33503 break;
33505 case BUILT_IN_COPYSIGN:
33506 if (out_mode == DFmode && in_mode == DFmode)
33508 if (out_n == 2 && in_n == 2)
33509 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
33510 else if (out_n == 4 && in_n == 4)
33511 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
33513 break;
33515 case BUILT_IN_COPYSIGNF:
33516 if (out_mode == SFmode && in_mode == SFmode)
33518 if (out_n == 4 && in_n == 4)
33519 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
33520 else if (out_n == 8 && in_n == 8)
33521 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
33523 break;
33525 case BUILT_IN_FLOOR:
33526 /* The round insn does not trap on denormals. */
33527 if (flag_trapping_math || !TARGET_ROUND)
33528 break;
33530 if (out_mode == DFmode && in_mode == DFmode)
33532 if (out_n == 2 && in_n == 2)
33533 return ix86_builtins[IX86_BUILTIN_FLOORPD];
33534 else if (out_n == 4 && in_n == 4)
33535 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
33537 break;
33539 case BUILT_IN_FLOORF:
33540 /* The round insn does not trap on denormals. */
33541 if (flag_trapping_math || !TARGET_ROUND)
33542 break;
33544 if (out_mode == SFmode && in_mode == SFmode)
33546 if (out_n == 4 && in_n == 4)
33547 return ix86_builtins[IX86_BUILTIN_FLOORPS];
33548 else if (out_n == 8 && in_n == 8)
33549 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
33551 break;
33553 case BUILT_IN_CEIL:
33554 /* The round insn does not trap on denormals. */
33555 if (flag_trapping_math || !TARGET_ROUND)
33556 break;
33558 if (out_mode == DFmode && in_mode == DFmode)
33560 if (out_n == 2 && in_n == 2)
33561 return ix86_builtins[IX86_BUILTIN_CEILPD];
33562 else if (out_n == 4 && in_n == 4)
33563 return ix86_builtins[IX86_BUILTIN_CEILPD256];
33565 break;
33567 case BUILT_IN_CEILF:
33568 /* The round insn does not trap on denormals. */
33569 if (flag_trapping_math || !TARGET_ROUND)
33570 break;
33572 if (out_mode == SFmode && in_mode == SFmode)
33574 if (out_n == 4 && in_n == 4)
33575 return ix86_builtins[IX86_BUILTIN_CEILPS];
33576 else if (out_n == 8 && in_n == 8)
33577 return ix86_builtins[IX86_BUILTIN_CEILPS256];
33579 break;
33581 case BUILT_IN_TRUNC:
33582 /* The round insn does not trap on denormals. */
33583 if (flag_trapping_math || !TARGET_ROUND)
33584 break;
33586 if (out_mode == DFmode && in_mode == DFmode)
33588 if (out_n == 2 && in_n == 2)
33589 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
33590 else if (out_n == 4 && in_n == 4)
33591 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
33593 break;
33595 case BUILT_IN_TRUNCF:
33596 /* The round insn does not trap on denormals. */
33597 if (flag_trapping_math || !TARGET_ROUND)
33598 break;
33600 if (out_mode == SFmode && in_mode == SFmode)
33602 if (out_n == 4 && in_n == 4)
33603 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
33604 else if (out_n == 8 && in_n == 8)
33605 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
33607 break;
33609 case BUILT_IN_RINT:
33610 /* The round insn does not trap on denormals. */
33611 if (flag_trapping_math || !TARGET_ROUND)
33612 break;
33614 if (out_mode == DFmode && in_mode == DFmode)
33616 if (out_n == 2 && in_n == 2)
33617 return ix86_builtins[IX86_BUILTIN_RINTPD];
33618 else if (out_n == 4 && in_n == 4)
33619 return ix86_builtins[IX86_BUILTIN_RINTPD256];
33621 break;
33623 case BUILT_IN_RINTF:
33624 /* The round insn does not trap on denormals. */
33625 if (flag_trapping_math || !TARGET_ROUND)
33626 break;
33628 if (out_mode == SFmode && in_mode == SFmode)
33630 if (out_n == 4 && in_n == 4)
33631 return ix86_builtins[IX86_BUILTIN_RINTPS];
33632 else if (out_n == 8 && in_n == 8)
33633 return ix86_builtins[IX86_BUILTIN_RINTPS256];
33635 break;
33637 case BUILT_IN_ROUND:
33638 /* The round insn does not trap on denormals. */
33639 if (flag_trapping_math || !TARGET_ROUND)
33640 break;
33642 if (out_mode == DFmode && in_mode == DFmode)
33644 if (out_n == 2 && in_n == 2)
33645 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
33646 else if (out_n == 4 && in_n == 4)
33647 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
33649 break;
33651 case BUILT_IN_ROUNDF:
33652 /* The round insn does not trap on denormals. */
33653 if (flag_trapping_math || !TARGET_ROUND)
33654 break;
33656 if (out_mode == SFmode && in_mode == SFmode)
33658 if (out_n == 4 && in_n == 4)
33659 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
33660 else if (out_n == 8 && in_n == 8)
33661 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
33663 break;
33665 case BUILT_IN_FMA:
33666 if (out_mode == DFmode && in_mode == DFmode)
33668 if (out_n == 2 && in_n == 2)
33669 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
33670 if (out_n == 4 && in_n == 4)
33671 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
33673 break;
33675 case BUILT_IN_FMAF:
33676 if (out_mode == SFmode && in_mode == SFmode)
33678 if (out_n == 4 && in_n == 4)
33679 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
33680 if (out_n == 8 && in_n == 8)
33681 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
33683 break;
33685 default:
33686 break;
33689 /* Dispatch to a handler for a vectorization library. */
33690 if (ix86_veclib_handler)
33691 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
33692 type_in);
33694 return NULL_TREE;
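/* For illustration (an assumed example, not from this file): when the
   loop vectorizer asks for a vector variant of

     for (i = 0; i < n; i++)
       out[i] = __builtin_sqrt (in[i]);

   with 2-element DFmode vectors, the BUILT_IN_SQRT case above (out_mode
   == DFmode, out_n == in_n == 2) returns
   ix86_builtins[IX86_BUILTIN_SQRTPD]; the 4-element AVX case returns
   SQRTPD256 instead.  */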
33697 /* Handler for an SVML-style interface to
33698 a library with vectorized intrinsics. */
33700 static tree
33701 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
33703 char name[20];
33704 tree fntype, new_fndecl, args;
33705 unsigned arity;
33706 const char *bname;
33707 enum machine_mode el_mode, in_mode;
33708 int n, in_n;
33710 /* The SVML library is suitable for unsafe math only. */
33711 if (!flag_unsafe_math_optimizations)
33712 return NULL_TREE;
33714 el_mode = TYPE_MODE (TREE_TYPE (type_out));
33715 n = TYPE_VECTOR_SUBPARTS (type_out);
33716 in_mode = TYPE_MODE (TREE_TYPE (type_in));
33717 in_n = TYPE_VECTOR_SUBPARTS (type_in);
33718 if (el_mode != in_mode
33719 || n != in_n)
33720 return NULL_TREE;
33722 switch (fn)
33724 case BUILT_IN_EXP:
33725 case BUILT_IN_LOG:
33726 case BUILT_IN_LOG10:
33727 case BUILT_IN_POW:
33728 case BUILT_IN_TANH:
33729 case BUILT_IN_TAN:
33730 case BUILT_IN_ATAN:
33731 case BUILT_IN_ATAN2:
33732 case BUILT_IN_ATANH:
33733 case BUILT_IN_CBRT:
33734 case BUILT_IN_SINH:
33735 case BUILT_IN_SIN:
33736 case BUILT_IN_ASINH:
33737 case BUILT_IN_ASIN:
33738 case BUILT_IN_COSH:
33739 case BUILT_IN_COS:
33740 case BUILT_IN_ACOSH:
33741 case BUILT_IN_ACOS:
33742 if (el_mode != DFmode || n != 2)
33743 return NULL_TREE;
33744 break;
33746 case BUILT_IN_EXPF:
33747 case BUILT_IN_LOGF:
33748 case BUILT_IN_LOG10F:
33749 case BUILT_IN_POWF:
33750 case BUILT_IN_TANHF:
33751 case BUILT_IN_TANF:
33752 case BUILT_IN_ATANF:
33753 case BUILT_IN_ATAN2F:
33754 case BUILT_IN_ATANHF:
33755 case BUILT_IN_CBRTF:
33756 case BUILT_IN_SINHF:
33757 case BUILT_IN_SINF:
33758 case BUILT_IN_ASINHF:
33759 case BUILT_IN_ASINF:
33760 case BUILT_IN_COSHF:
33761 case BUILT_IN_COSF:
33762 case BUILT_IN_ACOSHF:
33763 case BUILT_IN_ACOSF:
33764 if (el_mode != SFmode || n != 4)
33765 return NULL_TREE;
33766 break;
33768 default:
33769 return NULL_TREE;
33772 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
33774 if (fn == BUILT_IN_LOGF)
33775 strcpy (name, "vmlsLn4");
33776 else if (fn == BUILT_IN_LOG)
33777 strcpy (name, "vmldLn2");
33778 else if (n == 4)
33780 sprintf (name, "vmls%s", bname+10);
33781 name[strlen (name)-1] = '4';
33783 else
33784 sprintf (name, "vmld%s2", bname+10);
33786 /* Convert to uppercase. */
33787 name[4] &= ~0x20;
33789 arity = 0;
33790 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
33791 args;
33792 args = TREE_CHAIN (args))
33793 arity++;
33795 if (arity == 1)
33796 fntype = build_function_type_list (type_out, type_in, NULL);
33797 else
33798 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
33800 /* Build a function declaration for the vectorized function. */
33801 new_fndecl = build_decl (BUILTINS_LOCATION,
33802 FUNCTION_DECL, get_identifier (name), fntype);
33803 TREE_PUBLIC (new_fndecl) = 1;
33804 DECL_EXTERNAL (new_fndecl) = 1;
33805 DECL_IS_NOVOPS (new_fndecl) = 1;
33806 TREE_READONLY (new_fndecl) = 1;
33808 return new_fndecl;
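/* As a worked example of the name mangling above: for BUILT_IN_SINF with
   four SFmode lanes, bname is "__builtin_sinf", so bname+10 is "sinf";
   "vmls%s" gives "vmlssinf", the trailing character is replaced by '4'
   ("vmlssin4"), and clearing bit 0x20 of name[4] capitalizes it to
   "vmlsSin4".  The two-lane DFmode BUILT_IN_SIN comes out as "vmldSin2",
   while the log functions are special-cased to "vmlsLn4" / "vmldLn2".  */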
33811 /* Handler for an ACML-style interface to
33812 a library with vectorized intrinsics. */
33814 static tree
33815 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
33817 char name[20] = "__vr.._";
33818 tree fntype, new_fndecl, args;
33819 unsigned arity;
33820 const char *bname;
33821 enum machine_mode el_mode, in_mode;
33822 int n, in_n;
33824 /* The ACML library is 64-bit only and suitable for unsafe math only, as
33825 it does not correctly support parts of IEEE arithmetic with the required
33826 precision, such as denormals. */
33827 if (!TARGET_64BIT
33828 || !flag_unsafe_math_optimizations)
33829 return NULL_TREE;
33831 el_mode = TYPE_MODE (TREE_TYPE (type_out));
33832 n = TYPE_VECTOR_SUBPARTS (type_out);
33833 in_mode = TYPE_MODE (TREE_TYPE (type_in));
33834 in_n = TYPE_VECTOR_SUBPARTS (type_in);
33835 if (el_mode != in_mode
33836 || n != in_n)
33837 return NULL_TREE;
33839 switch (fn)
33841 case BUILT_IN_SIN:
33842 case BUILT_IN_COS:
33843 case BUILT_IN_EXP:
33844 case BUILT_IN_LOG:
33845 case BUILT_IN_LOG2:
33846 case BUILT_IN_LOG10:
33847 name[4] = 'd';
33848 name[5] = '2';
33849 if (el_mode != DFmode
33850 || n != 2)
33851 return NULL_TREE;
33852 break;
33854 case BUILT_IN_SINF:
33855 case BUILT_IN_COSF:
33856 case BUILT_IN_EXPF:
33857 case BUILT_IN_POWF:
33858 case BUILT_IN_LOGF:
33859 case BUILT_IN_LOG2F:
33860 case BUILT_IN_LOG10F:
33861 name[4] = 's';
33862 name[5] = '4';
33863 if (el_mode != SFmode
33864 || n != 4)
33865 return NULL_TREE;
33866 break;
33868 default:
33869 return NULL_TREE;
33872 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
33873 sprintf (name + 7, "%s", bname+10);
33875 arity = 0;
33876 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
33877 args;
33878 args = TREE_CHAIN (args))
33879 arity++;
33881 if (arity == 1)
33882 fntype = build_function_type_list (type_out, type_in, NULL);
33883 else
33884 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
33886 /* Build a function declaration for the vectorized function. */
33887 new_fndecl = build_decl (BUILTINS_LOCATION,
33888 FUNCTION_DECL, get_identifier (name), fntype);
33889 TREE_PUBLIC (new_fndecl) = 1;
33890 DECL_EXTERNAL (new_fndecl) = 1;
33891 DECL_IS_NOVOPS (new_fndecl) = 1;
33892 TREE_READONLY (new_fndecl) = 1;
33894 return new_fndecl;
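/* As a worked example of the name mangling above: the template "__vr.._"
   has its two dots filled in from the element mode and width, and the
   scalar builtin's name (past the "__builtin_" prefix) is appended at
   offset 7.  BUILT_IN_SIN on two DFmode lanes therefore becomes
   "__vrd2_sin", and BUILT_IN_SINF on four SFmode lanes becomes
   "__vrs4_sinf".  */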
33897 /* Returns a decl of a function that implements gather load with
33898 memory type MEM_VECTYPE, index type INDEX_TYPE and scale SCALE.
33899 Return NULL_TREE if it is not available. */
33901 static tree
33902 ix86_vectorize_builtin_gather (const_tree mem_vectype,
33903 const_tree index_type, int scale)
33905 bool si;
33906 enum ix86_builtins code;
33908 if (! TARGET_AVX2)
33909 return NULL_TREE;
33911 if ((TREE_CODE (index_type) != INTEGER_TYPE
33912 && !POINTER_TYPE_P (index_type))
33913 || (TYPE_MODE (index_type) != SImode
33914 && TYPE_MODE (index_type) != DImode))
33915 return NULL_TREE;
33917 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
33918 return NULL_TREE;
33920 /* The v*gather* insns sign-extend the index to pointer mode. */
33921 if (TYPE_PRECISION (index_type) < POINTER_SIZE
33922 && TYPE_UNSIGNED (index_type))
33923 return NULL_TREE;
33925 if (scale <= 0
33926 || scale > 8
33927 || (scale & (scale - 1)) != 0)
33928 return NULL_TREE;
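/* As a worked example of the scale check above: (scale & (scale - 1)) != 0
   rejects anything that is not a power of two, so together with the
   0 < scale <= 8 bounds only 1, 2, 4 and 8 survive; e.g. scale == 3 gives
   3 & 2 == 2 (rejected) while scale == 4 gives 4 & 3 == 0 (accepted).  */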
33930 si = TYPE_MODE (index_type) == SImode;
33931 switch (TYPE_MODE (mem_vectype))
33933 case V2DFmode:
33934 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
33935 break;
33936 case V4DFmode:
33937 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
33938 break;
33939 case V2DImode:
33940 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
33941 break;
33942 case V4DImode:
33943 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
33944 break;
33945 case V4SFmode:
33946 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
33947 break;
33948 case V8SFmode:
33949 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
33950 break;
33951 case V4SImode:
33952 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
33953 break;
33954 case V8SImode:
33955 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
33956 break;
33957 default:
33958 return NULL_TREE;
33961 return ix86_builtins[code];
33964 /* Returns a decl for a target-specific builtin that implements the
33965 reciprocal of the function, or NULL_TREE if it is not available. */
33967 static tree
33968 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
33969 bool sqrt ATTRIBUTE_UNUSED)
33971 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
33972 && flag_finite_math_only && !flag_trapping_math
33973 && flag_unsafe_math_optimizations))
33974 return NULL_TREE;
33976 if (md_fn)
33977 /* Machine dependent builtins. */
33978 switch (fn)
33980 /* Vectorized version of sqrt to rsqrt conversion. */
33981 case IX86_BUILTIN_SQRTPS_NR:
33982 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
33984 case IX86_BUILTIN_SQRTPS_NR256:
33985 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
33987 default:
33988 return NULL_TREE;
33990 else
33991 /* Normal builtins. */
33992 switch (fn)
33994 /* Sqrt to rsqrt conversion. */
33995 case BUILT_IN_SQRTF:
33996 return ix86_builtins[IX86_BUILTIN_RSQRTF];
33998 default:
33999 return NULL_TREE;
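/* For illustration (an assumed example, not from this file): with
   -ffast-math (finite-only, non-trapping, unsafe math) and SSE math, a
   vectorized 1.0f / sqrtf (x) whose sqrt would use IX86_BUILTIN_SQRTPS_NR
   is instead pointed at IX86_BUILTIN_RSQRTPS_NR by the hook above; the
   _NR variants are the forms intended to be followed by a Newton-Raphson
   refinement step emitted elsewhere in this port.  */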
34003 /* Helper for avx_vpermilps256_operand et al. This is also used by
34004 the expansion functions to turn the parallel back into a mask.
34005 The return value is 0 for no match and the imm8+1 for a match. */
34007 int
34008 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
34010 unsigned i, nelt = GET_MODE_NUNITS (mode);
34011 unsigned mask = 0;
34012 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
34014 if (XVECLEN (par, 0) != (int) nelt)
34015 return 0;
34017 /* Validate that all of the elements are constants, and not totally
34018 out of range. Copy the data into an integral array to make the
34019 subsequent checks easier. */
34020 for (i = 0; i < nelt; ++i)
34022 rtx er = XVECEXP (par, 0, i);
34023 unsigned HOST_WIDE_INT ei;
34025 if (!CONST_INT_P (er))
34026 return 0;
34027 ei = INTVAL (er);
34028 if (ei >= nelt)
34029 return 0;
34030 ipar[i] = ei;
34033 switch (mode)
34035 case V4DFmode:
34036 /* In the 256-bit DFmode case, we can only move elements within
34037 a 128-bit lane. */
34038 for (i = 0; i < 2; ++i)
34040 if (ipar[i] >= 2)
34041 return 0;
34042 mask |= ipar[i] << i;
34044 for (i = 2; i < 4; ++i)
34046 if (ipar[i] < 2)
34047 return 0;
34048 mask |= (ipar[i] - 2) << i;
34050 break;
34052 case V8SFmode:
34053 /* In the 256-bit SFmode case, we have full freedom of movement
34054 within the low 128-bit lane, but the high 128-bit lane must
34055 mirror the exact same pattern. */
34056 for (i = 0; i < 4; ++i)
34057 if (ipar[i] + 4 != ipar[i + 4])
34058 return 0;
34059 nelt = 4;
34060 /* FALLTHRU */
34062 case V2DFmode:
34063 case V4SFmode:
34064 /* In the 128-bit case, we've full freedom in the placement of
34065 the elements from the source operand. */
34066 for (i = 0; i < nelt; ++i)
34067 mask |= ipar[i] << (i * (nelt / 2));
34068 break;
34070 default:
34071 gcc_unreachable ();
34074 /* Make sure success has a non-zero value by adding one. */
34075 return mask + 1;
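/* As a worked example of the reconstruction above: for a V4SFmode
   PARALLEL selecting elements (2 3 0 1), each index lands in a 2-bit
   field, giving

     mask = 2<<0 | 3<<2 | 0<<4 | 1<<6 = 0x4e

   and a return value of 0x4e + 1, so that a valid all-zero mask is still
   distinguishable from the 0 returned for "no match".  */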
34078 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
34079 the expansion functions to turn the parallel back into a mask.
34080 The return value is 0 for no match and the imm8+1 for a match. */
34082 int
34083 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
34085 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
34086 unsigned mask = 0;
34087 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
34089 if (XVECLEN (par, 0) != (int) nelt)
34090 return 0;
34092 /* Validate that all of the elements are constants, and not totally
34093 out of range. Copy the data into an integral array to make the
34094 subsequent checks easier. */
34095 for (i = 0; i < nelt; ++i)
34097 rtx er = XVECEXP (par, 0, i);
34098 unsigned HOST_WIDE_INT ei;
34100 if (!CONST_INT_P (er))
34101 return 0;
34102 ei = INTVAL (er);
34103 if (ei >= 2 * nelt)
34104 return 0;
34105 ipar[i] = ei;
34108 /* Validate that each half of the permute selects consecutive elements. */
34109 for (i = 0; i < nelt2 - 1; ++i)
34110 if (ipar[i] + 1 != ipar[i + 1])
34111 return 0;
34112 for (i = nelt2; i < nelt - 1; ++i)
34113 if (ipar[i] + 1 != ipar[i + 1])
34114 return 0;
34116 /* Reconstruct the mask. */
34117 for (i = 0; i < 2; ++i)
34119 unsigned e = ipar[i * nelt2];
34120 if (e % nelt2)
34121 return 0;
34122 e /= nelt2;
34123 mask |= e << (i * 4);
34126 /* Make sure success has a non-zero value by adding one. */
34127 return mask + 1;
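/* As a worked example of the reconstruction above: for a V4DFmode
   PARALLEL selecting elements (0 1 4 5), both halves are runs of
   consecutive elements; the low half starts at element 0 (lane 0 of the
   first operand) and the high half at element 4 (lane 0 of the second
   operand), giving

     mask = (0 / 2) << 0 | (4 / 2) << 4 = 0x20

   and a return value of 0x21.  */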
34130 /* Store OPERAND to the memory after reload is completed. This means
34131 that we can't easily use assign_stack_local. */
34132 rtx
34133 ix86_force_to_memory (enum machine_mode mode, rtx operand)
34135 rtx result;
34137 gcc_assert (reload_completed);
34138 if (ix86_using_red_zone ())
34140 result = gen_rtx_MEM (mode,
34141 gen_rtx_PLUS (Pmode,
34142 stack_pointer_rtx,
34143 GEN_INT (-RED_ZONE_SIZE)));
34144 emit_move_insn (result, operand);
34146 else if (TARGET_64BIT)
34148 switch (mode)
34150 case HImode:
34151 case SImode:
34152 operand = gen_lowpart (DImode, operand);
34153 /* FALLTHRU */
34154 case DImode:
34155 emit_insn (
34156 gen_rtx_SET (VOIDmode,
34157 gen_rtx_MEM (DImode,
34158 gen_rtx_PRE_DEC (DImode,
34159 stack_pointer_rtx)),
34160 operand));
34161 break;
34162 default:
34163 gcc_unreachable ();
34165 result = gen_rtx_MEM (mode, stack_pointer_rtx);
34167 else
34169 switch (mode)
34171 case DImode:
34173 rtx operands[2];
34174 split_double_mode (mode, &operand, 1, operands, operands + 1);
34175 emit_insn (
34176 gen_rtx_SET (VOIDmode,
34177 gen_rtx_MEM (SImode,
34178 gen_rtx_PRE_DEC (Pmode,
34179 stack_pointer_rtx)),
34180 operands[1]));
34181 emit_insn (
34182 gen_rtx_SET (VOIDmode,
34183 gen_rtx_MEM (SImode,
34184 gen_rtx_PRE_DEC (Pmode,
34185 stack_pointer_rtx)),
34186 operands[0]));
34188 break;
34189 case HImode:
34190 /* Store HImodes as SImodes. */
34191 operand = gen_lowpart (SImode, operand);
34192 /* FALLTHRU */
34193 case SImode:
34194 emit_insn (
34195 gen_rtx_SET (VOIDmode,
34196 gen_rtx_MEM (GET_MODE (operand),
34197 gen_rtx_PRE_DEC (SImode,
34198 stack_pointer_rtx)),
34199 operand));
34200 break;
34201 default:
34202 gcc_unreachable ();
34204 result = gen_rtx_MEM (mode, stack_pointer_rtx);
34206 return result;
34209 /* Free the operand from memory. */
34210 void
34211 ix86_free_from_memory (enum machine_mode mode)
34213 if (!ix86_using_red_zone ())
34215 int size;
34217 if (mode == DImode || TARGET_64BIT)
34218 size = 8;
34219 else
34220 size = 4;
34221 /* Use LEA to deallocate stack space. In peephole2 it will be converted
34222 to a pop or add instruction if registers are available. */
34223 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
34224 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
34225 GEN_INT (size))));
34229 /* Return a register priority for hard reg REGNO. */
34230 static int
34231 ix86_register_priority (int hard_regno)
34233 /* ebp and r13 as the base always want a displacement, and r12 as the
34234 base always wants an index. So discourage their use in an
34235 address. */
34236 if (hard_regno == R12_REG || hard_regno == R13_REG)
34237 return 0;
34238 if (hard_regno == BP_REG)
34239 return 1;
34240 /* New x86-64 int registers result in bigger code size. Discourage
34241 them. */
34242 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
34243 return 2;
34244 /* New x86-64 SSE registers result in bigger code size. Discourage
34245 them. */
34246 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
34247 return 2;
34248 /* Usage of AX register results in smaller code. Prefer it. */
34249 if (hard_regno == 0)
34250 return 4;
34251 return 3;
34254 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
34256 Put float CONST_DOUBLE in the constant pool instead of fp regs.
34257 QImode must go into class Q_REGS.
34258 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
34259 movdf to do mem-to-mem moves through integer regs. */
34261 static reg_class_t
34262 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
34264 enum machine_mode mode = GET_MODE (x);
34266 /* We're only allowed to return a subclass of CLASS. Many of the
34267 following checks fail for NO_REGS, so eliminate that early. */
34268 if (regclass == NO_REGS)
34269 return NO_REGS;
34271 /* All classes can load zeros. */
34272 if (x == CONST0_RTX (mode))
34273 return regclass;
34275 /* Force constants into memory if we are loading a (nonzero) constant into
34276 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
34277 instructions to load from a constant. */
34278 if (CONSTANT_P (x)
34279 && (MAYBE_MMX_CLASS_P (regclass)
34280 || MAYBE_SSE_CLASS_P (regclass)
34281 || MAYBE_MASK_CLASS_P (regclass)))
34282 return NO_REGS;
34284 /* Prefer SSE regs only, if we can use them for math. */
34285 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
34286 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
34288 /* Floating-point constants need more complex checks. */
34289 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
34291 /* General regs can load everything. */
34292 if (reg_class_subset_p (regclass, GENERAL_REGS))
34293 return regclass;
34295 /* Floats can load 0 and 1 plus some others. Note that we eliminated
34296 zero above. We only want to wind up preferring 80387 registers if
34297 we plan on doing computation with them. */
34298 if (TARGET_80387
34299 && standard_80387_constant_p (x) > 0)
34301 /* Limit class to non-sse. */
34302 if (regclass == FLOAT_SSE_REGS)
34303 return FLOAT_REGS;
34304 if (regclass == FP_TOP_SSE_REGS)
34305 return FP_TOP_REG;
34306 if (regclass == FP_SECOND_SSE_REGS)
34307 return FP_SECOND_REG;
34308 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
34309 return regclass;
34312 return NO_REGS;
34315 /* Generally when we see PLUS here, it's the function invariant
34316 (plus soft-fp const_int), which can only be computed into general
34317 regs. */
34318 if (GET_CODE (x) == PLUS)
34319 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
34321 /* QImode constants are easy to load, but non-constant QImode data
34322 must go into Q_REGS. */
34323 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
34325 if (reg_class_subset_p (regclass, Q_REGS))
34326 return regclass;
34327 if (reg_class_subset_p (Q_REGS, regclass))
34328 return Q_REGS;
34329 return NO_REGS;
34332 return regclass;
34335 /* Discourage putting floating-point values in SSE registers unless
34336 SSE math is being used, and likewise for the 387 registers. */
34337 static reg_class_t
34338 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
34340 enum machine_mode mode = GET_MODE (x);
34342 /* Restrict the output reload class to the register bank that we are doing
34343 math on. If we would like not to return a subset of CLASS, reject this
34344 alternative: if reload cannot do this, it will still use its choice. */
34345 mode = GET_MODE (x);
34346 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
34347 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
34349 if (X87_FLOAT_MODE_P (mode))
34351 if (regclass == FP_TOP_SSE_REGS)
34352 return FP_TOP_REG;
34353 else if (regclass == FP_SECOND_SSE_REGS)
34354 return FP_SECOND_REG;
34355 else
34356 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
34359 return regclass;
34362 static reg_class_t
34363 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
34364 enum machine_mode mode, secondary_reload_info *sri)
34366 /* Double-word spills from general registers to non-offsettable memory
34367 references (zero-extended addresses) require special handling. */
34368 if (TARGET_64BIT
34369 && MEM_P (x)
34370 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
34371 && INTEGER_CLASS_P (rclass)
34372 && !offsettable_memref_p (x))
34374 sri->icode = (in_p
34375 ? CODE_FOR_reload_noff_load
34376 : CODE_FOR_reload_noff_store);
34377 /* Add the cost of moving address to a temporary. */
34378 sri->extra_cost = 1;
34380 return NO_REGS;
34383 /* QImode spills from non-QI registers require an
34384 intermediate register on 32-bit targets. */
34385 if (mode == QImode
34386 && (MAYBE_MASK_CLASS_P (rclass)
34387 || (!TARGET_64BIT && !in_p
34388 && INTEGER_CLASS_P (rclass)
34389 && MAYBE_NON_Q_CLASS_P (rclass))))
34391 int regno;
34393 if (REG_P (x))
34394 regno = REGNO (x);
34395 else
34396 regno = -1;
34398 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
34399 regno = true_regnum (x);
34401 /* Return Q_REGS if the operand is in memory. */
34402 if (regno == -1)
34403 return Q_REGS;
34406 /* This condition handles the corner case where an expression involving
34407 pointers gets vectorized. We're trying to use the address of a
34408 stack slot as a vector initializer.
34410 (set (reg:V2DI 74 [ vect_cst_.2 ])
34411 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
34413 Eventually frame gets turned into sp+offset like this:
34415 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34416 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
34417 (const_int 392 [0x188]))))
34419 That later gets turned into:
34421 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34422 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
34423 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
34425 We'll have the following reload recorded:
34427 Reload 0: reload_in (DI) =
34428 (plus:DI (reg/f:DI 7 sp)
34429 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
34430 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34431 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
34432 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
34433 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34434 reload_reg_rtx: (reg:V2DI 22 xmm1)
34436 This isn't going to work, since SSE instructions can't handle scalar
34437 additions. Returning GENERAL_REGS forces the addition into an integer
34438 register, and reload can handle subsequent reloads without problems. */
34440 if (in_p && GET_CODE (x) == PLUS
34441 && SSE_CLASS_P (rclass)
34442 && SCALAR_INT_MODE_P (mode))
34443 return GENERAL_REGS;
34445 return NO_REGS;
34448 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
34450 static bool
34451 ix86_class_likely_spilled_p (reg_class_t rclass)
34453 switch (rclass)
34455 case AREG:
34456 case DREG:
34457 case CREG:
34458 case BREG:
34459 case AD_REGS:
34460 case SIREG:
34461 case DIREG:
34462 case SSE_FIRST_REG:
34463 case FP_TOP_REG:
34464 case FP_SECOND_REG:
34465 case BND_REGS:
34466 return true;
34468 default:
34469 break;
34472 return false;
34475 /* If we are copying between general and FP registers, we need a memory
34476 location. The same is true for SSE and MMX registers.
34478 To optimize register_move_cost performance, allow inline variant.
34480 The macro can't work reliably when one of the CLASSES is a class containing
34481 registers from multiple units (SSE, MMX, integer). We avoid this by never
34482 combining those units in a single alternative in the machine description.
34483 Ensure that this constraint holds to avoid unexpected surprises.
34485 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
34486 enforce these sanity checks. */
34488 static inline bool
34489 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
34490 enum machine_mode mode, int strict)
34492 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
34493 return false;
34494 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
34495 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
34496 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
34497 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
34498 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
34499 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
34501 gcc_assert (!strict || lra_in_progress);
34502 return true;
34505 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
34506 return true;
34508 /* ??? This is a lie. We do have moves between mmx/general and between
34509 mmx/sse2. But by saying we need secondary memory we discourage the
34510 register allocator from using the mmx registers unless needed. */
34511 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
34512 return true;
34514 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
34516 /* SSE1 doesn't have any direct moves from other classes. */
34517 if (!TARGET_SSE2)
34518 return true;
34520 /* If the target says that inter-unit moves are more expensive
34521 than moving through memory, then don't generate them. */
34522 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
34523 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
34524 return true;
34526 /* Between SSE and general, we have moves no larger than word size. */
34527 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34528 return true;
34531 return false;
34534 bool
34535 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
34536 enum machine_mode mode, int strict)
34538 return inline_secondary_memory_needed (class1, class2, mode, strict);
34541 /* Implement the TARGET_CLASS_MAX_NREGS hook.
34543 On the 80386, this is the size of MODE in words,
34544 except in the FP regs, where a single reg is always enough. */
34546 static unsigned char
34547 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
34549 if (MAYBE_INTEGER_CLASS_P (rclass))
34551 if (mode == XFmode)
34552 return (TARGET_64BIT ? 2 : 3);
34553 else if (mode == XCmode)
34554 return (TARGET_64BIT ? 4 : 6);
34555 else
34556 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
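/* For example, DImode held in 32-bit general registers takes (8 + 3) / 4 = 2
   registers under the general formula above, while XFmode is special-cased
   because its 80 bits occupy 12 bytes (3 words) on 32-bit targets and
   16 bytes (2 words) on 64-bit targets.  */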
34558 else
34560 if (COMPLEX_MODE_P (mode))
34561 return 2;
34562 else
34563 return 1;
34567 /* Return true if the registers in CLASS cannot represent the change from
34568 modes FROM to TO. */
34570 bool
34571 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
34572 enum reg_class regclass)
34574 if (from == to)
34575 return false;
34577 /* x87 registers can't do subreg at all, as all values are reformatted
34578 to extended precision. */
34579 if (MAYBE_FLOAT_CLASS_P (regclass))
34580 return true;
34582 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
34584 /* Vector registers do not support QI or HImode loads. If we don't
34585 disallow a change to these modes, reload will assume it's ok to
34586 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
34587 the vec_dupv4hi pattern. */
34588 if (GET_MODE_SIZE (from) < 4)
34589 return true;
34591 /* Vector registers do not support subreg with nonzero offsets, which
34592 are otherwise valid for integer registers. Since we can't see
34593 whether we have a nonzero offset from here, prohibit all
34594 nonparadoxical subregs changing size. */
34595 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
34596 return true;
34599 return false;
34602 /* Return the cost of moving data of mode M between a
34603 register and memory. A value of 2 is the default; this cost is
34604 relative to those in `REGISTER_MOVE_COST'.
34606 This function is used extensively by register_move_cost, which is used to
34607 build tables at startup, so make it inline here.
34608 When IN is 2, return maximum of in and out move cost.
34610 If moving between registers and memory is more expensive than
34611 between two registers, you should define this macro to express the
34612 relative cost.
34614 Also model the increased cost of moving QImode registers in non
34615 Q_REGS classes.
34617 static inline int
34618 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
34619 int in)
34621 int cost;
34622 if (FLOAT_CLASS_P (regclass))
34624 int index;
34625 switch (mode)
34627 case SFmode:
34628 index = 0;
34629 break;
34630 case DFmode:
34631 index = 1;
34632 break;
34633 case XFmode:
34634 index = 2;
34635 break;
34636 default:
34637 return 100;
34639 if (in == 2)
34640 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
34641 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
34643 if (SSE_CLASS_P (regclass))
34645 int index;
34646 switch (GET_MODE_SIZE (mode))
34648 case 4:
34649 index = 0;
34650 break;
34651 case 8:
34652 index = 1;
34653 break;
34654 case 16:
34655 index = 2;
34656 break;
34657 default:
34658 return 100;
34660 if (in == 2)
34661 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
34662 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
34664 if (MMX_CLASS_P (regclass))
34666 int index;
34667 switch (GET_MODE_SIZE (mode))
34669 case 4:
34670 index = 0;
34671 break;
34672 case 8:
34673 index = 1;
34674 break;
34675 default:
34676 return 100;
34678 if (in == 2)
34679 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
34680 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
34682 switch (GET_MODE_SIZE (mode))
34684 case 1:
34685 if (Q_CLASS_P (regclass) || TARGET_64BIT)
34687 if (!in)
34688 return ix86_cost->int_store[0];
34689 if (TARGET_PARTIAL_REG_DEPENDENCY
34690 && optimize_function_for_speed_p (cfun))
34691 cost = ix86_cost->movzbl_load;
34692 else
34693 cost = ix86_cost->int_load[0];
34694 if (in == 2)
34695 return MAX (cost, ix86_cost->int_store[0]);
34696 return cost;
34698 else
34700 if (in == 2)
34701 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
34702 if (in)
34703 return ix86_cost->movzbl_load;
34704 else
34705 return ix86_cost->int_store[0] + 4;
34707 break;
34708 case 2:
34709 if (in == 2)
34710 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
34711 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
34712 default:
34713 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
34714 if (mode == TFmode)
34715 mode = XFmode;
34716 if (in == 2)
34717 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
34718 else if (in)
34719 cost = ix86_cost->int_load[2];
34720 else
34721 cost = ix86_cost->int_store[2];
34722 return (cost * (((int) GET_MODE_SIZE (mode)
34723 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
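/* Illustration of the formula above: an 8-byte DImode value moved through
   32-bit general registers is costed as two word-sized loads or stores.  */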
34727 static int
34728 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
34729 bool in)
34731 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
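/* The target hook only passes a bool IN, mapped to 1 or 0 here; the IN == 2
   "max of load and store" variant is reached only through the internal
   callers in ix86_register_move_cost.  */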
34735 /* Return the cost of moving data from a register in class CLASS1 to
34736 one in class CLASS2.
34738 It is not required that the cost always equal 2 when FROM is the same as TO;
34739 on some machines it is expensive to move between registers if they are not
34740 general registers. */
34742 static int
34743 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
34744 reg_class_t class2_i)
34746 enum reg_class class1 = (enum reg_class) class1_i;
34747 enum reg_class class2 = (enum reg_class) class2_i;
34749 /* In case we require secondary memory, compute cost of the store followed
34750 by load. In order to avoid bad register allocation choices, we need
34751 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
34753 if (inline_secondary_memory_needed (class1, class2, mode, 0))
34755 int cost = 1;
34757 cost += inline_memory_move_cost (mode, class1, 2);
34758 cost += inline_memory_move_cost (mode, class2, 2);
34760 /* In the case of copying from a general purpose register, we may emit
34761 multiple stores followed by a single load, causing a memory size
34762 mismatch stall. Count this as an arbitrarily high cost of 20. */
34763 if (targetm.class_max_nregs (class1, mode)
34764 > targetm.class_max_nregs (class2, mode))
34765 cost += 20;
34767 /* In the case of FP/MMX moves, the registers actually overlap, and we
34768 have to switch modes in order to treat them differently. */
34769 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
34770 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
34771 cost += 20;
34773 return cost;
34776 /* Moves between SSE/MMX and integer unit are expensive. */
34777 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
34778 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
34780 /* ??? By keeping the returned value relatively high, we limit the number
34781 of moves between integer and MMX/SSE registers for all targets.
34782 Additionally, a high value prevents a problem with x86_modes_tieable_p(),
34783 where integer modes in MMX/SSE registers are not tieable
34784 because of missing QImode and HImode moves to, from or between
34785 MMX/SSE registers. */
34786 return MAX (8, ix86_cost->mmxsse_to_integer);
34788 if (MAYBE_FLOAT_CLASS_P (class1))
34789 return ix86_cost->fp_move;
34790 if (MAYBE_SSE_CLASS_P (class1))
34791 return ix86_cost->sse_move;
34792 if (MAYBE_MMX_CLASS_P (class1))
34793 return ix86_cost->mmx_move;
34794 return 2;
34797 /* Return TRUE if hard register REGNO can hold a value of machine-mode
34798 MODE. */
34800 bool
34801 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
34803 /* Only the flags register can hold CCmode values, and it can hold only CCmode values. */
34804 if (CC_REGNO_P (regno))
34805 return GET_MODE_CLASS (mode) == MODE_CC;
34806 if (GET_MODE_CLASS (mode) == MODE_CC
34807 || GET_MODE_CLASS (mode) == MODE_RANDOM
34808 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
34809 return false;
34810 if (STACK_REGNO_P (regno))
34811 return VALID_FP_MODE_P (mode);
34812 if (MASK_REGNO_P (regno))
34813 return VALID_MASK_REG_MODE (mode);
34814 if (BND_REGNO_P (regno))
34815 return VALID_BND_REG_MODE (mode);
34816 if (SSE_REGNO_P (regno))
34818 /* We implement the move patterns for all vector modes into and
34819 out of SSE registers, even when no operation instructions
34820 are available. */
34822 /* For AVX-512 we allow, regardless of regno:
34823 - XI mode
34824 - any 512-bit wide vector mode
34825 - any scalar mode. */
34826 if (TARGET_AVX512F
34827 && (mode == XImode
34828 || VALID_AVX512F_REG_MODE (mode)
34829 || VALID_AVX512F_SCALAR_MODE (mode)))
34830 return true;
34832 /* xmm16-xmm31 are only available for AVX-512. */
34833 if (EXT_REX_SSE_REGNO_P (regno))
34834 return false;
34836 /* OImode move is available only when AVX is enabled. */
34837 return ((TARGET_AVX && mode == OImode)
34838 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
34839 || VALID_SSE_REG_MODE (mode)
34840 || VALID_SSE2_REG_MODE (mode)
34841 || VALID_MMX_REG_MODE (mode)
34842 || VALID_MMX_REG_MODE_3DNOW (mode));
34844 if (MMX_REGNO_P (regno))
34846 /* We implement the move patterns for 3DNOW modes even in MMX mode,
34847 so if the register is available at all, then we can move data of
34848 the given mode into or out of it. */
34849 return (VALID_MMX_REG_MODE (mode)
34850 || VALID_MMX_REG_MODE_3DNOW (mode));
34853 if (mode == QImode)
34855 /* Take care with QImode values - they can be in non-QI regs,
34856 but then they do cause partial register stalls. */
34857 if (ANY_QI_REGNO_P (regno))
34858 return true;
34859 if (!TARGET_PARTIAL_REG_STALL)
34860 return true;
34861 /* LRA checks if the hard register is OK for the given mode.
34862 QImode values can live in non-QI regs, so we allow all
34863 registers here. */
34864 if (lra_in_progress)
34865 return true;
34866 return !can_create_pseudo_p ();
34868 /* We handle both integers and floats in the general purpose registers. */
34869 else if (VALID_INT_MODE_P (mode))
34870 return true;
34871 else if (VALID_FP_MODE_P (mode))
34872 return true;
34873 else if (VALID_DFP_MODE_P (mode))
34874 return true;
34875 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
34876 on to use that value in smaller contexts, this can easily force a
34877 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
34878 supporting DImode, allow it. */
34879 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
34880 return true;
34882 return false;
34885 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
34886 tieable integer mode. */
34888 static bool
34889 ix86_tieable_integer_mode_p (enum machine_mode mode)
34891 switch (mode)
34893 case HImode:
34894 case SImode:
34895 return true;
34897 case QImode:
34898 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
34900 case DImode:
34901 return TARGET_64BIT;
34903 default:
34904 return false;
34908 /* Return true if MODE1 is accessible in a register that can hold MODE2
34909 without copying. That is, all register classes that can hold MODE2
34910 can also hold MODE1. */
34912 bool
34913 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
34915 if (mode1 == mode2)
34916 return true;
34918 if (ix86_tieable_integer_mode_p (mode1)
34919 && ix86_tieable_integer_mode_p (mode2))
34920 return true;
34922 /* MODE2 being XFmode implies fp stack or general regs, which means we
34923 can tie any smaller floating point modes to it. Note that we do not
34924 tie this with TFmode. */
34925 if (mode2 == XFmode)
34926 return mode1 == SFmode || mode1 == DFmode;
34928 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
34929 that we can tie it with SFmode. */
34930 if (mode2 == DFmode)
34931 return mode1 == SFmode;
34933 /* If MODE2 is only appropriate for an SSE register, then tie with
34934 any other mode acceptable to SSE registers. */
34935 if (GET_MODE_SIZE (mode2) == 32
34936 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
34937 return (GET_MODE_SIZE (mode1) == 32
34938 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
34939 if (GET_MODE_SIZE (mode2) == 16
34940 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
34941 return (GET_MODE_SIZE (mode1) == 16
34942 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
34944 /* If MODE2 is appropriate for an MMX register, then tie
34945 with any other mode acceptable to MMX registers. */
34946 if (GET_MODE_SIZE (mode2) == 8
34947 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
34948 return (GET_MODE_SIZE (mode1) == 8
34949 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
34951 return false;
34954 /* Return the cost of moving between two registers of mode MODE. */
34956 static int
34957 ix86_set_reg_reg_cost (enum machine_mode mode)
34959 unsigned int units = UNITS_PER_WORD;
34961 switch (GET_MODE_CLASS (mode))
34963 default:
34964 break;
34966 case MODE_CC:
34967 units = GET_MODE_SIZE (CCmode);
34968 break;
34970 case MODE_FLOAT:
34971 if ((TARGET_SSE && mode == TFmode)
34972 || (TARGET_80387 && mode == XFmode)
34973 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
34974 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
34975 units = GET_MODE_SIZE (mode);
34976 break;
34978 case MODE_COMPLEX_FLOAT:
34979 if ((TARGET_SSE && mode == TCmode)
34980 || (TARGET_80387 && mode == XCmode)
34981 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
34982 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
34983 units = GET_MODE_SIZE (mode);
34984 break;
34986 case MODE_VECTOR_INT:
34987 case MODE_VECTOR_FLOAT:
34988 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
34989 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
34990 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
34991 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
34992 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
34993 units = GET_MODE_SIZE (mode);
34996 /* Return the cost of moving between two registers of mode MODE,
34997 assuming that the move will be in pieces of at most UNITS bytes. */
34998 return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
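/* For example, with AVX enabled a 32-byte V8SFmode copy is COSTS_N_INSNS (1)
   because UNITS covers the whole mode, whereas a mode not natively supported
   keeps UNITS at the word size and is costed as several word-sized pieces.  */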
35001 /* Compute a (partial) cost for rtx X. Return true if the complete
35002 cost has been computed, and false if subexpressions should be
35003 scanned. In either case, *TOTAL contains the cost result. */
35005 static bool
35006 ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
35007 bool speed)
35009 enum rtx_code code = (enum rtx_code) code_i;
35010 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
35011 enum machine_mode mode = GET_MODE (x);
35012 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
35014 switch (code)
35016 case SET:
35017 if (register_operand (SET_DEST (x), VOIDmode)
35018 && reg_or_0_operand (SET_SRC (x), VOIDmode))
35020 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
35021 return true;
35023 return false;
35025 case CONST_INT:
35026 case CONST:
35027 case LABEL_REF:
35028 case SYMBOL_REF:
35029 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
35030 *total = 3;
35031 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
35032 *total = 2;
35033 else if (flag_pic && SYMBOLIC_CONST (x)
35034 && (!TARGET_64BIT
35035 || (GET_CODE (x) != LABEL_REF
35036 && (GET_CODE (x) != SYMBOL_REF
35037 || !SYMBOL_REF_LOCAL_P (x)))))
35038 *total = 1;
35039 else
35040 *total = 0;
35041 return true;
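/* Roughly: 3 for 64-bit constants that are not valid as a sign-extended
   32-bit immediate, 2 for those valid only sign-extended (not zero-extended),
   1 for symbolic constants needing a PIC/GOT-based reference, 0 otherwise.  */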
35043 case CONST_DOUBLE:
35044 if (mode == VOIDmode)
35046 *total = 0;
35047 return true;
35049 switch (standard_80387_constant_p (x))
35051 case 1: /* 0.0 */
35052 *total = 1;
35053 return true;
35054 default: /* Other constants */
35055 *total = 2;
35056 return true;
35057 case 0:
35058 case -1:
35059 break;
35061 if (SSE_FLOAT_MODE_P (mode))
35063 case CONST_VECTOR:
35064 switch (standard_sse_constant_p (x))
35066 case 0:
35067 break;
35068 case 1: /* 0: xor eliminates false dependency */
35069 *total = 0;
35070 return true;
35071 default: /* -1: cmp contains false dependency */
35072 *total = 1;
35073 return true;
35076 /* Fall back to (MEM (SYMBOL_REF)), since that's where
35077 it'll probably end up. Add a penalty for size. */
35078 *total = (COSTS_N_INSNS (1)
35079 + (flag_pic != 0 && !TARGET_64BIT)
35080 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
35081 return true;
35083 case ZERO_EXTEND:
35084 /* The zero extension is often completely free on x86_64, so make
35085 it as cheap as possible. */
35086 if (TARGET_64BIT && mode == DImode
35087 && GET_MODE (XEXP (x, 0)) == SImode)
35088 *total = 1;
35089 else if (TARGET_ZERO_EXTEND_WITH_AND)
35090 *total = cost->add;
35091 else
35092 *total = cost->movzx;
35093 return false;
35095 case SIGN_EXTEND:
35096 *total = cost->movsx;
35097 return false;
35099 case ASHIFT:
35100 if (SCALAR_INT_MODE_P (mode)
35101 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
35102 && CONST_INT_P (XEXP (x, 1)))
35104 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
35105 if (value == 1)
35107 *total = cost->add;
35108 return false;
35110 if ((value == 2 || value == 3)
35111 && cost->lea <= cost->shift_const)
35113 *total = cost->lea;
35114 return false;
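/* A shift left by 1 is costed like an add (add reg,reg), and shifts by 2 or 3
   can instead be done with an lea using scale factors 4 or 8, so prefer the
   lea cost when it is no more expensive than a constant shift.  */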
35117 /* FALLTHRU */
35119 case ROTATE:
35120 case ASHIFTRT:
35121 case LSHIFTRT:
35122 case ROTATERT:
35123 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
35125 /* ??? Should be SSE vector operation cost. */
35126 /* At least for published AMD latencies, this really is the same
35127 as the latency for a simple fpu operation like fabs. */
35128 /* V*QImode is emulated with 1-11 insns. */
35129 if (mode == V16QImode || mode == V32QImode)
35131 int count = 11;
35132 if (TARGET_XOP && mode == V16QImode)
35134 /* For XOP we use vpshab, which requires a broadcast of the
35135 value to the variable shift insn. For constants this
35136 means a V16QI const in mem; even when we can perform the
35137 shift with one insn, set the cost to prefer paddb. */
35138 if (CONSTANT_P (XEXP (x, 1)))
35140 *total = (cost->fabs
35141 + rtx_cost (XEXP (x, 0), code, 0, speed)
35142 + (speed ? 2 : COSTS_N_BYTES (16)));
35143 return true;
35145 count = 3;
35147 else if (TARGET_SSSE3)
35148 count = 7;
35149 *total = cost->fabs * count;
35151 else
35152 *total = cost->fabs;
35154 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
35156 if (CONST_INT_P (XEXP (x, 1)))
35158 if (INTVAL (XEXP (x, 1)) > 32)
35159 *total = cost->shift_const + COSTS_N_INSNS (2);
35160 else
35161 *total = cost->shift_const * 2;
35163 else
35165 if (GET_CODE (XEXP (x, 1)) == AND)
35166 *total = cost->shift_var * 2;
35167 else
35168 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
35171 else
35173 if (CONST_INT_P (XEXP (x, 1)))
35174 *total = cost->shift_const;
35175 else if (GET_CODE (XEXP (x, 1)) == SUBREG
35176 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
35178 /* Return the cost after shift-and truncation. */
35179 *total = cost->shift_var;
35180 return true;
35182 else
35183 *total = cost->shift_var;
35185 return false;
35187 case FMA:
35189 rtx sub;
35191 gcc_assert (FLOAT_MODE_P (mode));
35192 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
35194 /* ??? SSE scalar/vector cost should be used here. */
35195 /* ??? Bald assumption that fma has the same cost as fmul. */
35196 *total = cost->fmul;
35197 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
35199 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
35200 sub = XEXP (x, 0);
35201 if (GET_CODE (sub) == NEG)
35202 sub = XEXP (sub, 0);
35203 *total += rtx_cost (sub, FMA, 0, speed);
35205 sub = XEXP (x, 2);
35206 if (GET_CODE (sub) == NEG)
35207 sub = XEXP (sub, 0);
35208 *total += rtx_cost (sub, FMA, 2, speed);
35209 return true;
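/* The FMA is costed as one multiply plus the costs of its three operands;
   a NEG around operand 0 or 2 is stripped first since the fused negated
   forms (FMS, FNMA, FNMS) are assumed to cost the same.  */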
35212 case MULT:
35213 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35215 /* ??? SSE scalar cost should be used here. */
35216 *total = cost->fmul;
35217 return false;
35219 else if (X87_FLOAT_MODE_P (mode))
35221 *total = cost->fmul;
35222 return false;
35224 else if (FLOAT_MODE_P (mode))
35226 /* ??? SSE vector cost should be used here. */
35227 *total = cost->fmul;
35228 return false;
35230 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
35232 /* V*QImode is emulated with 7-13 insns. */
35233 if (mode == V16QImode || mode == V32QImode)
35235 int extra = 11;
35236 if (TARGET_XOP && mode == V16QImode)
35237 extra = 5;
35238 else if (TARGET_SSSE3)
35239 extra = 6;
35240 *total = cost->fmul * 2 + cost->fabs * extra;
35242 /* V*DImode is emulated with 5-8 insns. */
35243 else if (mode == V2DImode || mode == V4DImode)
35245 if (TARGET_XOP && mode == V2DImode)
35246 *total = cost->fmul * 2 + cost->fabs * 3;
35247 else
35248 *total = cost->fmul * 3 + cost->fabs * 5;
35250 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
35251 insns, including two PMULUDQ. */
35252 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
35253 *total = cost->fmul * 2 + cost->fabs * 5;
35254 else
35255 *total = cost->fmul;
35256 return false;
35258 else
35260 rtx op0 = XEXP (x, 0);
35261 rtx op1 = XEXP (x, 1);
35262 int nbits;
35263 if (CONST_INT_P (XEXP (x, 1)))
35265 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
35266 for (nbits = 0; value != 0; value &= value - 1)
35267 nbits++;
35269 else
35270 /* This is arbitrary. */
35271 nbits = 7;
35273 /* Compute costs correctly for widening multiplication. */
35274 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
35275 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
35276 == GET_MODE_SIZE (mode))
35278 int is_mulwiden = 0;
35279 enum machine_mode inner_mode = GET_MODE (op0);
35281 if (GET_CODE (op0) == GET_CODE (op1))
35282 is_mulwiden = 1, op1 = XEXP (op1, 0);
35283 else if (CONST_INT_P (op1))
35285 if (GET_CODE (op0) == SIGN_EXTEND)
35286 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
35287 == INTVAL (op1);
35288 else
35289 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
35292 if (is_mulwiden)
35293 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
35296 *total = (cost->mult_init[MODE_INDEX (mode)]
35297 + nbits * cost->mult_bit
35298 + rtx_cost (op0, outer_code, opno, speed)
35299 + rtx_cost (op1, outer_code, opno, speed));
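/* Worked example: a SImode multiply by the constant 10 (binary 1010, two set
   bits) is costed as mult_init[2] + 2 * mult_bit plus the operand costs;
   a multiply by a non-constant uses the arbitrary nbits of 7.  */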
35301 return true;
35304 case DIV:
35305 case UDIV:
35306 case MOD:
35307 case UMOD:
35308 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35309 /* ??? SSE cost should be used here. */
35310 *total = cost->fdiv;
35311 else if (X87_FLOAT_MODE_P (mode))
35312 *total = cost->fdiv;
35313 else if (FLOAT_MODE_P (mode))
35314 /* ??? SSE vector cost should be used here. */
35315 *total = cost->fdiv;
35316 else
35317 *total = cost->divide[MODE_INDEX (mode)];
35318 return false;
35320 case PLUS:
35321 if (GET_MODE_CLASS (mode) == MODE_INT
35322 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
35324 if (GET_CODE (XEXP (x, 0)) == PLUS
35325 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
35326 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
35327 && CONSTANT_P (XEXP (x, 1)))
35329 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
35330 if (val == 2 || val == 4 || val == 8)
35332 *total = cost->lea;
35333 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
35334 outer_code, opno, speed);
35335 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
35336 outer_code, opno, speed);
35337 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
35338 return true;
35341 else if (GET_CODE (XEXP (x, 0)) == MULT
35342 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
35344 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
35345 if (val == 2 || val == 4 || val == 8)
35347 *total = cost->lea;
35348 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
35349 outer_code, opno, speed);
35350 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
35351 return true;
35354 else if (GET_CODE (XEXP (x, 0)) == PLUS)
35356 *total = cost->lea;
35357 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
35358 outer_code, opno, speed);
35359 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
35360 outer_code, opno, speed);
35361 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
35362 return true;
35365 /* FALLTHRU */
35367 case MINUS:
35368 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35370 /* ??? SSE cost should be used here. */
35371 *total = cost->fadd;
35372 return false;
35374 else if (X87_FLOAT_MODE_P (mode))
35376 *total = cost->fadd;
35377 return false;
35379 else if (FLOAT_MODE_P (mode))
35381 /* ??? SSE vector cost should be used here. */
35382 *total = cost->fadd;
35383 return false;
35385 /* FALLTHRU */
35387 case AND:
35388 case IOR:
35389 case XOR:
35390 if (GET_MODE_CLASS (mode) == MODE_INT
35391 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
35393 *total = (cost->add * 2
35394 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
35395 << (GET_MODE (XEXP (x, 0)) != DImode))
35396 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
35397 << (GET_MODE (XEXP (x, 1)) != DImode)));
35398 return true;
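/* A wider-than-word integer operation here (including PLUS/MINUS that fell
   through) must be split into two word-sized instructions, hence the two
   adds; each operand's cost is doubled unless that operand is DImode.  */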
35400 /* FALLTHRU */
35402 case NEG:
35403 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35405 /* ??? SSE cost should be used here. */
35406 *total = cost->fchs;
35407 return false;
35409 else if (X87_FLOAT_MODE_P (mode))
35411 *total = cost->fchs;
35412 return false;
35414 else if (FLOAT_MODE_P (mode))
35416 /* ??? SSE vector cost should be used here. */
35417 *total = cost->fchs;
35418 return false;
35420 /* FALLTHRU */
35422 case NOT:
35423 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
35425 /* ??? Should be SSE vector operation cost. */
35426 /* At least for published AMD latencies, this really is the same
35427 as the latency for a simple fpu operation like fabs. */
35428 *total = cost->fabs;
35430 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
35431 *total = cost->add * 2;
35432 else
35433 *total = cost->add;
35434 return false;
35436 case COMPARE:
35437 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
35438 && XEXP (XEXP (x, 0), 1) == const1_rtx
35439 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
35440 && XEXP (x, 1) == const0_rtx)
35442 /* This kind of construct is implemented using test[bwl].
35443 Treat it as if we had an AND. */
35444 *total = (cost->add
35445 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
35446 + rtx_cost (const1_rtx, outer_code, opno, speed));
35447 return true;
35449 return false;
35451 case FLOAT_EXTEND:
35452 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
35453 *total = 0;
35454 return false;
35456 case ABS:
35457 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35458 /* ??? SSE cost should be used here. */
35459 *total = cost->fabs;
35460 else if (X87_FLOAT_MODE_P (mode))
35461 *total = cost->fabs;
35462 else if (FLOAT_MODE_P (mode))
35463 /* ??? SSE vector cost should be used here. */
35464 *total = cost->fabs;
35465 return false;
35467 case SQRT:
35468 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35469 /* ??? SSE cost should be used here. */
35470 *total = cost->fsqrt;
35471 else if (X87_FLOAT_MODE_P (mode))
35472 *total = cost->fsqrt;
35473 else if (FLOAT_MODE_P (mode))
35474 /* ??? SSE vector cost should be used here. */
35475 *total = cost->fsqrt;
35476 return false;
35478 case UNSPEC:
35479 if (XINT (x, 1) == UNSPEC_TP)
35480 *total = 0;
35481 return false;
35483 case VEC_SELECT:
35484 case VEC_CONCAT:
35485 case VEC_MERGE:
35486 case VEC_DUPLICATE:
35487 /* ??? Assume all of these vector manipulation patterns are
35488 recognizable, in which case they all pretty much have the
35489 same cost. */
35490 *total = cost->fabs;
35491 return true;
35493 default:
35494 return false;
35498 #if TARGET_MACHO
35500 static int current_machopic_label_num;
35502 /* Given a symbol name and its associated stub, write out the
35503 definition of the stub. */
35505 void
35506 machopic_output_stub (FILE *file, const char *symb, const char *stub)
35508 unsigned int length;
35509 char *binder_name, *symbol_name, lazy_ptr_name[32];
35510 int label = ++current_machopic_label_num;
35512 /* For 64-bit we shouldn't get here. */
35513 gcc_assert (!TARGET_64BIT);
35515 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
35516 symb = targetm.strip_name_encoding (symb);
35518 length = strlen (stub);
35519 binder_name = XALLOCAVEC (char, length + 32);
35520 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
35522 length = strlen (symb);
35523 symbol_name = XALLOCAVEC (char, length + 32);
35524 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
35526 sprintf (lazy_ptr_name, "L%d$lz", label);
35528 if (MACHOPIC_ATT_STUB)
35529 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
35530 else if (MACHOPIC_PURE)
35531 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
35532 else
35533 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
35535 fprintf (file, "%s:\n", stub);
35536 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
35538 if (MACHOPIC_ATT_STUB)
35540 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
35542 else if (MACHOPIC_PURE)
35544 /* PIC stub. */
35545 /* 25-byte PIC stub using "CALL get_pc_thunk". */
35546 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
35547 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
35548 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
35549 label, lazy_ptr_name, label);
35550 fprintf (file, "\tjmp\t*%%ecx\n");
35552 else
35553 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
35555 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
35556 it needs no stub-binding-helper. */
35557 if (MACHOPIC_ATT_STUB)
35558 return;
35560 fprintf (file, "%s:\n", binder_name);
35562 if (MACHOPIC_PURE)
35564 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
35565 fprintf (file, "\tpushl\t%%ecx\n");
35567 else
35568 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
35570 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
35572 /* N.B. Keep the correspondence of these
35573 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
35574 old-pic/new-pic/non-pic stubs; altering this will break
35575 compatibility with existing dylibs. */
35576 if (MACHOPIC_PURE)
35578 /* 25-byte PIC stub using "CALL get_pc_thunk". */
35579 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
35581 else
35582 /* 16-byte -mdynamic-no-pic stub. */
35583 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
35585 fprintf (file, "%s:\n", lazy_ptr_name);
35586 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
35587 fprintf (file, ASM_LONG "%s\n", binder_name);
35589 #endif /* TARGET_MACHO */
35591 /* Order the registers for the register allocator. */
35593 void
35594 x86_order_regs_for_local_alloc (void)
35596 int pos = 0;
35597 int i;
35599 /* First allocate the local general purpose registers. */
35600 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
35601 if (GENERAL_REGNO_P (i) && call_used_regs[i])
35602 reg_alloc_order [pos++] = i;
35604 /* Global general purpose registers. */
35605 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
35606 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
35607 reg_alloc_order [pos++] = i;
35609 /* x87 registers come first in case we are doing FP math
35610 using them. */
35611 if (!TARGET_SSE_MATH)
35612 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
35613 reg_alloc_order [pos++] = i;
35615 /* SSE registers. */
35616 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
35617 reg_alloc_order [pos++] = i;
35618 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
35619 reg_alloc_order [pos++] = i;
35621 /* Extended REX SSE registers. */
35622 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
35623 reg_alloc_order [pos++] = i;
35625 /* Mask registers. */
35626 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
35627 reg_alloc_order [pos++] = i;
35629 /* MPX bound registers. */
35630 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
35631 reg_alloc_order [pos++] = i;
35633 /* x87 registers. */
35634 if (TARGET_SSE_MATH)
35635 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
35636 reg_alloc_order [pos++] = i;
35638 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
35639 reg_alloc_order [pos++] = i;
35641 /* Initialize the rest of the array, as we do not allocate some registers
35642 at all. */
35643 while (pos < FIRST_PSEUDO_REGISTER)
35644 reg_alloc_order [pos++] = 0;
35647 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
35648 in struct attribute_spec.handler. */
35649 static tree
35650 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
35651 tree args,
35652 int flags ATTRIBUTE_UNUSED,
35653 bool *no_add_attrs)
35655 if (TREE_CODE (*node) != FUNCTION_TYPE
35656 && TREE_CODE (*node) != METHOD_TYPE
35657 && TREE_CODE (*node) != FIELD_DECL
35658 && TREE_CODE (*node) != TYPE_DECL)
35660 warning (OPT_Wattributes, "%qE attribute only applies to functions",
35661 name);
35662 *no_add_attrs = true;
35663 return NULL_TREE;
35665 if (TARGET_64BIT)
35667 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
35668 name);
35669 *no_add_attrs = true;
35670 return NULL_TREE;
35672 if (is_attribute_p ("callee_pop_aggregate_return", name))
35674 tree cst;
35676 cst = TREE_VALUE (args);
35677 if (TREE_CODE (cst) != INTEGER_CST)
35679 warning (OPT_Wattributes,
35680 "%qE attribute requires an integer constant argument",
35681 name);
35682 *no_add_attrs = true;
35684 else if (compare_tree_int (cst, 0) != 0
35685 && compare_tree_int (cst, 1) != 0)
35687 warning (OPT_Wattributes,
35688 "argument to %qE attribute is neither zero, nor one",
35689 name);
35690 *no_add_attrs = true;
35693 return NULL_TREE;
35696 return NULL_TREE;
35699 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
35700 struct attribute_spec.handler. */
35701 static tree
35702 ix86_handle_abi_attribute (tree *node, tree name,
35703 tree args ATTRIBUTE_UNUSED,
35704 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
35706 if (TREE_CODE (*node) != FUNCTION_TYPE
35707 && TREE_CODE (*node) != METHOD_TYPE
35708 && TREE_CODE (*node) != FIELD_DECL
35709 && TREE_CODE (*node) != TYPE_DECL)
35711 warning (OPT_Wattributes, "%qE attribute only applies to functions",
35712 name);
35713 *no_add_attrs = true;
35714 return NULL_TREE;
35717 /* Can combine regparm with all attributes but fastcall. */
35718 if (is_attribute_p ("ms_abi", name))
35720 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
35722 error ("ms_abi and sysv_abi attributes are not compatible");
35725 return NULL_TREE;
35727 else if (is_attribute_p ("sysv_abi", name))
35729 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
35731 error ("ms_abi and sysv_abi attributes are not compatible");
35734 return NULL_TREE;
35737 return NULL_TREE;
35740 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
35741 struct attribute_spec.handler. */
35742 static tree
35743 ix86_handle_struct_attribute (tree *node, tree name,
35744 tree args ATTRIBUTE_UNUSED,
35745 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
35747 tree *type = NULL;
35748 if (DECL_P (*node))
35750 if (TREE_CODE (*node) == TYPE_DECL)
35751 type = &TREE_TYPE (*node);
35753 else
35754 type = node;
35756 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
35758 warning (OPT_Wattributes, "%qE attribute ignored",
35759 name);
35760 *no_add_attrs = true;
35763 else if ((is_attribute_p ("ms_struct", name)
35764 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
35765 || ((is_attribute_p ("gcc_struct", name)
35766 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
35768 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
35769 name);
35770 *no_add_attrs = true;
35773 return NULL_TREE;
35776 static tree
35777 ix86_handle_fndecl_attribute (tree *node, tree name,
35778 tree args ATTRIBUTE_UNUSED,
35779 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
35781 if (TREE_CODE (*node) != FUNCTION_DECL)
35783 warning (OPT_Wattributes, "%qE attribute only applies to functions",
35784 name);
35785 *no_add_attrs = true;
35787 return NULL_TREE;
35790 static bool
35791 ix86_ms_bitfield_layout_p (const_tree record_type)
35793 return ((TARGET_MS_BITFIELD_LAYOUT
35794 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
35795 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
35798 /* Returns an expression indicating where the this parameter is
35799 located on entry to the FUNCTION. */
35801 static rtx
35802 x86_this_parameter (tree function)
35804 tree type = TREE_TYPE (function);
35805 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
35806 int nregs;
35808 if (TARGET_64BIT)
35810 const int *parm_regs;
35812 if (ix86_function_type_abi (type) == MS_ABI)
35813 parm_regs = x86_64_ms_abi_int_parameter_registers;
35814 else
35815 parm_regs = x86_64_int_parameter_registers;
35816 return gen_rtx_REG (Pmode, parm_regs[aggr]);
35819 nregs = ix86_function_regparm (type, function);
35821 if (nregs > 0 && !stdarg_p (type))
35823 int regno;
35824 unsigned int ccvt = ix86_get_callcvt (type);
35826 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
35827 regno = aggr ? DX_REG : CX_REG;
35828 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
35830 regno = CX_REG;
35831 if (aggr)
35832 return gen_rtx_MEM (SImode,
35833 plus_constant (Pmode, stack_pointer_rtx, 4));
35835 else
35837 regno = AX_REG;
35838 if (aggr)
35840 regno = DX_REG;
35841 if (nregs == 1)
35842 return gen_rtx_MEM (SImode,
35843 plus_constant (Pmode,
35844 stack_pointer_rtx, 4));
35847 return gen_rtx_REG (SImode, regno);
35850 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
35851 aggr ? 8 : 4));
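/* In the plain 32-bit stack convention reached above, THIS is the first
   stack argument: at 4(%esp) on function entry, or at 8(%esp) when a hidden
   aggregate-return pointer occupies the first slot.  */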
35854 /* Determine whether x86_output_mi_thunk can succeed. */
35856 static bool
35857 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
35858 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
35859 HOST_WIDE_INT vcall_offset, const_tree function)
35861 /* 64-bit can handle anything. */
35862 if (TARGET_64BIT)
35863 return true;
35865 /* For 32-bit, everything's fine if we have one free register. */
35866 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
35867 return true;
35869 /* Need a free register for vcall_offset. */
35870 if (vcall_offset)
35871 return false;
35873 /* Need a free register for GOT references. */
35874 if (flag_pic && !targetm.binds_local_p (function))
35875 return false;
35877 /* Otherwise ok. */
35878 return true;
35881 /* Output the assembler code for a thunk function. THUNK_DECL is the
35882 declaration for the thunk function itself, FUNCTION is the decl for
35883 the target function. DELTA is an immediate constant offset to be
35884 added to THIS. If VCALL_OFFSET is nonzero, the word at
35885 *(*this + vcall_offset) should be added to THIS. */
35887 static void
35888 x86_output_mi_thunk (FILE *file,
35889 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
35890 HOST_WIDE_INT vcall_offset, tree function)
35892 rtx this_param = x86_this_parameter (function);
35893 rtx this_reg, tmp, fnaddr;
35894 unsigned int tmp_regno;
35896 if (TARGET_64BIT)
35897 tmp_regno = R10_REG;
35898 else
35900 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
35901 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
35902 tmp_regno = AX_REG;
35903 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
35904 tmp_regno = DX_REG;
35905 else
35906 tmp_regno = CX_REG;
35909 emit_note (NOTE_INSN_PROLOGUE_END);
35911 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
35912 pull it in now and let DELTA benefit. */
35913 if (REG_P (this_param))
35914 this_reg = this_param;
35915 else if (vcall_offset)
35917 /* Put the this parameter into %eax. */
35918 this_reg = gen_rtx_REG (Pmode, AX_REG);
35919 emit_move_insn (this_reg, this_param);
35921 else
35922 this_reg = NULL_RTX;
35924 /* Adjust the this parameter by a fixed constant. */
35925 if (delta)
35927 rtx delta_rtx = GEN_INT (delta);
35928 rtx delta_dst = this_reg ? this_reg : this_param;
35930 if (TARGET_64BIT)
35932 if (!x86_64_general_operand (delta_rtx, Pmode))
35934 tmp = gen_rtx_REG (Pmode, tmp_regno);
35935 emit_move_insn (tmp, delta_rtx);
35936 delta_rtx = tmp;
35940 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
35943 /* Adjust the this parameter by a value stored in the vtable. */
35944 if (vcall_offset)
35946 rtx vcall_addr, vcall_mem, this_mem;
35948 tmp = gen_rtx_REG (Pmode, tmp_regno);
35950 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
35951 if (Pmode != ptr_mode)
35952 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
35953 emit_move_insn (tmp, this_mem);
35955 /* Adjust the this parameter. */
35956 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
35957 if (TARGET_64BIT
35958 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
35960 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
35961 emit_move_insn (tmp2, GEN_INT (vcall_offset));
35962 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
35965 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
35966 if (Pmode != ptr_mode)
35967 emit_insn (gen_addsi_1_zext (this_reg,
35968 gen_rtx_REG (ptr_mode,
35969 REGNO (this_reg)),
35970 vcall_mem));
35971 else
35972 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
35975 /* If necessary, drop THIS back to its stack slot. */
35976 if (this_reg && this_reg != this_param)
35977 emit_move_insn (this_param, this_reg);
35979 fnaddr = XEXP (DECL_RTL (function), 0);
35980 if (TARGET_64BIT)
35982 if (!flag_pic || targetm.binds_local_p (function)
35983 || TARGET_PECOFF)
35985 else
35987 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
35988 tmp = gen_rtx_CONST (Pmode, tmp);
35989 fnaddr = gen_rtx_MEM (Pmode, tmp);
35992 else
35994 if (!flag_pic || targetm.binds_local_p (function))
35996 #if TARGET_MACHO
35997 else if (TARGET_MACHO)
35999 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
36000 fnaddr = XEXP (fnaddr, 0);
36002 #endif /* TARGET_MACHO */
36003 else
36005 tmp = gen_rtx_REG (Pmode, CX_REG);
36006 output_set_got (tmp, NULL_RTX);
36008 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
36009 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
36010 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
36014 /* Our sibling call patterns do not allow memories, because we have no
36015 predicate that can distinguish between frame and non-frame memory.
36016 For our purposes here, we can get away with (ab)using a jump pattern,
36017 because we're going to do no optimization. */
36018 if (MEM_P (fnaddr))
36019 emit_jump_insn (gen_indirect_jump (fnaddr));
36020 else
36022 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
36023 fnaddr = legitimize_pic_address (fnaddr,
36024 gen_rtx_REG (Pmode, tmp_regno));
36026 if (!sibcall_insn_operand (fnaddr, word_mode))
36028 tmp = gen_rtx_REG (word_mode, tmp_regno);
36029 if (GET_MODE (fnaddr) != word_mode)
36030 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
36031 emit_move_insn (tmp, fnaddr);
36032 fnaddr = tmp;
36035 tmp = gen_rtx_MEM (QImode, fnaddr);
36036 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
36037 tmp = emit_call_insn (tmp);
36038 SIBLING_CALL_P (tmp) = 1;
36040 emit_barrier ();
36042 /* Emit just enough of rest_of_compilation to get the insns emitted.
36043 Note that use_thunk calls assemble_start_function et al. */
36044 tmp = get_insns ();
36045 shorten_branches (tmp);
36046 final_start_function (tmp, file, 1);
36047 final (tmp, file, 1);
36048 final_end_function ();
36051 static void
36052 x86_file_start (void)
36054 default_file_start ();
36055 #if TARGET_MACHO
36056 darwin_file_start ();
36057 #endif
36058 if (X86_FILE_START_VERSION_DIRECTIVE)
36059 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
36060 if (X86_FILE_START_FLTUSED)
36061 fputs ("\t.global\t__fltused\n", asm_out_file);
36062 if (ix86_asm_dialect == ASM_INTEL)
36063 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
36066 int
36067 x86_field_alignment (tree field, int computed)
36069 enum machine_mode mode;
36070 tree type = TREE_TYPE (field);
36072 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
36073 return computed;
36074 mode = TYPE_MODE (strip_array_types (type));
36075 if (mode == DFmode || mode == DCmode
36076 || GET_MODE_CLASS (mode) == MODE_INT
36077 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
36078 return MIN (32, computed);
36079 return computed;
36082 /* Output assembler code to FILE to increment profiler label # LABELNO
36083 for profiling a function entry. */
36084 void
36085 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
36087 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
36088 : MCOUNT_NAME);
36090 if (TARGET_64BIT)
36092 #ifndef NO_PROFILE_COUNTERS
36093 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
36094 #endif
36096 if (!TARGET_PECOFF && flag_pic)
36097 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
36098 else
36099 fprintf (file, "\tcall\t%s\n", mcount_name);
36101 else if (flag_pic)
36103 #ifndef NO_PROFILE_COUNTERS
36104 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
36105 LPREFIX, labelno);
36106 #endif
36107 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
36109 else
36111 #ifndef NO_PROFILE_COUNTERS
36112 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
36113 LPREFIX, labelno);
36114 #endif
36115 fprintf (file, "\tcall\t%s\n", mcount_name);
36119 /* We don't have exact information about the insn sizes, but we may assume
36120 quite safely that we are informed about all 1 byte insns and memory
36121 address sizes. This is enough to eliminate unnecessary padding in
36122 99% of cases. */
36124 static int
36125 min_insn_size (rtx insn)
36127 int l = 0, len;
36129 if (!INSN_P (insn) || !active_insn_p (insn))
36130 return 0;
36132 /* Discard alignments we've emitted and jump instructions. */
36133 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
36134 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
36135 return 0;
36137 /* Important case - calls are always 5 bytes.
36138 It is common to have many calls in a row. */
36139 if (CALL_P (insn)
36140 && symbolic_reference_mentioned_p (PATTERN (insn))
36141 && !SIBLING_CALL_P (insn))
36142 return 5;
36143 len = get_attr_length (insn);
36144 if (len <= 1)
36145 return 1;
36147 /* For normal instructions we rely on get_attr_length being exact,
36148 with a few exceptions. */
36149 if (!JUMP_P (insn))
36151 enum attr_type type = get_attr_type (insn);
36153 switch (type)
36155 case TYPE_MULTI:
36156 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
36157 || asm_noperands (PATTERN (insn)) >= 0)
36158 return 0;
36159 break;
36160 case TYPE_OTHER:
36161 case TYPE_FCMP:
36162 break;
36163 default:
36164 /* Otherwise trust get_attr_length. */
36165 return len;
36168 l = get_attr_length_address (insn);
36169 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
36170 l = 4;
36172 if (l)
36173 return 1+l;
36174 else
36175 return 2;
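/* The estimate is one opcode byte plus the address length; symbolic
   references are assumed to need a full 4-byte displacement, and insns with
   no known address bytes are assumed to be at least 2 bytes long.  */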
36178 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
36180 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in a 16 byte
36181 window. */
36183 static void
36184 ix86_avoid_jump_mispredicts (void)
36186 rtx insn, start = get_insns ();
36187 int nbytes = 0, njumps = 0;
36188 int isjump = 0;
36190 /* Look for all minimal intervals of instructions containing 4 jumps.
36191 The intervals are bounded by START and INSN. NBYTES is the total
36192 size of instructions in the interval including INSN and not including
36193 START. When NBYTES is smaller than 16 bytes, it is possible
36194 that the end of START and INSN ends up in the same 16byte page.
36196 The smallest offset in the page at which INSN can start is the case where
36197 START ends at offset 0. The offset of INSN is then NBYTES - sizeof (INSN).
36198 We add p2align to a 16byte window with maxskip 15 - NBYTES + sizeof (INSN). */
36200 for (insn = start; insn; insn = NEXT_INSN (insn))
36202 int min_size;
36204 if (LABEL_P (insn))
36206 int align = label_to_alignment (insn);
36207 int max_skip = label_to_max_skip (insn);
36209 if (max_skip > 15)
36210 max_skip = 15;
36211 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
36212 already in the current 16 byte page, because otherwise
36213 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
36214 bytes to reach 16 byte boundary. */
36215 if (align <= 0
36216 || (align <= 3 && max_skip != (1 << align) - 1))
36217 max_skip = 0;
36218 if (dump_file)
36219 fprintf (dump_file, "Label %i with max_skip %i\n",
36220 INSN_UID (insn), max_skip);
36221 if (max_skip)
36223 while (nbytes + max_skip >= 16)
36225 start = NEXT_INSN (start);
36226 if (JUMP_P (start) || CALL_P (start))
36227 njumps--, isjump = 1;
36228 else
36229 isjump = 0;
36230 nbytes -= min_insn_size (start);
36233 continue;
36236 min_size = min_insn_size (insn);
36237 nbytes += min_size;
36238 if (dump_file)
36239 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
36240 INSN_UID (insn), min_size);
36241 if (JUMP_P (insn) || CALL_P (insn))
36242 njumps++;
36243 else
36244 continue;
36246 while (njumps > 3)
36248 start = NEXT_INSN (start);
36249 if (JUMP_P (start) || CALL_P (start))
36250 njumps--, isjump = 1;
36251 else
36252 isjump = 0;
36253 nbytes -= min_insn_size (start);
36255 gcc_assert (njumps >= 0);
36256 if (dump_file)
36257 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
36258 INSN_UID (start), INSN_UID (insn), nbytes);
36260 if (njumps == 3 && isjump && nbytes < 16)
36262 int padsize = 15 - nbytes + min_insn_size (insn);
36264 if (dump_file)
36265 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
36266 INSN_UID (insn), padsize);
36267 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
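/* For example, if the four jumps span an estimated 12 bytes and the final
   jump itself is 2 bytes, we request up to 15 - 12 + 2 = 5 bytes of padding
   so that the group cannot fit into a single 16 byte window.  */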
36271 #endif
36273 /* AMD Athlon works faster
36274 when RET is not the destination of a conditional jump or directly preceded
36275 by another jump instruction. We avoid the penalty by inserting a NOP just
36276 before the RET instruction in such cases. */
36277 static void
36278 ix86_pad_returns (void)
36280 edge e;
36281 edge_iterator ei;
36283 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
36285 basic_block bb = e->src;
36286 rtx ret = BB_END (bb);
36287 rtx prev;
36288 bool replace = false;
36290 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
36291 || optimize_bb_for_size_p (bb))
36292 continue;
36293 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
36294 if (active_insn_p (prev) || LABEL_P (prev))
36295 break;
36296 if (prev && LABEL_P (prev))
36298 edge e;
36299 edge_iterator ei;
36301 FOR_EACH_EDGE (e, ei, bb->preds)
36302 if (EDGE_FREQUENCY (e) && e->src->index >= 0
36303 && !(e->flags & EDGE_FALLTHRU))
36305 replace = true;
36306 break;
36309 if (!replace)
36311 prev = prev_active_insn (ret);
36312 if (prev
36313 && ((JUMP_P (prev) && any_condjump_p (prev))
36314 || CALL_P (prev)))
36315 replace = true;
36316 /* Empty functions get a branch mispredict even when
36317 the jump destination is not visible to us. */
36318 if (!prev && !optimize_function_for_size_p (cfun))
36319 replace = true;
36321 if (replace)
36323 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
36324 delete_insn (ret);
36329 /* Count the minimum number of instructions in BB. Return 4 if the
36330 number of instructions >= 4. */
36332 static int
36333 ix86_count_insn_bb (basic_block bb)
36335 rtx insn;
36336 int insn_count = 0;
36338 /* Count number of instructions in this block. Return 4 if the number
36339 of instructions >= 4. */
36340 FOR_BB_INSNS (bb, insn)
36342 /* This only happens in exit blocks. */
36343 if (JUMP_P (insn)
36344 && ANY_RETURN_P (PATTERN (insn)))
36345 break;
36347 if (NONDEBUG_INSN_P (insn)
36348 && GET_CODE (PATTERN (insn)) != USE
36349 && GET_CODE (PATTERN (insn)) != CLOBBER)
36351 insn_count++;
36352 if (insn_count >= 4)
36353 return insn_count;
36357 return insn_count;
36361 /* Count the minimum number of instructions in code path in BB.
36362 Return 4 if the number of instructions >= 4. */
36364 static int
36365 ix86_count_insn (basic_block bb)
36367 edge e;
36368 edge_iterator ei;
36369 int min_prev_count;
36371 /* Only bother counting instructions along paths with no
36372 more than 2 basic blocks between entry and exit. Given
36373 that BB has an edge to exit, determine if a predecessor
36374 of BB has an edge from entry. If so, compute the number
36375 of instructions in the predecessor block. If there
36376 happen to be multiple such blocks, compute the minimum. */
36377 min_prev_count = 4;
36378 FOR_EACH_EDGE (e, ei, bb->preds)
36380 edge prev_e;
36381 edge_iterator prev_ei;
36383 if (e->src == ENTRY_BLOCK_PTR)
36385 min_prev_count = 0;
36386 break;
36388 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
36390 if (prev_e->src == ENTRY_BLOCK_PTR)
36392 int count = ix86_count_insn_bb (e->src);
36393 if (count < min_prev_count)
36394 min_prev_count = count;
36395 break;
36400 if (min_prev_count < 4)
36401 min_prev_count += ix86_count_insn_bb (bb);
36403 return min_prev_count;
36406 /* Pad short function to 4 instructions. */
36408 static void
36409 ix86_pad_short_function (void)
36411 edge e;
36412 edge_iterator ei;
36414 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
36416 rtx ret = BB_END (e->src);
36417 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
36419 int insn_count = ix86_count_insn (e->src);
36421 /* Pad short function. */
36422 if (insn_count < 4)
36424 rtx insn = ret;
36426 /* Find epilogue. */
36427 while (insn
36428 && (!NOTE_P (insn)
36429 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
36430 insn = PREV_INSN (insn);
36432 if (!insn)
36433 insn = ret;
36435 /* Two NOPs count as one instruction. */
36436 insn_count = 2 * (4 - insn_count);
36437 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
36443 /* Fix up a Windows system unwinder issue. If an EH region falls through into
36444 the epilogue, the Windows system unwinder will apply epilogue logic and
36445 produce incorrect offsets. This can be avoided by adding a nop between
36446 the last insn that can throw and the first insn of the epilogue. */
36448 static void
36449 ix86_seh_fixup_eh_fallthru (void)
36451 edge e;
36452 edge_iterator ei;
36454 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
36456 rtx insn, next;
36458 /* Find the beginning of the epilogue. */
36459 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
36460 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
36461 break;
36462 if (insn == NULL)
36463 continue;
36465 /* We only care about preceding insns that can throw. */
36466 insn = prev_active_insn (insn);
36467 if (insn == NULL || !can_throw_internal (insn))
36468 continue;
36470 /* Do not separate calls from their debug information. */
36471 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
36472 if (NOTE_P (next)
36473 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
36474 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
36475 insn = next;
36476 else
36477 break;
36479 emit_insn_after (gen_nops (const1_rtx), insn);
36483 /* Implement machine specific optimizations. We implement padding of returns
36484 for K8 CPUs and a pass to avoid 4 jumps in a single 16-byte window. */
36485 static void
36486 ix86_reorg (void)
36488 /* We are freeing block_for_insn in the toplev to keep compatibility
36489 with old MDEP_REORGS that are not CFG based. Recompute it now. */
36490 compute_bb_for_insn ();
36492 if (TARGET_SEH && current_function_has_exception_handlers ())
36493 ix86_seh_fixup_eh_fallthru ();
36495 if (optimize && optimize_function_for_speed_p (cfun))
36497 if (TARGET_PAD_SHORT_FUNCTION)
36498 ix86_pad_short_function ();
36499 else if (TARGET_PAD_RETURNS)
36500 ix86_pad_returns ();
36501 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
36502 if (TARGET_FOUR_JUMP_LIMIT)
36503 ix86_avoid_jump_mispredicts ();
36504 #endif
36508 /* Return nonzero when a QImode register that must be represented via a REX
36509 prefix is used. */
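/* On x86-64, the low bytes of %rsp, %rbp, %rsi, %rdi and of %r8-%r15
   (%spl, %bpl, %sil, %dil, %r8b-%r15b) can only be addressed with a REX
   prefix, unlike %al/%bl/%cl/%dl.  */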
36510 bool
36511 x86_extended_QIreg_mentioned_p (rtx insn)
36513 int i;
36514 extract_insn_cached (insn);
36515 for (i = 0; i < recog_data.n_operands; i++)
36516 if (GENERAL_REG_P (recog_data.operand[i])
36517 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
36518 return true;
36519 return false;
36522 /* Return nonzero when P points to register encoded via REX prefix.
36523 Called via for_each_rtx. */
36524 static int
36525 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
36527 unsigned int regno;
36528 if (!REG_P (*p))
36529 return 0;
36530 regno = REGNO (*p);
36531 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
36534 /* Return true when INSN mentions register that must be encoded using REX
36535 prefix. */
36536 bool
36537 x86_extended_reg_mentioned_p (rtx insn)
36539 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
36540 extended_reg_mentioned_1, NULL);
36543 /* If profitable, negate (without causing overflow) integer constant
36544 of mode MODE at location LOC. Return true in this case. */
36545 bool
36546 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
36548 HOST_WIDE_INT val;
36550 if (!CONST_INT_P (*loc))
36551 return false;
36553 switch (mode)
36555 case DImode:
36556 /* DImode x86_64 constants must fit in 32 bits. */
36557 gcc_assert (x86_64_immediate_operand (*loc, mode));
36559 mode = SImode;
36560 break;
36562 case SImode:
36563 case HImode:
36564 case QImode:
36565 break;
36567 default:
36568 gcc_unreachable ();
36571 /* Avoid overflows. */
36572 if (mode_signbit_p (mode, *loc))
36573 return false;
36575 val = INTVAL (*loc);
36577 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
36578 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
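/* Sign-extended 8-bit immediates cover [-128, 127], so e.g.
   "addl $128, %eax" needs a 4-byte immediate while the equivalent
   "subl $-128, %eax" needs only 1 byte; hence 128 and -128 are the two
   values for which the sign/op swap goes the other way.  */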
36579 if ((val < 0 && val != -128)
36580 || val == 128)
36582 *loc = GEN_INT (-val);
36583 return true;
36586 return false;
36589 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
36590 optabs would emit if we didn't have TFmode patterns. */
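/* If the input is non-negative we convert it directly.  Otherwise we
   halve it while folding the low bit back in, (in >> 1) | (in & 1),
   convert the now-positive value, and double the result; keeping the
   low bit visible lets the conversion of the halved value still round
   correctly.  */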
36592 void
36593 x86_emit_floatuns (rtx operands[2])
36595 rtx neglab, donelab, i0, i1, f0, in, out;
36596 enum machine_mode mode, inmode;
36598 inmode = GET_MODE (operands[1]);
36599 gcc_assert (inmode == SImode || inmode == DImode);
36601 out = operands[0];
36602 in = force_reg (inmode, operands[1]);
36603 mode = GET_MODE (out);
36604 neglab = gen_label_rtx ();
36605 donelab = gen_label_rtx ();
36606 f0 = gen_reg_rtx (mode);
36608 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
36610 expand_float (out, in, 0);
36612 emit_jump_insn (gen_jump (donelab));
36613 emit_barrier ();
36615 emit_label (neglab);
36617 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
36618 1, OPTAB_DIRECT);
36619 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
36620 1, OPTAB_DIRECT);
36621 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
36623 expand_float (f0, i0, 0);
36625 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
36627 emit_label (donelab);
36630 /* AVX512F supports 64-byte integer vector operations,
36631 so the longest vector we are faced with is V64QImode. */
36632 #define MAX_VECT_LEN 64
36634 struct expand_vec_perm_d
36636 rtx target, op0, op1;
36637 unsigned char perm[MAX_VECT_LEN];
36638 enum machine_mode vmode;
36639 unsigned char nelt;
36640 bool one_operand_p;
36641 bool testing_p;
36644 static bool canonicalize_perm (struct expand_vec_perm_d *d);
36645 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
36646 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
36648 /* Get a vector mode of the same size as the original but with elements
36649 twice as wide. This is only guaranteed to apply to integral vectors. */
36651 static inline enum machine_mode
36652 get_mode_wider_vector (enum machine_mode o)
36654 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
36655 enum machine_mode n = GET_MODE_WIDER_MODE (o);
36656 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
36657 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
36658 return n;
36661 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
36662 with all elements equal to VAR. Return true if successful. */
36664 static bool
36665 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
36666 rtx target, rtx val)
36668 bool ok;
36670 switch (mode)
36672 case V2SImode:
36673 case V2SFmode:
36674 if (!mmx_ok)
36675 return false;
36676 /* FALLTHRU */
36678 case V4DFmode:
36679 case V4DImode:
36680 case V8SFmode:
36681 case V8SImode:
36682 case V2DFmode:
36683 case V2DImode:
36684 case V4SFmode:
36685 case V4SImode:
36687 rtx insn, dup;
36689 /* First attempt to recognize VAL as-is. */
36690 dup = gen_rtx_VEC_DUPLICATE (mode, val);
36691 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
36692 if (recog_memoized (insn) < 0)
36694 rtx seq;
36695 /* If that fails, force VAL into a register. */
36697 start_sequence ();
36698 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
36699 seq = get_insns ();
36700 end_sequence ();
36701 if (seq)
36702 emit_insn_before (seq, insn);
36704 ok = recog_memoized (insn) >= 0;
36705 gcc_assert (ok);
36708 return true;
36710 case V4HImode:
36711 if (!mmx_ok)
36712 return false;
36713 if (TARGET_SSE || TARGET_3DNOW_A)
36715 rtx x;
36717 val = gen_lowpart (SImode, val);
36718 x = gen_rtx_TRUNCATE (HImode, val);
36719 x = gen_rtx_VEC_DUPLICATE (mode, x);
36720 emit_insn (gen_rtx_SET (VOIDmode, target, x));
36721 return true;
36723 goto widen;
36725 case V8QImode:
36726 if (!mmx_ok)
36727 return false;
36728 goto widen;
36730 case V8HImode:
36731 if (TARGET_SSE2)
36733 struct expand_vec_perm_d dperm;
36734 rtx tmp1, tmp2;
36736 permute:
36737 memset (&dperm, 0, sizeof (dperm));
36738 dperm.target = target;
36739 dperm.vmode = mode;
36740 dperm.nelt = GET_MODE_NUNITS (mode);
36741 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
36742 dperm.one_operand_p = true;
36744 /* Extend to SImode using a paradoxical SUBREG. */
36745 tmp1 = gen_reg_rtx (SImode);
36746 emit_move_insn (tmp1, gen_lowpart (SImode, val));
36748 /* Insert the SImode value as low element of a V4SImode vector. */
36749 tmp2 = gen_reg_rtx (V4SImode);
36750 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
36751 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
36753 ok = (expand_vec_perm_1 (&dperm)
36754 || expand_vec_perm_broadcast_1 (&dperm));
36755 gcc_assert (ok);
36756 return ok;
36758 goto widen;
36760 case V16QImode:
36761 if (TARGET_SSE2)
36762 goto permute;
36763 goto widen;
36765 widen:
36766 /* Replicate the value once into the next wider mode and recurse. */
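/* E.g. to broadcast a QImode value into V16QImode without SSE2, build
   (val << 8) | val in HImode and recurse with V8HImode, and so on until
   a mode with a directly supported duplicate is reached.  */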
36768 enum machine_mode smode, wsmode, wvmode;
36769 rtx x;
36771 smode = GET_MODE_INNER (mode);
36772 wvmode = get_mode_wider_vector (mode);
36773 wsmode = GET_MODE_INNER (wvmode);
36775 val = convert_modes (wsmode, smode, val, true);
36776 x = expand_simple_binop (wsmode, ASHIFT, val,
36777 GEN_INT (GET_MODE_BITSIZE (smode)),
36778 NULL_RTX, 1, OPTAB_LIB_WIDEN);
36779 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
36781 x = gen_reg_rtx (wvmode);
36782 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
36783 gcc_assert (ok);
36784 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
36785 return ok;
36788 case V16HImode:
36789 case V32QImode:
36791 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
36792 rtx x = gen_reg_rtx (hvmode);
36794 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
36795 gcc_assert (ok);
36797 x = gen_rtx_VEC_CONCAT (mode, x, x);
36798 emit_insn (gen_rtx_SET (VOIDmode, target, x));
36800 return true;
36802 default:
36803 return false;
36807 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
36808 whose ONE_VAR element is VAR, and other elements are zero. Return true
36809 if successful. */
36811 static bool
36812 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
36813 rtx target, rtx var, int one_var)
36815 enum machine_mode vsimode;
36816 rtx new_target;
36817 rtx x, tmp;
36818 bool use_vector_set = false;
36820 switch (mode)
36822 case V2DImode:
36823 /* For SSE4.1, we normally use vector set. But if the second
36824 element is zero and inter-unit moves are OK, we use movq
36825 instead. */
36826 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
36827 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
36828 && one_var == 0));
36829 break;
36830 case V16QImode:
36831 case V4SImode:
36832 case V4SFmode:
36833 use_vector_set = TARGET_SSE4_1;
36834 break;
36835 case V8HImode:
36836 use_vector_set = TARGET_SSE2;
36837 break;
36838 case V4HImode:
36839 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
36840 break;
36841 case V32QImode:
36842 case V16HImode:
36843 case V8SImode:
36844 case V8SFmode:
36845 case V4DFmode:
36846 use_vector_set = TARGET_AVX;
36847 break;
36848 case V4DImode:
36849 /* Use ix86_expand_vector_set in 64bit mode only. */
36850 use_vector_set = TARGET_AVX && TARGET_64BIT;
36851 break;
36852 default:
36853 break;
36856 if (use_vector_set)
36858 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
36859 var = force_reg (GET_MODE_INNER (mode), var);
36860 ix86_expand_vector_set (mmx_ok, target, var, one_var);
36861 return true;
36864 switch (mode)
36866 case V2SFmode:
36867 case V2SImode:
36868 if (!mmx_ok)
36869 return false;
36870 /* FALLTHRU */
36872 case V2DFmode:
36873 case V2DImode:
36874 if (one_var != 0)
36875 return false;
36876 var = force_reg (GET_MODE_INNER (mode), var);
36877 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
36878 emit_insn (gen_rtx_SET (VOIDmode, target, x));
36879 return true;
36881 case V4SFmode:
36882 case V4SImode:
36883 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
36884 new_target = gen_reg_rtx (mode);
36885 else
36886 new_target = target;
36887 var = force_reg (GET_MODE_INNER (mode), var);
36888 x = gen_rtx_VEC_DUPLICATE (mode, var);
36889 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
36890 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
36891 if (one_var != 0)
36893 /* We need to shuffle the value to the correct position, so
36894 create a new pseudo to store the intermediate result. */
36896 /* With SSE2, we can use the integer shuffle insns. */
36897 if (mode != V4SFmode && TARGET_SSE2)
36899 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
36900 const1_rtx,
36901 GEN_INT (one_var == 1 ? 0 : 1),
36902 GEN_INT (one_var == 2 ? 0 : 1),
36903 GEN_INT (one_var == 3 ? 0 : 1)));
36904 if (target != new_target)
36905 emit_move_insn (target, new_target);
36906 return true;
36909 /* Otherwise convert the intermediate result to V4SFmode and
36910 use the SSE1 shuffle instructions. */
36911 if (mode != V4SFmode)
36913 tmp = gen_reg_rtx (V4SFmode);
36914 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
36916 else
36917 tmp = new_target;
36919 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
36920 const1_rtx,
36921 GEN_INT (one_var == 1 ? 0 : 1),
36922 GEN_INT (one_var == 2 ? 0+4 : 1+4),
36923 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
36925 if (mode != V4SFmode)
36926 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
36927 else if (tmp != target)
36928 emit_move_insn (target, tmp);
36930 else if (target != new_target)
36931 emit_move_insn (target, new_target);
36932 return true;
36934 case V8HImode:
36935 case V16QImode:
36936 vsimode = V4SImode;
36937 goto widen;
36938 case V4HImode:
36939 case V8QImode:
36940 if (!mmx_ok)
36941 return false;
36942 vsimode = V2SImode;
36943 goto widen;
36944 widen:
36945 if (one_var != 0)
36946 return false;
36948 /* Zero extend the variable element to SImode and recurse. */
36949 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
36951 x = gen_reg_rtx (vsimode);
36952 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
36953 var, one_var))
36954 gcc_unreachable ();
36956 emit_move_insn (target, gen_lowpart (mode, x));
36957 return true;
36959 default:
36960 return false;
36964 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
36965 consisting of the values in VALS. It is known that all elements
36966 except ONE_VAR are constants. Return true if successful. */
36968 static bool
36969 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
36970 rtx target, rtx vals, int one_var)
36972 rtx var = XVECEXP (vals, 0, one_var);
36973 enum machine_mode wmode;
36974 rtx const_vec, x;
36976 const_vec = copy_rtx (vals);
36977 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
36978 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
36980 switch (mode)
36982 case V2DFmode:
36983 case V2DImode:
36984 case V2SFmode:
36985 case V2SImode:
36986 /* For the two element vectors, it's just as easy to use
36987 the general case. */
36988 return false;
36990 case V4DImode:
36991 /* Use ix86_expand_vector_set in 64bit mode only. */
36992 if (!TARGET_64BIT)
36993 return false;
36994 case V4DFmode:
36995 case V8SFmode:
36996 case V8SImode:
36997 case V16HImode:
36998 case V32QImode:
36999 case V4SFmode:
37000 case V4SImode:
37001 case V8HImode:
37002 case V4HImode:
37003 break;
37005 case V16QImode:
37006 if (TARGET_SSE4_1)
37007 break;
37008 wmode = V8HImode;
37009 goto widen;
37010 case V8QImode:
37011 wmode = V4HImode;
37012 goto widen;
37013 widen:
37014 /* There's no way to set one QImode entry easily. Combine
37015 the variable value with its adjacent constant value, and
37016 promote to an HImode set. */
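/* E.g. for one_var == 3 the pair is byte elements 2 and 3: the variable
   byte is shifted into the high half of an HImode value, the adjacent
   constant byte fills the low half, and the combined HImode value is
   inserted at position one_var >> 1.  */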
37017 x = XVECEXP (vals, 0, one_var ^ 1);
37018 if (one_var & 1)
37020 var = convert_modes (HImode, QImode, var, true);
37021 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
37022 NULL_RTX, 1, OPTAB_LIB_WIDEN);
37023 x = GEN_INT (INTVAL (x) & 0xff);
37025 else
37027 var = convert_modes (HImode, QImode, var, true);
37028 x = gen_int_mode (INTVAL (x) << 8, HImode);
37030 if (x != const0_rtx)
37031 var = expand_simple_binop (HImode, IOR, var, x, var,
37032 1, OPTAB_LIB_WIDEN);
37034 x = gen_reg_rtx (wmode);
37035 emit_move_insn (x, gen_lowpart (wmode, const_vec));
37036 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
37038 emit_move_insn (target, gen_lowpart (mode, x));
37039 return true;
37041 default:
37042 return false;
37045 emit_move_insn (target, const_vec);
37046 ix86_expand_vector_set (mmx_ok, target, var, one_var);
37047 return true;
37050 /* A subroutine of ix86_expand_vector_init_general. Use vector
37051 concatenate to handle the most general case: all values variable,
37052 and none identical. */
37054 static void
37055 ix86_expand_vector_init_concat (enum machine_mode mode,
37056 rtx target, rtx *ops, int n)
37058 enum machine_mode cmode, hmode = VOIDmode;
37059 rtx first[8], second[4];
37060 rtvec v;
37061 int i, j;
37063 switch (n)
37065 case 2:
37066 switch (mode)
37068 case V8SImode:
37069 cmode = V4SImode;
37070 break;
37071 case V8SFmode:
37072 cmode = V4SFmode;
37073 break;
37074 case V4DImode:
37075 cmode = V2DImode;
37076 break;
37077 case V4DFmode:
37078 cmode = V2DFmode;
37079 break;
37080 case V4SImode:
37081 cmode = V2SImode;
37082 break;
37083 case V4SFmode:
37084 cmode = V2SFmode;
37085 break;
37086 case V2DImode:
37087 cmode = DImode;
37088 break;
37089 case V2SImode:
37090 cmode = SImode;
37091 break;
37092 case V2DFmode:
37093 cmode = DFmode;
37094 break;
37095 case V2SFmode:
37096 cmode = SFmode;
37097 break;
37098 default:
37099 gcc_unreachable ();
37102 if (!register_operand (ops[1], cmode))
37103 ops[1] = force_reg (cmode, ops[1]);
37104 if (!register_operand (ops[0], cmode))
37105 ops[0] = force_reg (cmode, ops[0]);
37106 emit_insn (gen_rtx_SET (VOIDmode, target,
37107 gen_rtx_VEC_CONCAT (mode, ops[0],
37108 ops[1])));
37109 break;
37111 case 4:
37112 switch (mode)
37114 case V4DImode:
37115 cmode = V2DImode;
37116 break;
37117 case V4DFmode:
37118 cmode = V2DFmode;
37119 break;
37120 case V4SImode:
37121 cmode = V2SImode;
37122 break;
37123 case V4SFmode:
37124 cmode = V2SFmode;
37125 break;
37126 default:
37127 gcc_unreachable ();
37129 goto half;
37131 case 8:
37132 switch (mode)
37134 case V8SImode:
37135 cmode = V2SImode;
37136 hmode = V4SImode;
37137 break;
37138 case V8SFmode:
37139 cmode = V2SFmode;
37140 hmode = V4SFmode;
37141 break;
37142 default:
37143 gcc_unreachable ();
37145 goto half;
37147 half:
37148 /* FIXME: We process inputs backward to help RA. PR 36222. */
37149 i = n - 1;
37150 j = (n >> 1) - 1;
37151 for (; i > 0; i -= 2, j--)
37153 first[j] = gen_reg_rtx (cmode);
37154 v = gen_rtvec (2, ops[i - 1], ops[i]);
37155 ix86_expand_vector_init (false, first[j],
37156 gen_rtx_PARALLEL (cmode, v));
37159 n >>= 1;
37160 if (n > 2)
37162 gcc_assert (hmode != VOIDmode);
37163 for (i = j = 0; i < n; i += 2, j++)
37165 second[j] = gen_reg_rtx (hmode);
37166 ix86_expand_vector_init_concat (hmode, second [j],
37167 &first [i], 2);
37169 n >>= 1;
37170 ix86_expand_vector_init_concat (mode, target, second, n);
37172 else
37173 ix86_expand_vector_init_concat (mode, target, first, n);
37174 break;
37176 default:
37177 gcc_unreachable ();
37181 /* A subroutine of ix86_expand_vector_init_general. Use vector
37182 interleave to handle the most general case: all values variable,
37183 and none identical. */
37185 static void
37186 ix86_expand_vector_init_interleave (enum machine_mode mode,
37187 rtx target, rtx *ops, int n)
37189 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
37190 int i, j;
37191 rtx op0, op1;
37192 rtx (*gen_load_even) (rtx, rtx, rtx);
37193 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
37194 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
37196 switch (mode)
37198 case V8HImode:
37199 gen_load_even = gen_vec_setv8hi;
37200 gen_interleave_first_low = gen_vec_interleave_lowv4si;
37201 gen_interleave_second_low = gen_vec_interleave_lowv2di;
37202 inner_mode = HImode;
37203 first_imode = V4SImode;
37204 second_imode = V2DImode;
37205 third_imode = VOIDmode;
37206 break;
37207 case V16QImode:
37208 gen_load_even = gen_vec_setv16qi;
37209 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
37210 gen_interleave_second_low = gen_vec_interleave_lowv4si;
37211 inner_mode = QImode;
37212 first_imode = V8HImode;
37213 second_imode = V4SImode;
37214 third_imode = V2DImode;
37215 break;
37216 default:
37217 gcc_unreachable ();
37220 for (i = 0; i < n; i++)
37222 /* Extend the odd element to SImode using a paradoxical SUBREG. */
37223 op0 = gen_reg_rtx (SImode);
37224 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
37226 /* Insert the SImode value as low element of V4SImode vector. */
37227 op1 = gen_reg_rtx (V4SImode);
37228 op0 = gen_rtx_VEC_MERGE (V4SImode,
37229 gen_rtx_VEC_DUPLICATE (V4SImode,
37230 op0),
37231 CONST0_RTX (V4SImode),
37232 const1_rtx);
37233 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
37235 /* Cast the V4SImode vector back to a vector in the original mode. */
37236 op0 = gen_reg_rtx (mode);
37237 emit_move_insn (op0, gen_lowpart (mode, op1));
37239 /* Load even elements into the second position. */
37240 emit_insn (gen_load_even (op0,
37241 force_reg (inner_mode,
37242 ops [i + i + 1]),
37243 const1_rtx));
37245 /* Cast vector to FIRST_IMODE vector. */
37246 ops[i] = gen_reg_rtx (first_imode);
37247 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
37250 /* Interleave low FIRST_IMODE vectors. */
37251 for (i = j = 0; i < n; i += 2, j++)
37253 op0 = gen_reg_rtx (first_imode);
37254 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
37256 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
37257 ops[j] = gen_reg_rtx (second_imode);
37258 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
37261 /* Interleave low SECOND_IMODE vectors. */
37262 switch (second_imode)
37264 case V4SImode:
37265 for (i = j = 0; i < n / 2; i += 2, j++)
37267 op0 = gen_reg_rtx (second_imode);
37268 emit_insn (gen_interleave_second_low (op0, ops[i],
37269 ops[i + 1]));
37271 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
37272 vector. */
37273 ops[j] = gen_reg_rtx (third_imode);
37274 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
37276 second_imode = V2DImode;
37277 gen_interleave_second_low = gen_vec_interleave_lowv2di;
37278 /* FALLTHRU */
37280 case V2DImode:
37281 op0 = gen_reg_rtx (second_imode);
37282 emit_insn (gen_interleave_second_low (op0, ops[0],
37283 ops[1]));
37285 /* Cast the SECOND_IMODE vector back to a vector in the original
37286 mode. */
37287 emit_insn (gen_rtx_SET (VOIDmode, target,
37288 gen_lowpart (mode, op0)));
37289 break;
37291 default:
37292 gcc_unreachable ();
37296 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
37297 all values variable, and none identical. */
37299 static void
37300 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
37301 rtx target, rtx vals)
37303 rtx ops[32], op0, op1;
37304 enum machine_mode half_mode = VOIDmode;
37305 int n, i;
37307 switch (mode)
37309 case V2SFmode:
37310 case V2SImode:
37311 if (!mmx_ok && !TARGET_SSE)
37312 break;
37313 /* FALLTHRU */
37315 case V8SFmode:
37316 case V8SImode:
37317 case V4DFmode:
37318 case V4DImode:
37319 case V4SFmode:
37320 case V4SImode:
37321 case V2DFmode:
37322 case V2DImode:
37323 n = GET_MODE_NUNITS (mode);
37324 for (i = 0; i < n; i++)
37325 ops[i] = XVECEXP (vals, 0, i);
37326 ix86_expand_vector_init_concat (mode, target, ops, n);
37327 return;
37329 case V32QImode:
37330 half_mode = V16QImode;
37331 goto half;
37333 case V16HImode:
37334 half_mode = V8HImode;
37335 goto half;
37337 half:
37338 n = GET_MODE_NUNITS (mode);
37339 for (i = 0; i < n; i++)
37340 ops[i] = XVECEXP (vals, 0, i);
37341 op0 = gen_reg_rtx (half_mode);
37342 op1 = gen_reg_rtx (half_mode);
37343 ix86_expand_vector_init_interleave (half_mode, op0, ops,
37344 n >> 2);
37345 ix86_expand_vector_init_interleave (half_mode, op1,
37346 &ops [n >> 1], n >> 2);
37347 emit_insn (gen_rtx_SET (VOIDmode, target,
37348 gen_rtx_VEC_CONCAT (mode, op0, op1)));
37349 return;
37351 case V16QImode:
37352 if (!TARGET_SSE4_1)
37353 break;
37354 /* FALLTHRU */
37356 case V8HImode:
37357 if (!TARGET_SSE2)
37358 break;
37360 /* Don't use ix86_expand_vector_init_interleave if we can't
37361 move from GPR to SSE register directly. */
37362 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
37363 break;
37365 n = GET_MODE_NUNITS (mode);
37366 for (i = 0; i < n; i++)
37367 ops[i] = XVECEXP (vals, 0, i);
37368 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
37369 return;
37371 case V4HImode:
37372 case V8QImode:
37373 break;
37375 default:
37376 gcc_unreachable ();
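/* Fallback for the remaining (MMX-sized) modes: pack the vector
   elements into word-sized integers with shifts and IORs, then assemble
   the vector from those words.  */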
37380 int i, j, n_elts, n_words, n_elt_per_word;
37381 enum machine_mode inner_mode;
37382 rtx words[4], shift;
37384 inner_mode = GET_MODE_INNER (mode);
37385 n_elts = GET_MODE_NUNITS (mode);
37386 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
37387 n_elt_per_word = n_elts / n_words;
37388 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
37390 for (i = 0; i < n_words; ++i)
37392 rtx word = NULL_RTX;
37394 for (j = 0; j < n_elt_per_word; ++j)
37396 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
37397 elt = convert_modes (word_mode, inner_mode, elt, true);
37399 if (j == 0)
37400 word = elt;
37401 else
37403 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
37404 word, 1, OPTAB_LIB_WIDEN);
37405 word = expand_simple_binop (word_mode, IOR, word, elt,
37406 word, 1, OPTAB_LIB_WIDEN);
37410 words[i] = word;
37413 if (n_words == 1)
37414 emit_move_insn (target, gen_lowpart (mode, words[0]));
37415 else if (n_words == 2)
37417 rtx tmp = gen_reg_rtx (mode);
37418 emit_clobber (tmp);
37419 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
37420 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
37421 emit_move_insn (target, tmp);
37423 else if (n_words == 4)
37425 rtx tmp = gen_reg_rtx (V4SImode);
37426 gcc_assert (word_mode == SImode);
37427 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
37428 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
37429 emit_move_insn (target, gen_lowpart (mode, tmp));
37431 else
37432 gcc_unreachable ();
37436 /* Initialize vector TARGET via VALS. Suppress the use of MMX
37437 instructions unless MMX_OK is true. */
37439 void
37440 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
37442 enum machine_mode mode = GET_MODE (target);
37443 enum machine_mode inner_mode = GET_MODE_INNER (mode);
37444 int n_elts = GET_MODE_NUNITS (mode);
37445 int n_var = 0, one_var = -1;
37446 bool all_same = true, all_const_zero = true;
37447 int i;
37448 rtx x;
37450 for (i = 0; i < n_elts; ++i)
37452 x = XVECEXP (vals, 0, i);
37453 if (!(CONST_INT_P (x)
37454 || GET_CODE (x) == CONST_DOUBLE
37455 || GET_CODE (x) == CONST_FIXED))
37456 n_var++, one_var = i;
37457 else if (x != CONST0_RTX (inner_mode))
37458 all_const_zero = false;
37459 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
37460 all_same = false;
37463 /* Constants are best loaded from the constant pool. */
37464 if (n_var == 0)
37466 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
37467 return;
37470 /* If all values are identical, broadcast the value. */
37471 if (all_same
37472 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
37473 XVECEXP (vals, 0, 0)))
37474 return;
37476 /* Values where only one field is non-constant are best loaded from
37477 the pool and overwritten via move later. */
37478 if (n_var == 1)
37480 if (all_const_zero
37481 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
37482 XVECEXP (vals, 0, one_var),
37483 one_var))
37484 return;
37486 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
37487 return;
37490 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
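/* Store scalar value VAL into element ELT of vector TARGET.  Use of MMX
   instructions is suppressed unless MMX_OK is true.  */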
37493 void
37494 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
37496 enum machine_mode mode = GET_MODE (target);
37497 enum machine_mode inner_mode = GET_MODE_INNER (mode);
37498 enum machine_mode half_mode;
37499 bool use_vec_merge = false;
37500 rtx tmp;
37501 static rtx (*gen_extract[6][2]) (rtx, rtx)
37503 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
37504 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
37505 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
37506 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
37507 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
37508 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
37510 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
37512 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
37513 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
37514 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
37515 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
37516 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
37517 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
37519 int i, j, n;
37521 switch (mode)
37523 case V2SFmode:
37524 case V2SImode:
37525 if (mmx_ok)
37527 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
37528 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
37529 if (elt == 0)
37530 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
37531 else
37532 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
37533 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37534 return;
37536 break;
37538 case V2DImode:
37539 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
37540 if (use_vec_merge)
37541 break;
37543 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
37544 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
37545 if (elt == 0)
37546 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
37547 else
37548 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
37549 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37550 return;
37552 case V2DFmode:
37554 rtx op0, op1;
37556 /* For the two element vectors, we implement a VEC_CONCAT with
37557 the extraction of the other element. */
37559 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
37560 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
37562 if (elt == 0)
37563 op0 = val, op1 = tmp;
37564 else
37565 op0 = tmp, op1 = val;
37567 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
37568 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37570 return;
37572 case V4SFmode:
37573 use_vec_merge = TARGET_SSE4_1;
37574 if (use_vec_merge)
37575 break;
37577 switch (elt)
37579 case 0:
37580 use_vec_merge = true;
37581 break;
37583 case 1:
37584 /* tmp = target = A B C D */
37585 tmp = copy_to_reg (target);
37586 /* target = A A B B */
37587 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
37588 /* target = X A B B */
37589 ix86_expand_vector_set (false, target, val, 0);
37590 /* target = A X C D */
37591 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
37592 const1_rtx, const0_rtx,
37593 GEN_INT (2+4), GEN_INT (3+4)));
37594 return;
37596 case 2:
37597 /* tmp = target = A B C D */
37598 tmp = copy_to_reg (target);
37599 /* tmp = X B C D */
37600 ix86_expand_vector_set (false, tmp, val, 0);
37601 /* target = A B X D */
37602 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
37603 const0_rtx, const1_rtx,
37604 GEN_INT (0+4), GEN_INT (3+4)));
37605 return;
37607 case 3:
37608 /* tmp = target = A B C D */
37609 tmp = copy_to_reg (target);
37610 /* tmp = X B C D */
37611 ix86_expand_vector_set (false, tmp, val, 0);
37612 /* target = A B C X */
37613 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
37614 const0_rtx, const1_rtx,
37615 GEN_INT (2+4), GEN_INT (0+4)));
37616 return;
37618 default:
37619 gcc_unreachable ();
37621 break;
37623 case V4SImode:
37624 use_vec_merge = TARGET_SSE4_1;
37625 if (use_vec_merge)
37626 break;
37628 /* Element 0 handled by vec_merge below. */
37629 if (elt == 0)
37631 use_vec_merge = true;
37632 break;
37635 if (TARGET_SSE2)
37637 /* With SSE2, use integer shuffles to swap element 0 and ELT,
37638 store into element 0, then shuffle them back. */
37640 rtx order[4];
37642 order[0] = GEN_INT (elt);
37643 order[1] = const1_rtx;
37644 order[2] = const2_rtx;
37645 order[3] = GEN_INT (3);
37646 order[elt] = const0_rtx;
37648 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
37649 order[1], order[2], order[3]));
37651 ix86_expand_vector_set (false, target, val, 0);
37653 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
37654 order[1], order[2], order[3]));
37656 else
37658 /* For SSE1, we have to reuse the V4SF code. */
37659 rtx t = gen_reg_rtx (V4SFmode);
37660 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
37661 emit_move_insn (target, gen_lowpart (mode, t));
37663 return;
37665 case V8HImode:
37666 use_vec_merge = TARGET_SSE2;
37667 break;
37668 case V4HImode:
37669 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
37670 break;
37672 case V16QImode:
37673 use_vec_merge = TARGET_SSE4_1;
37674 break;
37676 case V8QImode:
37677 break;
37679 case V32QImode:
37680 half_mode = V16QImode;
37681 j = 0;
37682 n = 16;
37683 goto half;
37685 case V16HImode:
37686 half_mode = V8HImode;
37687 j = 1;
37688 n = 8;
37689 goto half;
37691 case V8SImode:
37692 half_mode = V4SImode;
37693 j = 2;
37694 n = 4;
37695 goto half;
37697 case V4DImode:
37698 half_mode = V2DImode;
37699 j = 3;
37700 n = 2;
37701 goto half;
37703 case V8SFmode:
37704 half_mode = V4SFmode;
37705 j = 4;
37706 n = 4;
37707 goto half;
37709 case V4DFmode:
37710 half_mode = V2DFmode;
37711 j = 5;
37712 n = 2;
37713 goto half;
37715 half:
37716 /* Compute offset. */
37717 i = elt / n;
37718 elt %= n;
37720 gcc_assert (i <= 1);
37722 /* Extract the half. */
37723 tmp = gen_reg_rtx (half_mode);
37724 emit_insn (gen_extract[j][i] (tmp, target));
37726 /* Put val in tmp at elt. */
37727 ix86_expand_vector_set (false, tmp, val, elt);
37729 /* Put it back. */
37730 emit_insn (gen_insert[j][i] (target, target, tmp));
37731 return;
37733 default:
37734 break;
37737 if (use_vec_merge)
37739 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
37740 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
37741 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37743 else
37745 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
37747 emit_move_insn (mem, target);
37749 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
37750 emit_move_insn (tmp, val);
37752 emit_move_insn (target, mem);
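/* Extract element ELT of vector VEC into scalar TARGET.  Use of MMX
   instructions is suppressed unless MMX_OK is true.  */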
37756 void
37757 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
37759 enum machine_mode mode = GET_MODE (vec);
37760 enum machine_mode inner_mode = GET_MODE_INNER (mode);
37761 bool use_vec_extr = false;
37762 rtx tmp;
37764 switch (mode)
37766 case V2SImode:
37767 case V2SFmode:
37768 if (!mmx_ok)
37769 break;
37770 /* FALLTHRU */
37772 case V2DFmode:
37773 case V2DImode:
37774 use_vec_extr = true;
37775 break;
37777 case V4SFmode:
37778 use_vec_extr = TARGET_SSE4_1;
37779 if (use_vec_extr)
37780 break;
37782 switch (elt)
37784 case 0:
37785 tmp = vec;
37786 break;
37788 case 1:
37789 case 3:
37790 tmp = gen_reg_rtx (mode);
37791 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
37792 GEN_INT (elt), GEN_INT (elt),
37793 GEN_INT (elt+4), GEN_INT (elt+4)));
37794 break;
37796 case 2:
37797 tmp = gen_reg_rtx (mode);
37798 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
37799 break;
37801 default:
37802 gcc_unreachable ();
37804 vec = tmp;
37805 use_vec_extr = true;
37806 elt = 0;
37807 break;
37809 case V4SImode:
37810 use_vec_extr = TARGET_SSE4_1;
37811 if (use_vec_extr)
37812 break;
37814 if (TARGET_SSE2)
37816 switch (elt)
37818 case 0:
37819 tmp = vec;
37820 break;
37822 case 1:
37823 case 3:
37824 tmp = gen_reg_rtx (mode);
37825 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
37826 GEN_INT (elt), GEN_INT (elt),
37827 GEN_INT (elt), GEN_INT (elt)));
37828 break;
37830 case 2:
37831 tmp = gen_reg_rtx (mode);
37832 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
37833 break;
37835 default:
37836 gcc_unreachable ();
37838 vec = tmp;
37839 use_vec_extr = true;
37840 elt = 0;
37842 else
37844 /* For SSE1, we have to reuse the V4SF code. */
37845 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
37846 gen_lowpart (V4SFmode, vec), elt);
37847 return;
37849 break;
37851 case V8HImode:
37852 use_vec_extr = TARGET_SSE2;
37853 break;
37854 case V4HImode:
37855 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
37856 break;
37858 case V16QImode:
37859 use_vec_extr = TARGET_SSE4_1;
37860 break;
37862 case V8SFmode:
37863 if (TARGET_AVX)
37865 tmp = gen_reg_rtx (V4SFmode);
37866 if (elt < 4)
37867 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
37868 else
37869 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
37870 ix86_expand_vector_extract (false, target, tmp, elt & 3);
37871 return;
37873 break;
37875 case V4DFmode:
37876 if (TARGET_AVX)
37878 tmp = gen_reg_rtx (V2DFmode);
37879 if (elt < 2)
37880 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
37881 else
37882 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
37883 ix86_expand_vector_extract (false, target, tmp, elt & 1);
37884 return;
37886 break;
37888 case V32QImode:
37889 if (TARGET_AVX)
37891 tmp = gen_reg_rtx (V16QImode);
37892 if (elt < 16)
37893 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
37894 else
37895 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
37896 ix86_expand_vector_extract (false, target, tmp, elt & 15);
37897 return;
37899 break;
37901 case V16HImode:
37902 if (TARGET_AVX)
37904 tmp = gen_reg_rtx (V8HImode);
37905 if (elt < 8)
37906 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
37907 else
37908 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
37909 ix86_expand_vector_extract (false, target, tmp, elt & 7);
37910 return;
37912 break;
37914 case V8SImode:
37915 if (TARGET_AVX)
37917 tmp = gen_reg_rtx (V4SImode);
37918 if (elt < 4)
37919 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
37920 else
37921 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
37922 ix86_expand_vector_extract (false, target, tmp, elt & 3);
37923 return;
37925 break;
37927 case V4DImode:
37928 if (TARGET_AVX)
37930 tmp = gen_reg_rtx (V2DImode);
37931 if (elt < 2)
37932 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
37933 else
37934 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
37935 ix86_expand_vector_extract (false, target, tmp, elt & 1);
37936 return;
37938 break;
37940 case V8QImode:
37941 /* ??? Could extract the appropriate HImode element and shift. */
37942 default:
37943 break;
37946 if (use_vec_extr)
37948 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
37949 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
37951 /* Let the rtl optimizers know about the zero extension performed. */
37952 if (inner_mode == QImode || inner_mode == HImode)
37954 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
37955 target = gen_lowpart (SImode, target);
37958 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37960 else
37962 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
37964 emit_move_insn (mem, vec);
37966 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
37967 emit_move_insn (target, tmp);
37971 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
37972 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
37973 The upper bits of DEST are undefined, though they shouldn't cause
37974 exceptions (some bits from src or all zeros are ok). */
37976 static void
37977 emit_reduc_half (rtx dest, rtx src, int i)
37979 rtx tem, d = dest;
37980 switch (GET_MODE (src))
37982 case V4SFmode:
37983 if (i == 128)
37984 tem = gen_sse_movhlps (dest, src, src);
37985 else
37986 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
37987 GEN_INT (1 + 4), GEN_INT (1 + 4));
37988 break;
37989 case V2DFmode:
37990 tem = gen_vec_interleave_highv2df (dest, src, src);
37991 break;
37992 case V16QImode:
37993 case V8HImode:
37994 case V4SImode:
37995 case V2DImode:
37996 d = gen_reg_rtx (V1TImode);
37997 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
37998 GEN_INT (i / 2));
37999 break;
38000 case V8SFmode:
38001 if (i == 256)
38002 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
38003 else
38004 tem = gen_avx_shufps256 (dest, src, src,
38005 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
38006 break;
38007 case V4DFmode:
38008 if (i == 256)
38009 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
38010 else
38011 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
38012 break;
38013 case V32QImode:
38014 case V16HImode:
38015 case V8SImode:
38016 case V4DImode:
38017 if (i == 256)
38019 if (GET_MODE (dest) != V4DImode)
38020 d = gen_reg_rtx (V4DImode);
38021 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
38022 gen_lowpart (V4DImode, src),
38023 const1_rtx);
38025 else
38027 d = gen_reg_rtx (V2TImode);
38028 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
38029 GEN_INT (i / 2));
38031 break;
38032 default:
38033 gcc_unreachable ();
38035 emit_insn (tem);
38036 if (d != dest)
38037 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
38040 /* Expand a vector reduction. FN is the binary pattern to reduce;
38041 DEST is the destination; IN is the input vector. */
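/* E.g. for a V4SImode reduction the loop below runs with i = 128 and
   then i = 64: it first combines elements {0,1} with {2,3} and then
   element 0 with element 1, leaving the result in element 0 of DEST
   (the remaining elements are undefined).  */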
38043 void
38044 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
38046 rtx half, dst, vec = in;
38047 enum machine_mode mode = GET_MODE (in);
38048 int i;
38050 /* SSE4.1 has a special instruction (phminposuw) for V8HImode UMIN reduction. */
38051 if (TARGET_SSE4_1
38052 && mode == V8HImode
38053 && fn == gen_uminv8hi3)
38055 emit_insn (gen_sse4_1_phminposuw (dest, in));
38056 return;
38059 for (i = GET_MODE_BITSIZE (mode);
38060 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
38061 i >>= 1)
38063 half = gen_reg_rtx (mode);
38064 emit_reduc_half (half, vec, i);
38065 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
38066 dst = dest;
38067 else
38068 dst = gen_reg_rtx (mode);
38069 emit_insn (fn (dst, half, vec));
38070 vec = dst;
38074 /* Target hook for scalar_mode_supported_p. */
38075 static bool
38076 ix86_scalar_mode_supported_p (enum machine_mode mode)
38078 if (DECIMAL_FLOAT_MODE_P (mode))
38079 return default_decimal_float_supported_p ();
38080 else if (mode == TFmode)
38081 return true;
38082 else
38083 return default_scalar_mode_supported_p (mode);
38086 /* Implements target hook vector_mode_supported_p. */
38087 static bool
38088 ix86_vector_mode_supported_p (enum machine_mode mode)
38090 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
38091 return true;
38092 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
38093 return true;
38094 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
38095 return true;
38096 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
38097 return true;
38098 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
38099 return true;
38100 return false;
38103 /* Target hook for c_mode_for_suffix. */
38104 static enum machine_mode
38105 ix86_c_mode_for_suffix (char suffix)
38107 if (suffix == 'q')
38108 return TFmode;
38109 if (suffix == 'w')
38110 return XFmode;
38112 return VOIDmode;
38115 /* Worker function for TARGET_MD_ASM_CLOBBERS.
38117 We do this in the new i386 backend to maintain source compatibility
38118 with the old cc0-based compiler. */
38120 static tree
38121 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
38122 tree inputs ATTRIBUTE_UNUSED,
38123 tree clobbers)
38125 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
38126 clobbers);
38127 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
38128 clobbers);
38129 return clobbers;
38132 /* Implements the target hook targetm.asm.encode_section_info. */
38134 static void ATTRIBUTE_UNUSED
38135 ix86_encode_section_info (tree decl, rtx rtl, int first)
38137 default_encode_section_info (decl, rtl, first);
38139 if (TREE_CODE (decl) == VAR_DECL
38140 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
38141 && ix86_in_large_data_p (decl))
38142 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
38145 /* Worker function for REVERSE_CONDITION. */
38147 enum rtx_code
38148 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
38150 return (mode != CCFPmode && mode != CCFPUmode
38151 ? reverse_condition (code)
38152 : reverse_condition_maybe_unordered (code));
38155 /* Output code to perform an x87 FP register move, from OPERANDS[1]
38156 to OPERANDS[0]. */
38158 const char *
38159 output_387_reg_move (rtx insn, rtx *operands)
38161 if (REG_P (operands[0]))
38163 if (REG_P (operands[1])
38164 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
38166 if (REGNO (operands[0]) == FIRST_STACK_REG)
38167 return output_387_ffreep (operands, 0);
38168 return "fstp\t%y0";
38170 if (STACK_TOP_P (operands[0]))
38171 return "fld%Z1\t%y1";
38172 return "fst\t%y0";
38174 else if (MEM_P (operands[0]))
38176 gcc_assert (REG_P (operands[1]));
38177 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
38178 return "fstp%Z0\t%y0";
38179 else
38181 /* There is no non-popping store to memory for XFmode.
38182 So if we need one, follow the store with a load. */
38183 if (GET_MODE (operands[0]) == XFmode)
38184 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
38185 else
38186 return "fst%Z0\t%y0";
38189 else
38190 gcc_unreachable();
38193 /* Output code to perform a conditional jump to LABEL, if the C2 flag in
38194 the FP status register is set. */
38196 void
38197 ix86_emit_fp_unordered_jump (rtx label)
38199 rtx reg = gen_reg_rtx (HImode);
38200 rtx temp;
38202 emit_insn (gen_x86_fnstsw_1 (reg));
38204 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
38206 emit_insn (gen_x86_sahf_1 (reg));
38208 temp = gen_rtx_REG (CCmode, FLAGS_REG);
38209 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
38211 else
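/* Without sahf, test bit 2 of the high byte of the FNSTSW result,
   i.e. bit 10 (C2) of the x87 status word.  */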
38213 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
38215 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
38216 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
38219 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
38220 gen_rtx_LABEL_REF (VOIDmode, label),
38221 pc_rtx);
38222 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
38224 emit_jump_insn (temp);
38225 predict_jump (REG_BR_PROB_BASE * 10 / 100);
38228 /* Output code to perform a log1p XFmode calculation. */
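/* fyl2xp1 computes y * log2(x + 1) but is only specified for
   |x| < 1 - sqrt(2)/2 (about 0.29289), so for larger |x| we fall back to
   y * log2(1 + x) via fyl2x on the explicit sum; in both cases
   y = ln(2) (fldln2) turns the base-2 logarithm into a natural log.  */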
38230 void ix86_emit_i387_log1p (rtx op0, rtx op1)
38232 rtx label1 = gen_label_rtx ();
38233 rtx label2 = gen_label_rtx ();
38235 rtx tmp = gen_reg_rtx (XFmode);
38236 rtx tmp2 = gen_reg_rtx (XFmode);
38237 rtx test;
38239 emit_insn (gen_absxf2 (tmp, op1));
38240 test = gen_rtx_GE (VOIDmode, tmp,
38241 CONST_DOUBLE_FROM_REAL_VALUE (
38242 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
38243 XFmode));
38244 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
38246 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
38247 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
38248 emit_jump (label2);
38250 emit_label (label1);
38251 emit_move_insn (tmp, CONST1_RTX (XFmode));
38252 emit_insn (gen_addxf3 (tmp, op1, tmp));
38253 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
38254 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
38256 emit_label (label2);
38259 /* Emit code for round calculation. */
38260 void ix86_emit_i387_round (rtx op0, rtx op1)
38262 enum machine_mode inmode = GET_MODE (op1);
38263 enum machine_mode outmode = GET_MODE (op0);
38264 rtx e1, e2, res, tmp, tmp1, half;
38265 rtx scratch = gen_reg_rtx (HImode);
38266 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
38267 rtx jump_label = gen_label_rtx ();
38268 rtx insn;
38269 rtx (*gen_abs) (rtx, rtx);
38270 rtx (*gen_neg) (rtx, rtx);
38272 switch (inmode)
38274 case SFmode:
38275 gen_abs = gen_abssf2;
38276 break;
38277 case DFmode:
38278 gen_abs = gen_absdf2;
38279 break;
38280 case XFmode:
38281 gen_abs = gen_absxf2;
38282 break;
38283 default:
38284 gcc_unreachable ();
38287 switch (outmode)
38289 case SFmode:
38290 gen_neg = gen_negsf2;
38291 break;
38292 case DFmode:
38293 gen_neg = gen_negdf2;
38294 break;
38295 case XFmode:
38296 gen_neg = gen_negxf2;
38297 break;
38298 case HImode:
38299 gen_neg = gen_neghi2;
38300 break;
38301 case SImode:
38302 gen_neg = gen_negsi2;
38303 break;
38304 case DImode:
38305 gen_neg = gen_negdi2;
38306 break;
38307 default:
38308 gcc_unreachable ();
38311 e1 = gen_reg_rtx (inmode);
38312 e2 = gen_reg_rtx (inmode);
38313 res = gen_reg_rtx (outmode);
38315 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
38317 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
38319 /* scratch = fxam(op1) */
38320 emit_insn (gen_rtx_SET (VOIDmode, scratch,
38321 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
38322 UNSPEC_FXAM)));
38323 /* e1 = fabs(op1) */
38324 emit_insn (gen_abs (e1, op1));
38326 /* e2 = e1 + 0.5 */
38327 half = force_reg (inmode, half);
38328 emit_insn (gen_rtx_SET (VOIDmode, e2,
38329 gen_rtx_PLUS (inmode, e1, half)));
38331 /* res = floor(e2) */
38332 if (inmode != XFmode)
38334 tmp1 = gen_reg_rtx (XFmode);
38336 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
38337 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
38339 else
38340 tmp1 = e2;
38342 switch (outmode)
38344 case SFmode:
38345 case DFmode:
38347 rtx tmp0 = gen_reg_rtx (XFmode);
38349 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
38351 emit_insn (gen_rtx_SET (VOIDmode, res,
38352 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
38353 UNSPEC_TRUNC_NOOP)));
38355 break;
38356 case XFmode:
38357 emit_insn (gen_frndintxf2_floor (res, tmp1));
38358 break;
38359 case HImode:
38360 emit_insn (gen_lfloorxfhi2 (res, tmp1));
38361 break;
38362 case SImode:
38363 emit_insn (gen_lfloorxfsi2 (res, tmp1));
38364 break;
38365 case DImode:
38366 emit_insn (gen_lfloorxfdi2 (res, tmp1));
38367 break;
38368 default:
38369 gcc_unreachable ();
38372 /* flags = signbit(a) */
38373 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
38375 /* if (flags) then res = -res */
38376 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
38377 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
38378 gen_rtx_LABEL_REF (VOIDmode, jump_label),
38379 pc_rtx);
38380 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
38381 predict_jump (REG_BR_PROB_BASE * 50 / 100);
38382 JUMP_LABEL (insn) = jump_label;
38384 emit_insn (gen_neg (res, res));
38386 emit_label (jump_label);
38387 LABEL_NUSES (jump_label) = 1;
38389 emit_move_insn (op0, res);
38392 /* Output code to perform a Newton-Raphson approximation of a single precision
38393 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
38395 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
38397 rtx x0, x1, e0, e1;
38399 x0 = gen_reg_rtx (mode);
38400 e0 = gen_reg_rtx (mode);
38401 e1 = gen_reg_rtx (mode);
38402 x1 = gen_reg_rtx (mode);
38404 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
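/* This is one Newton-Raphson refinement x1 = x0 * (2 - b * x0) of the
   rcpss estimate x0 ~ 1/b, written so it maps onto the mult/plus/minus
   patterns below; it roughly doubles the ~12-bit accuracy of the
   hardware estimate.  */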
38406 b = force_reg (mode, b);
38408 /* x0 = rcp(b) estimate */
38409 emit_insn (gen_rtx_SET (VOIDmode, x0,
38410 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
38411 UNSPEC_RCP)));
38412 /* e0 = x0 * b */
38413 emit_insn (gen_rtx_SET (VOIDmode, e0,
38414 gen_rtx_MULT (mode, x0, b)));
38416 /* e0 = x0 * e0 */
38417 emit_insn (gen_rtx_SET (VOIDmode, e0,
38418 gen_rtx_MULT (mode, x0, e0)));
38420 /* e1 = x0 + x0 */
38421 emit_insn (gen_rtx_SET (VOIDmode, e1,
38422 gen_rtx_PLUS (mode, x0, x0)));
38424 /* x1 = e1 - e0 */
38425 emit_insn (gen_rtx_SET (VOIDmode, x1,
38426 gen_rtx_MINUS (mode, e1, e0)));
38428 /* res = a * x1 */
38429 emit_insn (gen_rtx_SET (VOIDmode, res,
38430 gen_rtx_MULT (mode, a, x1)));
38433 /* Output code to perform a Newton-Raphson approximation of a
38434 single precision floating point [reciprocal] square root. */
38436 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
38437 bool recip)
38439 rtx x0, e0, e1, e2, e3, mthree, mhalf;
38440 REAL_VALUE_TYPE r;
38442 x0 = gen_reg_rtx (mode);
38443 e0 = gen_reg_rtx (mode);
38444 e1 = gen_reg_rtx (mode);
38445 e2 = gen_reg_rtx (mode);
38446 e3 = gen_reg_rtx (mode);
38448 real_from_integer (&r, VOIDmode, -3, -1, 0);
38449 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
38451 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
38452 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
38454 if (VECTOR_MODE_P (mode))
38456 mthree = ix86_build_const_vector (mode, true, mthree);
38457 mhalf = ix86_build_const_vector (mode, true, mhalf);
38460 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
38461 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
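/* Both forms are one Newton-Raphson step x1 = x0 * (3 - a * x0 * x0) / 2
   applied to the rsqrtss estimate x0 ~ 1/sqrt(a); the constants are
   negated (-3, -0.5) so the update can be expressed with the plus and
   mult patterns emitted below.  */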
38463 a = force_reg (mode, a);
38465 /* x0 = rsqrt(a) estimate */
38466 emit_insn (gen_rtx_SET (VOIDmode, x0,
38467 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
38468 UNSPEC_RSQRT)));
38470 /* If (a == 0.0), filter out the infinite rsqrt estimate to prevent NaN for sqrt(0.0). */
38471 if (!recip)
38473 rtx zero, mask;
38475 zero = gen_reg_rtx (mode);
38476 mask = gen_reg_rtx (mode);
38478 zero = force_reg (mode, CONST0_RTX(mode));
38479 emit_insn (gen_rtx_SET (VOIDmode, mask,
38480 gen_rtx_NE (mode, zero, a)));
38482 emit_insn (gen_rtx_SET (VOIDmode, x0,
38483 gen_rtx_AND (mode, x0, mask)));
38486 /* e0 = x0 * a */
38487 emit_insn (gen_rtx_SET (VOIDmode, e0,
38488 gen_rtx_MULT (mode, x0, a)));
38489 /* e1 = e0 * x0 */
38490 emit_insn (gen_rtx_SET (VOIDmode, e1,
38491 gen_rtx_MULT (mode, e0, x0)));
38493 /* e2 = e1 - 3. */
38494 mthree = force_reg (mode, mthree);
38495 emit_insn (gen_rtx_SET (VOIDmode, e2,
38496 gen_rtx_PLUS (mode, e1, mthree)));
38498 mhalf = force_reg (mode, mhalf);
38499 if (recip)
38500 /* e3 = -.5 * x0 */
38501 emit_insn (gen_rtx_SET (VOIDmode, e3,
38502 gen_rtx_MULT (mode, x0, mhalf)));
38503 else
38504 /* e3 = -.5 * e0 */
38505 emit_insn (gen_rtx_SET (VOIDmode, e3,
38506 gen_rtx_MULT (mode, e0, mhalf)));
38507 /* ret = e2 * e3 */
38508 emit_insn (gen_rtx_SET (VOIDmode, res,
38509 gen_rtx_MULT (mode, e2, e3)));
38512 #ifdef TARGET_SOLARIS
38513 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
38515 static void
38516 i386_solaris_elf_named_section (const char *name, unsigned int flags,
38517 tree decl)
38519 /* With Binutils 2.15, the "@unwind" marker must be specified on
38520 every occurrence of the ".eh_frame" section, not just the first
38521 one. */
38522 if (TARGET_64BIT
38523 && strcmp (name, ".eh_frame") == 0)
38525 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
38526 flags & SECTION_WRITE ? "aw" : "a");
38527 return;
38530 #ifndef USE_GAS
38531 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
38533 solaris_elf_asm_comdat_section (name, flags, decl);
38534 return;
38536 #endif
38538 default_elf_asm_named_section (name, flags, decl);
38540 #endif /* TARGET_SOLARIS */
38542 /* Return the mangling of TYPE if it is an extended fundamental type. */
38544 static const char *
38545 ix86_mangle_type (const_tree type)
38547 type = TYPE_MAIN_VARIANT (type);
38549 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
38550 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
38551 return NULL;
38553 switch (TYPE_MODE (type))
38555 case TFmode:
38556 /* __float128 is "g". */
38557 return "g";
38558 case XFmode:
38559 /* "long double" or __float80 is "e". */
38560 return "e";
38561 default:
38562 return NULL;
38566 /* For 32-bit code we can save the PIC register setup by using the
38567 hidden function __stack_chk_fail_local instead of calling
38568 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
38569 register, so it is better to call __stack_chk_fail directly. */
38571 static tree ATTRIBUTE_UNUSED
38572 ix86_stack_protect_fail (void)
38574 return TARGET_64BIT
38575 ? default_external_stack_protect_fail ()
38576 : default_hidden_stack_protect_fail ();
38579 /* Select a format to encode pointers in exception handling data. CODE
38580 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
38581 true if the symbol may be affected by dynamic relocations.
38583 ??? All x86 object file formats are capable of representing this.
38584 After all, the relocation needed is the same as for the call insn.
38585 Whether or not a particular assembler allows us to enter such, I
38586 guess we'll have to see. */
38588 asm_preferred_eh_data_format (int code, int global)
38590 if (flag_pic)
38592 int type = DW_EH_PE_sdata8;
38593 if (!TARGET_64BIT
38594 || ix86_cmodel == CM_SMALL_PIC
38595 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
38596 type = DW_EH_PE_sdata4;
38597 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
38599 if (ix86_cmodel == CM_SMALL
38600 || (ix86_cmodel == CM_MEDIUM && code))
38601 return DW_EH_PE_udata4;
38602 return DW_EH_PE_absptr;
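/* Reading the cases above, typical encodings are (illustrative):

     32-bit PIC, global data symbol:
       DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4
     64-bit non-PIC, small code model:
       DW_EH_PE_udata4
     64-bit non-PIC, large code model:
       DW_EH_PE_absptr  */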
38605 /* Expand copysign from SIGN to the positive value ABS_VALUE
38606 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
38607 the sign-bit. */
38608 static void
38609 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
38611 enum machine_mode mode = GET_MODE (sign);
38612 rtx sgn = gen_reg_rtx (mode);
38613 if (mask == NULL_RTX)
38615 enum machine_mode vmode;
38617 if (mode == SFmode)
38618 vmode = V4SFmode;
38619 else if (mode == DFmode)
38620 vmode = V2DFmode;
38621 else
38622 vmode = mode;
38624 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
38625 if (!VECTOR_MODE_P (mode))
38627 /* We need to generate a scalar mode mask in this case. */
38628 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
38629 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
38630 mask = gen_reg_rtx (mode);
38631 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
38634 else
38635 mask = gen_rtx_NOT (mode, mask);
38636 emit_insn (gen_rtx_SET (VOIDmode, sgn,
38637 gen_rtx_AND (mode, mask, sign)));
38638 emit_insn (gen_rtx_SET (VOIDmode, result,
38639 gen_rtx_IOR (mode, abs_value, sgn)));
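/* A self-contained C model of the AND/IOR sequence above (an
   illustrative sketch, not part of the compiler), assuming ABS_VALUE
   already has a clear sign bit and doubles are IEEE binary64:

     #include <stdint.h>
     #include <string.h>

     static double
     copysign_to_positive (double abs_value, double sign)
     {
       uint64_t a, s;
       const uint64_t signbit = 1ULL << 63;
       memcpy (&a, &abs_value, sizeof a);
       memcpy (&s, &sign, sizeof s);
       a |= s & signbit;
       memcpy (&abs_value, &a, sizeof a);
       return abs_value;
     }
*/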
38642 /* Expand fabs (OP0) and return a new rtx that holds the result. The
38643 mask for masking out the sign-bit is stored in *SMASK, if that is
38644 non-null. */
38645 static rtx
38646 ix86_expand_sse_fabs (rtx op0, rtx *smask)
38648 enum machine_mode vmode, mode = GET_MODE (op0);
38649 rtx xa, mask;
38651 xa = gen_reg_rtx (mode);
38652 if (mode == SFmode)
38653 vmode = V4SFmode;
38654 else if (mode == DFmode)
38655 vmode = V2DFmode;
38656 else
38657 vmode = mode;
38658 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
38659 if (!VECTOR_MODE_P (mode))
38661 /* We need to generate a scalar mode mask in this case. */
38662 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
38663 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
38664 mask = gen_reg_rtx (mode);
38665 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
38667 emit_insn (gen_rtx_SET (VOIDmode, xa,
38668 gen_rtx_AND (mode, op0, mask)));
38670 if (smask)
38671 *smask = mask;
38673 return xa;
38676 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
38677 swapping the operands if SWAP_OPERANDS is true. The expanded
38678 code is a forward jump to a newly created label in case the
38679 comparison is true. The generated label rtx is returned. */
38680 static rtx
38681 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
38682 bool swap_operands)
38684 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
38685 rtx label, tmp;
38687 if (swap_operands)
38689 tmp = op0;
38690 op0 = op1;
38691 op1 = tmp;
38694 label = gen_label_rtx ();
38695 tmp = gen_rtx_REG (fpcmp_mode, FLAGS_REG);
38696 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38697 gen_rtx_COMPARE (fpcmp_mode, op0, op1)));
38698 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
38699 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
38700 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
38701 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
38702 JUMP_LABEL (tmp) = label;
38704 return label;
38707 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
38708 using comparison code CODE. Operands are swapped for the comparison if
38709 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
38710 static rtx
38711 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
38712 bool swap_operands)
38714 rtx (*insn)(rtx, rtx, rtx, rtx);
38715 enum machine_mode mode = GET_MODE (op0);
38716 rtx mask = gen_reg_rtx (mode);
38718 if (swap_operands)
38720 rtx tmp = op0;
38721 op0 = op1;
38722 op1 = tmp;
38725 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
38727 emit_insn (insn (mask, op0, op1,
38728 gen_rtx_fmt_ee (code, mode, op0, op1)));
38729 return mask;
38732 /* Generate and return a rtx of mode MODE for 2**n where n is the number
38733 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
38734 static rtx
38735 ix86_gen_TWO52 (enum machine_mode mode)
38737 REAL_VALUE_TYPE TWO52r;
38738 rtx TWO52;
38740 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
38741 TWO52 = const_double_from_real_value (TWO52r, mode);
38742 TWO52 = force_reg (mode, TWO52);
38744 return TWO52;
38747 /* Expand SSE sequence for computing lround from OP1 storing
38748 into OP0. */
38749 void
38750 ix86_expand_lround (rtx op0, rtx op1)
38752 /* C code for the stuff we're doing below:
38753 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
38754 return (long)tmp;
38756 enum machine_mode mode = GET_MODE (op1);
38757 const struct real_format *fmt;
38758 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38759 rtx adj;
38761 /* load nextafter (0.5, 0.0) */
38762 fmt = REAL_MODE_FORMAT (mode);
38763 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38764 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
38766 /* adj = copysign (0.5, op1) */
38767 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
38768 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
38770 /* adj = op1 + adj */
38771 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
38773 /* op0 = (imode)adj */
38774 expand_fix (op0, adj, 0);
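/* The reason for nextafter (0.5, 0.0) rather than a plain 0.5 addend,
   with a worked example (illustrative): for op1 equal to the largest
   double below 0.5 (0x1.fffffffffffffp-2), op1 + 0.5 rounds up to 1.0
   and the truncating conversion would yield lround = 1, whereas
   op1 + nextafter (0.5, 0.0) stays below 1.0 and correctly yields 0.  */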
38777 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1,
38778 storing the result into OPERAND0. */
38779 void
38780 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
38782 /* C code for the stuff we're doing below (for do_floor):
38783 xi = (long)op1;
38784 xi -= (double)xi > op1 ? 1 : 0;
38785 return xi;
38787 enum machine_mode fmode = GET_MODE (op1);
38788 enum machine_mode imode = GET_MODE (op0);
38789 rtx ireg, freg, label, tmp;
38791 /* reg = (long)op1 */
38792 ireg = gen_reg_rtx (imode);
38793 expand_fix (ireg, op1, 0);
38795 /* freg = (double)reg */
38796 freg = gen_reg_rtx (fmode);
38797 expand_float (freg, ireg, 0);
38799 /* ireg = (freg > op1) ? ireg - 1 : ireg */
38800 label = ix86_expand_sse_compare_and_jump (UNLE,
38801 freg, op1, !do_floor);
38802 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
38803 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
38804 emit_move_insn (ireg, tmp);
38806 emit_label (label);
38807 LABEL_NUSES (label) = 1;
38809 emit_move_insn (op0, ireg);
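/* A worked example of the compensation above for the floor case
   (illustrative).  For op1 = -2.5:

     ireg = (long) -2.5    = -2     (truncation toward zero)
     freg = (double) -2    = -2.0
     -2.0 <= -2.5 is false, so the jump is not taken and
     ireg = -2 - 1         = -3     which equals floor (-2.5).  */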
38812 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
38813 result in OPERAND0. */
38814 void
38815 ix86_expand_rint (rtx operand0, rtx operand1)
38817 /* C code for the stuff we're doing below:
38818 xa = fabs (operand1);
38819 if (!isless (xa, 2**52))
38820 return operand1;
38821 xa = xa + 2**52 - 2**52;
38822 return copysign (xa, operand1);
38824 enum machine_mode mode = GET_MODE (operand0);
38825 rtx res, xa, label, TWO52, mask;
38827 res = gen_reg_rtx (mode);
38828 emit_move_insn (res, operand1);
38830 /* xa = abs (operand1) */
38831 xa = ix86_expand_sse_fabs (res, &mask);
38833 /* if (!isless (xa, TWO52)) goto label; */
38834 TWO52 = ix86_gen_TWO52 (mode);
38835 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38837 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38838 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
38840 ix86_sse_copysign_to_positive (res, xa, res, mask);
38842 emit_label (label);
38843 LABEL_NUSES (label) = 1;
38845 emit_move_insn (operand0, res);
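/* How the "xa + 2**52 - 2**52" trick works (illustrative).  Doubles of
   magnitude at least 2**52 have no fractional bits, so adding 2**52
   forces the fraction to be rounded away in the current rounding mode,
   and subtracting it back leaves the rounded integer.  E.g. for
   xa = 2.5:

     2.5 + 2**52  ->  4503599627370498.0   (ties-to-even drops the .5)
     ...  - 2**52 ->  2.0                  which equals rint (2.5)

   The isless (xa, 2**52) guard keeps large already-integral values and
   NaNs untouched.  */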
38848 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
38849 into OPERAND0. */
38850 void
38851 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
38853 /* C code for the stuff we expand below.
38854 double xa = fabs (x), x2;
38855 if (!isless (xa, TWO52))
38856 return x;
38857 xa = xa + TWO52 - TWO52;
38858 x2 = copysign (xa, x);
38859 Compensate. Floor:
38860 if (x2 > x)
38861 x2 -= 1;
38862 Compensate. Ceil:
38863 if (x2 < x)
38864 x2 -= -1;
38865 return x2;
38867 enum machine_mode mode = GET_MODE (operand0);
38868 rtx xa, TWO52, tmp, label, one, res, mask;
38870 TWO52 = ix86_gen_TWO52 (mode);
38872 /* Temporary for holding the result, initialized to the input
38873 operand to ease control flow. */
38874 res = gen_reg_rtx (mode);
38875 emit_move_insn (res, operand1);
38877 /* xa = abs (operand1) */
38878 xa = ix86_expand_sse_fabs (res, &mask);
38880 /* if (!isless (xa, TWO52)) goto label; */
38881 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38883 /* xa = xa + TWO52 - TWO52; */
38884 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38885 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
38887 /* xa = copysign (xa, operand1) */
38888 ix86_sse_copysign_to_positive (xa, xa, res, mask);
38890 /* generate 1.0 or -1.0 */
38891 one = force_reg (mode,
38892 const_double_from_real_value (do_floor
38893 ? dconst1 : dconstm1, mode));
38895 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
38896 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
38897 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38898 gen_rtx_AND (mode, one, tmp)));
38899 /* We always need to subtract here to preserve signed zero. */
38900 tmp = expand_simple_binop (mode, MINUS,
38901 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
38902 emit_move_insn (res, tmp);
38904 emit_label (label);
38905 LABEL_NUSES (label) = 1;
38907 emit_move_insn (operand0, res);
38910 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
38911 into OPERAND0. */
38912 void
38913 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
38915 /* C code for the stuff we expand below.
38916 double xa = fabs (x), x2;
38917 if (!isless (xa, TWO52))
38918 return x;
38919 x2 = (double)(long)x;
38920 Compensate. Floor:
38921 if (x2 > x)
38922 x2 -= 1;
38923 Compensate. Ceil:
38924 if (x2 < x)
38925 x2 += 1;
38926 if (HONOR_SIGNED_ZEROS (mode))
38927 return copysign (x2, x);
38928 return x2;
38930 enum machine_mode mode = GET_MODE (operand0);
38931 rtx xa, xi, TWO52, tmp, label, one, res, mask;
38933 TWO52 = ix86_gen_TWO52 (mode);
38935 /* Temporary for holding the result, initialized to the input
38936 operand to ease control flow. */
38937 res = gen_reg_rtx (mode);
38938 emit_move_insn (res, operand1);
38940 /* xa = abs (operand1) */
38941 xa = ix86_expand_sse_fabs (res, &mask);
38943 /* if (!isless (xa, TWO52)) goto label; */
38944 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38946 /* xa = (double)(long)x */
38947 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38948 expand_fix (xi, res, 0);
38949 expand_float (xa, xi, 0);
38951 /* generate 1.0 */
38952 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
38954 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
38955 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
38956 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38957 gen_rtx_AND (mode, one, tmp)));
38958 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
38959 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
38960 emit_move_insn (res, tmp);
38962 if (HONOR_SIGNED_ZEROS (mode))
38963 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
38965 emit_label (label);
38966 LABEL_NUSES (label) = 1;
38968 emit_move_insn (operand0, res);
38971 /* Expand SSE sequence for computing round from OPERAND1 storing
38972 into OPERAND0. Sequence that works without relying on DImode truncation
38973 via cvttsd2siq, which is only available on 64-bit targets. */
38974 void
38975 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
38977 /* C code for the stuff we expand below.
38978 double xa = fabs (x), xa2, x2;
38979 if (!isless (xa, TWO52))
38980 return x;
38981 Using the absolute value and copying back sign makes
38982 -0.0 -> -0.0 correct.
38983 xa2 = xa + TWO52 - TWO52;
38984 Compensate.
38985 dxa = xa2 - xa;
38986 if (dxa <= -0.5)
38987 xa2 += 1;
38988 else if (dxa > 0.5)
38989 xa2 -= 1;
38990 x2 = copysign (xa2, x);
38991 return x2;
38993 enum machine_mode mode = GET_MODE (operand0);
38994 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
38996 TWO52 = ix86_gen_TWO52 (mode);
38998 /* Temporary for holding the result, initialized to the input
38999 operand to ease control flow. */
39000 res = gen_reg_rtx (mode);
39001 emit_move_insn (res, operand1);
39003 /* xa = abs (operand1) */
39004 xa = ix86_expand_sse_fabs (res, &mask);
39006 /* if (!isless (xa, TWO52)) goto label; */
39007 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39009 /* xa2 = xa + TWO52 - TWO52; */
39010 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
39011 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
39013 /* dxa = xa2 - xa; */
39014 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
39016 /* generate 0.5, 1.0 and -0.5 */
39017 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
39018 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
39019 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
39020 0, OPTAB_DIRECT);
39022 /* Compensate. */
39023 tmp = gen_reg_rtx (mode);
39024 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
39025 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
39026 emit_insn (gen_rtx_SET (VOIDmode, tmp,
39027 gen_rtx_AND (mode, one, tmp)));
39028 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
39029 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
39030 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
39031 emit_insn (gen_rtx_SET (VOIDmode, tmp,
39032 gen_rtx_AND (mode, one, tmp)));
39033 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
39035 /* res = copysign (xa2, operand1) */
39036 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
39038 emit_label (label);
39039 LABEL_NUSES (label) = 1;
39041 emit_move_insn (operand0, res);
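/* A worked example of the dxa compensation above (illustrative).
   For x = 2.5:

     xa  = 2.5
     xa2 = 2.5 + 2**52 - 2**52 = 2.0     (ties-to-even rounds down)
     dxa = xa2 - xa            = -0.5
     dxa <= -0.5, so xa2 += 1, giving 3.0, which equals round (2.5).

   This restores the round-half-away-from-zero semantics of round ()
   on top of the ties-to-even behaviour of the 2**52 addition.  */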
39044 /* Expand SSE sequence for computing trunc from OPERAND1 storing
39045 into OPERAND0. */
39046 void
39047 ix86_expand_trunc (rtx operand0, rtx operand1)
39049 /* C code for SSE variant we expand below.
39050 double xa = fabs (x), x2;
39051 if (!isless (xa, TWO52))
39052 return x;
39053 x2 = (double)(long)x;
39054 if (HONOR_SIGNED_ZEROS (mode))
39055 return copysign (x2, x);
39056 return x2;
39058 enum machine_mode mode = GET_MODE (operand0);
39059 rtx xa, xi, TWO52, label, res, mask;
39061 TWO52 = ix86_gen_TWO52 (mode);
39063 /* Temporary for holding the result, initialized to the input
39064 operand to ease control flow. */
39065 res = gen_reg_rtx (mode);
39066 emit_move_insn (res, operand1);
39068 /* xa = abs (operand1) */
39069 xa = ix86_expand_sse_fabs (res, &mask);
39071 /* if (!isless (xa, TWO52)) goto label; */
39072 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39074 /* x = (double)(long)x */
39075 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
39076 expand_fix (xi, res, 0);
39077 expand_float (res, xi, 0);
39079 if (HONOR_SIGNED_ZEROS (mode))
39080 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
39082 emit_label (label);
39083 LABEL_NUSES (label) = 1;
39085 emit_move_insn (operand0, res);
39088 /* Expand SSE sequence for computing trunc from OPERAND1 storing
39089 into OPERAND0. */
39090 void
39091 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
39093 enum machine_mode mode = GET_MODE (operand0);
39094 rtx xa, mask, TWO52, label, one, res, smask, tmp;
39096 /* C code for SSE variant we expand below.
39097 double xa = fabs (x), x2;
39098 if (!isless (xa, TWO52))
39099 return x;
39100 xa2 = xa + TWO52 - TWO52;
39101 Compensate:
39102 if (xa2 > xa)
39103 xa2 -= 1.0;
39104 x2 = copysign (xa2, x);
39105 return x2;
39108 TWO52 = ix86_gen_TWO52 (mode);
39110 /* Temporary for holding the result, initialized to the input
39111 operand to ease control flow. */
39112 res = gen_reg_rtx (mode);
39113 emit_move_insn (res, operand1);
39115 /* xa = abs (operand1) */
39116 xa = ix86_expand_sse_fabs (res, &smask);
39118 /* if (!isless (xa, TWO52)) goto label; */
39119 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39121 /* res = xa + TWO52 - TWO52; */
39122 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
39123 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
39124 emit_move_insn (res, tmp);
39126 /* generate 1.0 */
39127 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
39129 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
39130 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
39131 emit_insn (gen_rtx_SET (VOIDmode, mask,
39132 gen_rtx_AND (mode, mask, one)));
39133 tmp = expand_simple_binop (mode, MINUS,
39134 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
39135 emit_move_insn (res, tmp);
39137 /* res = copysign (res, operand1) */
39138 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
39140 emit_label (label);
39141 LABEL_NUSES (label) = 1;
39143 emit_move_insn (operand0, res);
39146 /* Expand SSE sequence for computing round from OPERAND1 storing
39147 into OPERAND0. */
39148 void
39149 ix86_expand_round (rtx operand0, rtx operand1)
39151 /* C code for the stuff we're doing below:
39152 double xa = fabs (x);
39153 if (!isless (xa, TWO52))
39154 return x;
39155 xa = (double)(long)(xa + nextafter (0.5, 0.0));
39156 return copysign (xa, x);
39158 enum machine_mode mode = GET_MODE (operand0);
39159 rtx res, TWO52, xa, label, xi, half, mask;
39160 const struct real_format *fmt;
39161 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
39163 /* Temporary for holding the result, initialized to the input
39164 operand to ease control flow. */
39165 res = gen_reg_rtx (mode);
39166 emit_move_insn (res, operand1);
39168 TWO52 = ix86_gen_TWO52 (mode);
39169 xa = ix86_expand_sse_fabs (res, &mask);
39170 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39172 /* load nextafter (0.5, 0.0) */
39173 fmt = REAL_MODE_FORMAT (mode);
39174 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
39175 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
39177 /* xa = xa + 0.5 */
39178 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
39179 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
39181 /* xa = (double)(int64_t)xa */
39182 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
39183 expand_fix (xi, xa, 0);
39184 expand_float (xa, xi, 0);
39186 /* res = copysign (xa, operand1) */
39187 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
39189 emit_label (label);
39190 LABEL_NUSES (label) = 1;
39192 emit_move_insn (operand0, res);
39195 /* Expand SSE sequence for computing round
39196 from OP1 storing into OP0 using sse4 round insn. */
39197 void
39198 ix86_expand_round_sse4 (rtx op0, rtx op1)
39200 enum machine_mode mode = GET_MODE (op0);
39201 rtx e1, e2, res, half;
39202 const struct real_format *fmt;
39203 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
39204 rtx (*gen_copysign) (rtx, rtx, rtx);
39205 rtx (*gen_round) (rtx, rtx, rtx);
39207 switch (mode)
39209 case SFmode:
39210 gen_copysign = gen_copysignsf3;
39211 gen_round = gen_sse4_1_roundsf2;
39212 break;
39213 case DFmode:
39214 gen_copysign = gen_copysigndf3;
39215 gen_round = gen_sse4_1_rounddf2;
39216 break;
39217 default:
39218 gcc_unreachable ();
39221 /* round (a) = trunc (a + copysign (0.5, a)) */
39223 /* load nextafter (0.5, 0.0) */
39224 fmt = REAL_MODE_FORMAT (mode);
39225 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
39226 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
39227 half = const_double_from_real_value (pred_half, mode);
39229 /* e1 = copysign (0.5, op1) */
39230 e1 = gen_reg_rtx (mode);
39231 emit_insn (gen_copysign (e1, half, op1));
39233 /* e2 = op1 + e1 */
39234 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
39236 /* res = trunc (e2) */
39237 res = gen_reg_rtx (mode);
39238 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
39240 emit_move_insn (op0, res);
39244 /* Table of valid machine attributes. */
39245 static const struct attribute_spec ix86_attribute_table[] =
39247 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
39248 affects_type_identity } */
39249 /* Stdcall attribute says callee is responsible for popping arguments
39250 if they are not variable. */
39251 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39252 true },
39253 /* Fastcall attribute says callee is responsible for popping arguments
39254 if they are not variable. */
39255 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39256 true },
39257 /* Thiscall attribute says callee is responsible for popping arguments
39258 if they are not variable. */
39259 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39260 true },
39261 /* Cdecl attribute says the callee is a normal C declaration */
39262 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39263 true },
39264 /* Regparm attribute specifies how many integer arguments are to be
39265 passed in registers. */
39266 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
39267 true },
39268 /* Sseregparm attribute says we are using x86_64 calling conventions
39269 for FP arguments. */
39270 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39271 true },
39272 /* The transactional memory builtins are implicitly regparm or fastcall
39273 depending on the ABI. Override the generic do-nothing attribute that
39274 these builtins were declared with. */
39275 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
39276 true },
39277 /* force_align_arg_pointer says this function realigns the stack at entry. */
39278 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
39279 false, true, true, ix86_handle_cconv_attribute, false },
39280 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
39281 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
39282 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
39283 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
39284 false },
39285 #endif
39286 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
39287 false },
39288 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
39289 false },
39290 #ifdef SUBTARGET_ATTRIBUTE_TABLE
39291 SUBTARGET_ATTRIBUTE_TABLE,
39292 #endif
39293 /* ms_abi and sysv_abi calling convention function attributes. */
39294 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
39295 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
39296 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
39297 false },
39298 { "callee_pop_aggregate_return", 1, 1, false, true, true,
39299 ix86_handle_callee_pop_aggregate_return, true },
39300 /* End element. */
39301 { NULL, 0, 0, false, false, false, NULL, false }
39304 /* Implement targetm.vectorize.builtin_vectorization_cost. */
39305 static int
39306 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
39307 tree vectype,
39308 int misalign ATTRIBUTE_UNUSED)
39310 unsigned elements;
39312 switch (type_of_cost)
39314 case scalar_stmt:
39315 return ix86_cost->scalar_stmt_cost;
39317 case scalar_load:
39318 return ix86_cost->scalar_load_cost;
39320 case scalar_store:
39321 return ix86_cost->scalar_store_cost;
39323 case vector_stmt:
39324 return ix86_cost->vec_stmt_cost;
39326 case vector_load:
39327 return ix86_cost->vec_align_load_cost;
39329 case vector_store:
39330 return ix86_cost->vec_store_cost;
39332 case vec_to_scalar:
39333 return ix86_cost->vec_to_scalar_cost;
39335 case scalar_to_vec:
39336 return ix86_cost->scalar_to_vec_cost;
39338 case unaligned_load:
39339 case unaligned_store:
39340 return ix86_cost->vec_unalign_load_cost;
39342 case cond_branch_taken:
39343 return ix86_cost->cond_taken_branch_cost;
39345 case cond_branch_not_taken:
39346 return ix86_cost->cond_not_taken_branch_cost;
39348 case vec_perm:
39349 case vec_promote_demote:
39350 return ix86_cost->vec_stmt_cost;
39352 case vec_construct:
39353 elements = TYPE_VECTOR_SUBPARTS (vectype);
39354 return elements / 2 + 1;
39356 default:
39357 gcc_unreachable ();
39361 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
39362 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
39363 insn every time. */
39365 static GTY(()) rtx vselect_insn;
39367 /* Initialize vselect_insn. */
39369 static void
39370 init_vselect_insn (void)
39372 unsigned i;
39373 rtx x;
39375 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
39376 for (i = 0; i < MAX_VECT_LEN; ++i)
39377 XVECEXP (x, 0, i) = const0_rtx;
39378 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
39379 const0_rtx), x);
39380 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
39381 start_sequence ();
39382 vselect_insn = emit_insn (x);
39383 end_sequence ();
39386 /* Construct (set target (vec_select op0 (parallel perm))) and
39387 return true if that's a valid instruction in the active ISA. */
39389 static bool
39390 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
39391 unsigned nelt, bool testing_p)
39393 unsigned int i;
39394 rtx x, save_vconcat;
39395 int icode;
39397 if (vselect_insn == NULL_RTX)
39398 init_vselect_insn ();
39400 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
39401 PUT_NUM_ELEM (XVEC (x, 0), nelt);
39402 for (i = 0; i < nelt; ++i)
39403 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
39404 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
39405 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
39406 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
39407 SET_DEST (PATTERN (vselect_insn)) = target;
39408 icode = recog_memoized (vselect_insn);
39410 if (icode >= 0 && !testing_p)
39411 emit_insn (copy_rtx (PATTERN (vselect_insn)));
39413 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
39414 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
39415 INSN_CODE (vselect_insn) = -1;
39417 return icode >= 0;
39420 /* Similar, but generate a vec_concat from op0 and op1 as well. */
39422 static bool
39423 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
39424 const unsigned char *perm, unsigned nelt,
39425 bool testing_p)
39427 enum machine_mode v2mode;
39428 rtx x;
39429 bool ok;
39431 if (vselect_insn == NULL_RTX)
39432 init_vselect_insn ();
39434 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
39435 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
39436 PUT_MODE (x, v2mode);
39437 XEXP (x, 0) = op0;
39438 XEXP (x, 1) = op1;
39439 ok = expand_vselect (target, x, perm, nelt, testing_p);
39440 XEXP (x, 0) = const0_rtx;
39441 XEXP (x, 1) = const0_rtx;
39442 return ok;
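/* For instance (illustrative), a V4SImode interleave-low of OP0 and OP1
   corresponds to perm = {0, 4, 1, 5} with nelt = 4, for which
   expand_vselect_vconcat builds roughly

     (set target
          (vec_select:V4SI (vec_concat:V8SI op0 op1)
                           (parallel [0 4 1 5])))

   and recog_memoized accepts it when an insn pattern (here the one for
   punpckldq) matches that shape.  */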
39445 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39446 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
39448 static bool
39449 expand_vec_perm_blend (struct expand_vec_perm_d *d)
39451 enum machine_mode vmode = d->vmode;
39452 unsigned i, mask, nelt = d->nelt;
39453 rtx target, op0, op1, x;
39454 rtx rperm[32], vperm;
39456 if (d->one_operand_p)
39457 return false;
39458 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
39460 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
39462 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
39464 else
39465 return false;
39467 /* This is a blend, not a permute. Elements must stay in their
39468 respective lanes. */
39469 for (i = 0; i < nelt; ++i)
39471 unsigned e = d->perm[i];
39472 if (!(e == i || e == i + nelt))
39473 return false;
39476 if (d->testing_p)
39477 return true;
39479 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
39480 decision should be extracted elsewhere, so that we only try that
39481 sequence once all budget==3 options have been tried. */
39482 target = d->target;
39483 op0 = d->op0;
39484 op1 = d->op1;
39485 mask = 0;
39487 switch (vmode)
39489 case V4DFmode:
39490 case V8SFmode:
39491 case V2DFmode:
39492 case V4SFmode:
39493 case V8HImode:
39494 case V8SImode:
39495 for (i = 0; i < nelt; ++i)
39496 mask |= (d->perm[i] >= nelt) << i;
39497 break;
39499 case V2DImode:
39500 for (i = 0; i < 2; ++i)
39501 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
39502 vmode = V8HImode;
39503 goto do_subreg;
39505 case V4SImode:
39506 for (i = 0; i < 4; ++i)
39507 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
39508 vmode = V8HImode;
39509 goto do_subreg;
39511 case V16QImode:
39512 /* See if bytes move in pairs so we can use pblendw with
39513 an immediate argument, rather than pblendvb with a vector
39514 argument. */
39515 for (i = 0; i < 16; i += 2)
39516 if (d->perm[i] + 1 != d->perm[i + 1])
39518 use_pblendvb:
39519 for (i = 0; i < nelt; ++i)
39520 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
39522 finish_pblendvb:
39523 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
39524 vperm = force_reg (vmode, vperm);
39526 if (GET_MODE_SIZE (vmode) == 16)
39527 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
39528 else
39529 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
39530 if (target != d->target)
39531 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
39532 return true;
39535 for (i = 0; i < 8; ++i)
39536 mask |= (d->perm[i * 2] >= 16) << i;
39537 vmode = V8HImode;
39538 /* FALLTHRU */
39540 do_subreg:
39541 target = gen_reg_rtx (vmode);
39542 op0 = gen_lowpart (vmode, op0);
39543 op1 = gen_lowpart (vmode, op1);
39544 break;
39546 case V32QImode:
39547 /* See if bytes move in pairs. If not, vpblendvb must be used. */
39548 for (i = 0; i < 32; i += 2)
39549 if (d->perm[i] + 1 != d->perm[i + 1])
39550 goto use_pblendvb;
39551 /* See if bytes move in quadruplets. If yes, vpblendd
39552 with immediate can be used. */
39553 for (i = 0; i < 32; i += 4)
39554 if (d->perm[i] + 2 != d->perm[i + 2])
39555 break;
39556 if (i < 32)
39558 /* See if bytes move the same in both lanes. If yes,
39559 vpblendw with immediate can be used. */
39560 for (i = 0; i < 16; i += 2)
39561 if (d->perm[i] + 16 != d->perm[i + 16])
39562 goto use_pblendvb;
39564 /* Use vpblendw. */
39565 for (i = 0; i < 16; ++i)
39566 mask |= (d->perm[i * 2] >= 32) << i;
39567 vmode = V16HImode;
39568 goto do_subreg;
39571 /* Use vpblendd. */
39572 for (i = 0; i < 8; ++i)
39573 mask |= (d->perm[i * 4] >= 32) << i;
39574 vmode = V8SImode;
39575 goto do_subreg;
39577 case V16HImode:
39578 /* See if words move in pairs. If yes, vpblendd can be used. */
39579 for (i = 0; i < 16; i += 2)
39580 if (d->perm[i] + 1 != d->perm[i + 1])
39581 break;
39582 if (i < 16)
39584 /* See if words move the same in both lanes. If not,
39585 vpblendvb must be used. */
39586 for (i = 0; i < 8; i++)
39587 if (d->perm[i] + 8 != d->perm[i + 8])
39589 /* Use vpblendvb. */
39590 for (i = 0; i < 32; ++i)
39591 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
39593 vmode = V32QImode;
39594 nelt = 32;
39595 target = gen_reg_rtx (vmode);
39596 op0 = gen_lowpart (vmode, op0);
39597 op1 = gen_lowpart (vmode, op1);
39598 goto finish_pblendvb;
39601 /* Use vpblendw. */
39602 for (i = 0; i < 16; ++i)
39603 mask |= (d->perm[i] >= 16) << i;
39604 break;
39607 /* Use vpblendd. */
39608 for (i = 0; i < 8; ++i)
39609 mask |= (d->perm[i * 2] >= 16) << i;
39610 vmode = V8SImode;
39611 goto do_subreg;
39613 case V4DImode:
39614 /* Use vpblendd. */
39615 for (i = 0; i < 4; ++i)
39616 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
39617 vmode = V8SImode;
39618 goto do_subreg;
39620 default:
39621 gcc_unreachable ();
39624 /* This matches five different patterns with the different modes. */
39625 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
39626 x = gen_rtx_SET (VOIDmode, target, x);
39627 emit_insn (x);
39628 if (target != d->target)
39629 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
39631 return true;
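/* A worked example of the mask computation above (illustrative).  For a
   V8SFmode blend with perm = {0, 9, 2, 11, 4, 13, 6, 15}, every odd
   element comes from op1, so bits 1, 3, 5 and 7 are set and mask = 0xaa;
   the resulting (vec_merge op1 op0 0xaa) is typically emitted as
   vblendps with immediate 0xaa.  */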
39634 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39635 in terms of the variable form of vpermilps.
39637 Note that we will have already failed the immediate input vpermilps,
39638 which requires that the high and low part shuffle be identical; the
39639 variable form doesn't require that. */
39641 static bool
39642 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
39644 rtx rperm[8], vperm;
39645 unsigned i;
39647 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
39648 return false;
39650 /* We can only permute within the 128-bit lane. */
39651 for (i = 0; i < 8; ++i)
39653 unsigned e = d->perm[i];
39654 if (i < 4 ? e >= 4 : e < 4)
39655 return false;
39658 if (d->testing_p)
39659 return true;
39661 for (i = 0; i < 8; ++i)
39663 unsigned e = d->perm[i];
39665 /* Within each 128-bit lane, the elements of op0 are numbered
39666 from 0 and the elements of op1 are numbered from 4. */
39667 if (e >= 8 + 4)
39668 e -= 8;
39669 else if (e >= 4)
39670 e -= 4;
39672 rperm[i] = GEN_INT (e);
39675 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
39676 vperm = force_reg (V8SImode, vperm);
39677 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
39679 return true;
39682 /* Return true if permutation D can be performed as VMODE permutation
39683 instead. */
39685 static bool
39686 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
39688 unsigned int i, j, chunk;
39690 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
39691 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
39692 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
39693 return false;
39695 if (GET_MODE_NUNITS (vmode) >= d->nelt)
39696 return true;
39698 chunk = d->nelt / GET_MODE_NUNITS (vmode);
39699 for (i = 0; i < d->nelt; i += chunk)
39700 if (d->perm[i] & (chunk - 1))
39701 return false;
39702 else
39703 for (j = 1; j < chunk; ++j)
39704 if (d->perm[i] + j != d->perm[i + j])
39705 return false;
39707 return true;
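/* An example of the check above (illustrative).  The V16QImode
   permutation

     {2,3, 0,1, 6,7, 4,5, 10,11, 8,9, 14,15, 12,13}

   moves bytes only in aligned consecutive pairs (chunk = 2), so it is
   equally valid as the V8HImode permutation {1,0, 3,2, 5,4, 7,6}.  */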
39710 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39711 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
39713 static bool
39714 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
39716 unsigned i, nelt, eltsz, mask;
39717 unsigned char perm[32];
39718 enum machine_mode vmode = V16QImode;
39719 rtx rperm[32], vperm, target, op0, op1;
39721 nelt = d->nelt;
39723 if (!d->one_operand_p)
39725 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
39727 if (TARGET_AVX2
39728 && valid_perm_using_mode_p (V2TImode, d))
39730 if (d->testing_p)
39731 return true;
39733 /* Use vperm2i128 insn. The pattern uses
39734 V4DImode instead of V2TImode. */
39735 target = d->target;
39736 if (d->vmode != V4DImode)
39737 target = gen_reg_rtx (V4DImode);
39738 op0 = gen_lowpart (V4DImode, d->op0);
39739 op1 = gen_lowpart (V4DImode, d->op1);
39740 rperm[0]
39741 = GEN_INT (((d->perm[0] & (nelt / 2)) ? 1 : 0)
39742 | ((d->perm[nelt / 2] & (nelt / 2)) ? 2 : 0));
39743 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
39744 if (target != d->target)
39745 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
39746 return true;
39748 return false;
39751 else
39753 if (GET_MODE_SIZE (d->vmode) == 16)
39755 if (!TARGET_SSSE3)
39756 return false;
39758 else if (GET_MODE_SIZE (d->vmode) == 32)
39760 if (!TARGET_AVX2)
39761 return false;
39763 /* V4DImode should be already handled through
39764 expand_vselect by vpermq instruction. */
39765 gcc_assert (d->vmode != V4DImode);
39767 vmode = V32QImode;
39768 if (d->vmode == V8SImode
39769 || d->vmode == V16HImode
39770 || d->vmode == V32QImode)
39772 /* First see if vpermq can be used for
39773 V8SImode/V16HImode/V32QImode. */
39774 if (valid_perm_using_mode_p (V4DImode, d))
39776 for (i = 0; i < 4; i++)
39777 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
39778 if (d->testing_p)
39779 return true;
39780 target = gen_reg_rtx (V4DImode);
39781 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
39782 perm, 4, false))
39784 emit_move_insn (d->target,
39785 gen_lowpart (d->vmode, target));
39786 return true;
39788 return false;
39791 /* Next see if vpermd can be used. */
39792 if (valid_perm_using_mode_p (V8SImode, d))
39793 vmode = V8SImode;
39795 /* Or if vpermps can be used. */
39796 else if (d->vmode == V8SFmode)
39797 vmode = V8SImode;
39799 if (vmode == V32QImode)
39801 /* vpshufb only works intra-lane; it is not
39802 possible to shuffle bytes between the lanes. */
39803 for (i = 0; i < nelt; ++i)
39804 if ((d->perm[i] ^ i) & (nelt / 2))
39805 return false;
39808 else
39809 return false;
39812 if (d->testing_p)
39813 return true;
39815 if (vmode == V8SImode)
39816 for (i = 0; i < 8; ++i)
39817 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
39818 else
39820 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39821 if (!d->one_operand_p)
39822 mask = 2 * nelt - 1;
39823 else if (vmode == V16QImode)
39824 mask = nelt - 1;
39825 else
39826 mask = nelt / 2 - 1;
39828 for (i = 0; i < nelt; ++i)
39830 unsigned j, e = d->perm[i] & mask;
39831 for (j = 0; j < eltsz; ++j)
39832 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
39836 vperm = gen_rtx_CONST_VECTOR (vmode,
39837 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
39838 vperm = force_reg (vmode, vperm);
39840 target = d->target;
39841 if (d->vmode != vmode)
39842 target = gen_reg_rtx (vmode);
39843 op0 = gen_lowpart (vmode, d->op0);
39844 if (d->one_operand_p)
39846 if (vmode == V16QImode)
39847 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
39848 else if (vmode == V32QImode)
39849 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
39850 else if (vmode == V8SFmode)
39851 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
39852 else
39853 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
39855 else
39857 op1 = gen_lowpart (vmode, d->op1);
39858 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
39860 if (target != d->target)
39861 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
39863 return true;
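/* An example of the control-vector construction above (illustrative).
   For a one-operand V8HImode permutation perm = {3, 3, 2, 2, 1, 1, 0, 0},
   eltsz = 2 and each halfword index E expands to the byte pair
   {2*E, 2*E + 1}, so the V16QImode pshufb control becomes

     {6,7, 6,7, 4,5, 4,5, 2,3, 2,3, 0,1, 0,1}.  */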
39866 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
39867 in a single instruction. */
39869 static bool
39870 expand_vec_perm_1 (struct expand_vec_perm_d *d)
39872 unsigned i, nelt = d->nelt;
39873 unsigned char perm2[MAX_VECT_LEN];
39875 /* Check plain VEC_SELECT first, because AVX has instructions that could
39876 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
39877 input where SEL+CONCAT may not. */
39878 if (d->one_operand_p)
39880 int mask = nelt - 1;
39881 bool identity_perm = true;
39882 bool broadcast_perm = true;
39884 for (i = 0; i < nelt; i++)
39886 perm2[i] = d->perm[i] & mask;
39887 if (perm2[i] != i)
39888 identity_perm = false;
39889 if (perm2[i])
39890 broadcast_perm = false;
39893 if (identity_perm)
39895 if (!d->testing_p)
39896 emit_move_insn (d->target, d->op0);
39897 return true;
39899 else if (broadcast_perm && TARGET_AVX2)
39901 /* Use vpbroadcast{b,w,d}. */
39902 rtx (*gen) (rtx, rtx) = NULL;
39903 switch (d->vmode)
39905 case V32QImode:
39906 gen = gen_avx2_pbroadcastv32qi_1;
39907 break;
39908 case V16HImode:
39909 gen = gen_avx2_pbroadcastv16hi_1;
39910 break;
39911 case V8SImode:
39912 gen = gen_avx2_pbroadcastv8si_1;
39913 break;
39914 case V16QImode:
39915 gen = gen_avx2_pbroadcastv16qi;
39916 break;
39917 case V8HImode:
39918 gen = gen_avx2_pbroadcastv8hi;
39919 break;
39920 case V8SFmode:
39921 gen = gen_avx2_vec_dupv8sf_1;
39922 break;
39923 /* For other modes prefer other shuffles this function creates. */
39924 default: break;
39926 if (gen != NULL)
39928 if (!d->testing_p)
39929 emit_insn (gen (d->target, d->op0));
39930 return true;
39934 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
39935 return true;
39937 /* There are plenty of patterns in sse.md that are written for
39938 SEL+CONCAT and are not replicated for a single op. Perhaps
39939 that should be changed, to avoid the nastiness here. */
39941 /* Recognize interleave style patterns, which means incrementing
39942 every other permutation operand. */
39943 for (i = 0; i < nelt; i += 2)
39945 perm2[i] = d->perm[i] & mask;
39946 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
39948 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
39949 d->testing_p))
39950 return true;
39952 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
39953 if (nelt >= 4)
39955 for (i = 0; i < nelt; i += 4)
39957 perm2[i + 0] = d->perm[i + 0] & mask;
39958 perm2[i + 1] = d->perm[i + 1] & mask;
39959 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
39960 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
39963 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
39964 d->testing_p))
39965 return true;
39969 /* Finally, try the fully general two operand permute. */
39970 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
39971 d->testing_p))
39972 return true;
39974 /* Recognize interleave style patterns with reversed operands. */
39975 if (!d->one_operand_p)
39977 for (i = 0; i < nelt; ++i)
39979 unsigned e = d->perm[i];
39980 if (e >= nelt)
39981 e -= nelt;
39982 else
39983 e += nelt;
39984 perm2[i] = e;
39987 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
39988 d->testing_p))
39989 return true;
39992 /* Try the SSE4.1 blend variable merge instructions. */
39993 if (expand_vec_perm_blend (d))
39994 return true;
39996 /* Try one of the AVX vpermil variable permutations. */
39997 if (expand_vec_perm_vpermil (d))
39998 return true;
40000 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
40001 vpshufb, vpermd, vpermps or vpermq variable permutation. */
40002 if (expand_vec_perm_pshufb (d))
40003 return true;
40005 return false;
40008 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
40009 in terms of a pair of pshuflw + pshufhw instructions. */
40011 static bool
40012 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
40014 unsigned char perm2[MAX_VECT_LEN];
40015 unsigned i;
40016 bool ok;
40018 if (d->vmode != V8HImode || !d->one_operand_p)
40019 return false;
40021 /* The two permutations only operate in 64-bit lanes. */
40022 for (i = 0; i < 4; ++i)
40023 if (d->perm[i] >= 4)
40024 return false;
40025 for (i = 4; i < 8; ++i)
40026 if (d->perm[i] < 4)
40027 return false;
40029 if (d->testing_p)
40030 return true;
40032 /* Emit the pshuflw. */
40033 memcpy (perm2, d->perm, 4);
40034 for (i = 4; i < 8; ++i)
40035 perm2[i] = i;
40036 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
40037 gcc_assert (ok);
40039 /* Emit the pshufhw. */
40040 memcpy (perm2 + 4, d->perm + 4, 4);
40041 for (i = 0; i < 4; ++i)
40042 perm2[i] = i;
40043 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
40044 gcc_assert (ok);
40046 return true;
40049 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
40050 the permutation using the SSSE3 palignr instruction. This succeeds
40051 when all of the elements in PERM fit within one vector and we merely
40052 need to shift them down so that a single vector permutation has a
40053 chance to succeed. */
40055 static bool
40056 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
40058 unsigned i, nelt = d->nelt;
40059 unsigned min, max;
40060 bool in_order, ok;
40061 rtx shift, target;
40062 struct expand_vec_perm_d dcopy;
40064 /* Even with AVX, palignr only operates on 128-bit vectors. */
40065 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
40066 return false;
40068 min = nelt, max = 0;
40069 for (i = 0; i < nelt; ++i)
40071 unsigned e = d->perm[i];
40072 if (e < min)
40073 min = e;
40074 if (e > max)
40075 max = e;
40077 if (min == 0 || max - min >= nelt)
40078 return false;
40080 /* Given that we have SSSE3, we know we'll be able to implement the
40081 single operand permutation after the palignr with pshufb. */
40082 if (d->testing_p)
40083 return true;
40085 dcopy = *d;
40086 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
40087 target = gen_reg_rtx (TImode);
40088 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
40089 gen_lowpart (TImode, d->op0), shift));
40091 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
40092 dcopy.one_operand_p = true;
40094 in_order = true;
40095 for (i = 0; i < nelt; ++i)
40097 unsigned e = dcopy.perm[i] - min;
40098 if (e != i)
40099 in_order = false;
40100 dcopy.perm[i] = e;
40103 /* Test for the degenerate case where the alignment by itself
40104 produces the desired permutation. */
40105 if (in_order)
40107 emit_move_insn (d->target, dcopy.op0);
40108 return true;
40111 ok = expand_vec_perm_1 (&dcopy);
40112 gcc_assert (ok);
40114 return ok;
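/* A worked example of the palignr reduction above (illustrative).  For a
   two-operand V16QImode permutation perm = {4, 5, ..., 19}: min = 4 and
   max = 19, so max - min < 16.  palignr on (op1:op0) with a 4-byte shift
   makes result byte J equal element J + 4 of the original op0/op1
   concatenation, so the remaining permutation is {0, 1, ..., 15}, the
   identity, and the alignment alone produces the result (the in_order
   case above).  */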
40117 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
40119 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
40120 a two vector permutation into a single vector permutation by using
40121 an interleave operation to merge the vectors. */
40123 static bool
40124 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
40126 struct expand_vec_perm_d dremap, dfinal;
40127 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
40128 unsigned HOST_WIDE_INT contents;
40129 unsigned char remap[2 * MAX_VECT_LEN];
40130 rtx seq;
40131 bool ok, same_halves = false;
40133 if (GET_MODE_SIZE (d->vmode) == 16)
40135 if (d->one_operand_p)
40136 return false;
40138 else if (GET_MODE_SIZE (d->vmode) == 32)
40140 if (!TARGET_AVX)
40141 return false;
40142 /* For 32-byte modes allow even d->one_operand_p.
40143 The lack of cross-lane shuffling in some instructions
40144 might prevent a single insn shuffle. */
40145 dfinal = *d;
40146 dfinal.testing_p = true;
40147 /* If expand_vec_perm_interleave3 can expand this into
40148 a 3-insn sequence, give up and let it be expanded that
40149 way. While that is one insn longer,
40150 it doesn't need a memory operand, and in the common
40151 case where both the interleave-low and interleave-high permutations
40152 with the same operands are adjacent, only 4 insns
40153 are needed for both after CSE. */
40154 if (expand_vec_perm_interleave3 (&dfinal))
40155 return false;
40157 else
40158 return false;
40160 /* Examine from whence the elements come. */
40161 contents = 0;
40162 for (i = 0; i < nelt; ++i)
40163 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
40165 memset (remap, 0xff, sizeof (remap));
40166 dremap = *d;
40168 if (GET_MODE_SIZE (d->vmode) == 16)
40170 unsigned HOST_WIDE_INT h1, h2, h3, h4;
40172 /* Split the two input vectors into 4 halves. */
40173 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
40174 h2 = h1 << nelt2;
40175 h3 = h2 << nelt2;
40176 h4 = h3 << nelt2;
40178 /* If the elements all come from the low halves, use interleave low,
40179 and similarly for interleave high. If the elements come from
40180 mismatched halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
40181 if ((contents & (h1 | h3)) == contents)
40183 /* punpckl* */
40184 for (i = 0; i < nelt2; ++i)
40186 remap[i] = i * 2;
40187 remap[i + nelt] = i * 2 + 1;
40188 dremap.perm[i * 2] = i;
40189 dremap.perm[i * 2 + 1] = i + nelt;
40191 if (!TARGET_SSE2 && d->vmode == V4SImode)
40192 dremap.vmode = V4SFmode;
40194 else if ((contents & (h2 | h4)) == contents)
40196 /* punpckh* */
40197 for (i = 0; i < nelt2; ++i)
40199 remap[i + nelt2] = i * 2;
40200 remap[i + nelt + nelt2] = i * 2 + 1;
40201 dremap.perm[i * 2] = i + nelt2;
40202 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
40204 if (!TARGET_SSE2 && d->vmode == V4SImode)
40205 dremap.vmode = V4SFmode;
40207 else if ((contents & (h1 | h4)) == contents)
40209 /* shufps */
40210 for (i = 0; i < nelt2; ++i)
40212 remap[i] = i;
40213 remap[i + nelt + nelt2] = i + nelt2;
40214 dremap.perm[i] = i;
40215 dremap.perm[i + nelt2] = i + nelt + nelt2;
40217 if (nelt != 4)
40219 /* shufpd */
40220 dremap.vmode = V2DImode;
40221 dremap.nelt = 2;
40222 dremap.perm[0] = 0;
40223 dremap.perm[1] = 3;
40226 else if ((contents & (h2 | h3)) == contents)
40228 /* shufps */
40229 for (i = 0; i < nelt2; ++i)
40231 remap[i + nelt2] = i;
40232 remap[i + nelt] = i + nelt2;
40233 dremap.perm[i] = i + nelt2;
40234 dremap.perm[i + nelt2] = i + nelt;
40236 if (nelt != 4)
40238 /* shufpd */
40239 dremap.vmode = V2DImode;
40240 dremap.nelt = 2;
40241 dremap.perm[0] = 1;
40242 dremap.perm[1] = 2;
40245 else
40246 return false;
40248 else
40250 unsigned int nelt4 = nelt / 4, nzcnt = 0;
40251 unsigned HOST_WIDE_INT q[8];
40252 unsigned int nonzero_halves[4];
40254 /* Split the two input vectors into 8 quarters. */
40255 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
40256 for (i = 1; i < 8; ++i)
40257 q[i] = q[0] << (nelt4 * i);
40258 for (i = 0; i < 4; ++i)
40259 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
40261 nonzero_halves[nzcnt] = i;
40262 ++nzcnt;
40265 if (nzcnt == 1)
40267 gcc_assert (d->one_operand_p);
40268 nonzero_halves[1] = nonzero_halves[0];
40269 same_halves = true;
40271 else if (d->one_operand_p)
40273 gcc_assert (nonzero_halves[0] == 0);
40274 gcc_assert (nonzero_halves[1] == 1);
40277 if (nzcnt <= 2)
40279 if (d->perm[0] / nelt2 == nonzero_halves[1])
40281 /* Attempt to increase the likelihood that dfinal
40282 shuffle will be intra-lane. */
40283 char tmph = nonzero_halves[0];
40284 nonzero_halves[0] = nonzero_halves[1];
40285 nonzero_halves[1] = tmph;
40288 /* vperm2f128 or vperm2i128. */
40289 for (i = 0; i < nelt2; ++i)
40291 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
40292 remap[i + nonzero_halves[0] * nelt2] = i;
40293 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
40294 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
40297 if (d->vmode != V8SFmode
40298 && d->vmode != V4DFmode
40299 && d->vmode != V8SImode)
40301 dremap.vmode = V8SImode;
40302 dremap.nelt = 8;
40303 for (i = 0; i < 4; ++i)
40305 dremap.perm[i] = i + nonzero_halves[0] * 4;
40306 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
40310 else if (d->one_operand_p)
40311 return false;
40312 else if (TARGET_AVX2
40313 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
40315 /* vpunpckl* */
40316 for (i = 0; i < nelt4; ++i)
40318 remap[i] = i * 2;
40319 remap[i + nelt] = i * 2 + 1;
40320 remap[i + nelt2] = i * 2 + nelt2;
40321 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
40322 dremap.perm[i * 2] = i;
40323 dremap.perm[i * 2 + 1] = i + nelt;
40324 dremap.perm[i * 2 + nelt2] = i + nelt2;
40325 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
40328 else if (TARGET_AVX2
40329 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
40331 /* vpunpckh* */
40332 for (i = 0; i < nelt4; ++i)
40334 remap[i + nelt4] = i * 2;
40335 remap[i + nelt + nelt4] = i * 2 + 1;
40336 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
40337 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
40338 dremap.perm[i * 2] = i + nelt4;
40339 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
40340 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
40341 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
40344 else
40345 return false;
40348 /* Use the remapping array set up above to move the elements from their
40349 swizzled locations into their final destinations. */
40350 dfinal = *d;
40351 for (i = 0; i < nelt; ++i)
40353 unsigned e = remap[d->perm[i]];
40354 gcc_assert (e < nelt);
40355 /* If same_halves is true, both halves of the remapped vector are the
40356 same. Avoid cross-lane accesses if possible. */
40357 if (same_halves && i >= nelt2)
40359 gcc_assert (e < nelt2);
40360 dfinal.perm[i] = e + nelt2;
40362 else
40363 dfinal.perm[i] = e;
40365 dremap.target = gen_reg_rtx (dremap.vmode);
40366 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
40367 dfinal.op1 = dfinal.op0;
40368 dfinal.one_operand_p = true;
40370 /* Test if the final remap can be done with a single insn. For V4SFmode or
40371 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
40372 start_sequence ();
40373 ok = expand_vec_perm_1 (&dfinal);
40374 seq = get_insns ();
40375 end_sequence ();
40377 if (!ok)
40378 return false;
40380 if (d->testing_p)
40381 return true;
40383 if (dremap.vmode != dfinal.vmode)
40385 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
40386 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
40389 ok = expand_vec_perm_1 (&dremap);
40390 gcc_assert (ok);
40392 emit_insn (seq);
40393 return true;
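/* A worked example of the remapping above (illustrative).  For a
   V4SImode permutation perm = {1, 5, 0, 4}, all elements come from the
   low halves of the two operands (contents fits in h1 | h3), so dremap
   becomes the interleave-low {0, 4, 1, 5} with remap = {0->0, 1->2,
   4->1, 5->3}; dfinal is then the one-operand permutation {2, 3, 0, 1}
   on the interleaved vector, i.e. a punpckldq followed by a pshufd.  */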
40396 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
40397 a single vector cross-lane permutation into vpermq followed
40398 by any of the single insn permutations. */
40400 static bool
40401 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
40403 struct expand_vec_perm_d dremap, dfinal;
40404 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
40405 unsigned contents[2];
40406 bool ok;
40408 if (!(TARGET_AVX2
40409 && (d->vmode == V32QImode || d->vmode == V16HImode)
40410 && d->one_operand_p))
40411 return false;
40413 contents[0] = 0;
40414 contents[1] = 0;
40415 for (i = 0; i < nelt2; ++i)
40417 contents[0] |= 1u << (d->perm[i] / nelt4);
40418 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
40421 for (i = 0; i < 2; ++i)
40423 unsigned int cnt = 0;
40424 for (j = 0; j < 4; ++j)
40425 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
40426 return false;
40429 if (d->testing_p)
40430 return true;
40432 dremap = *d;
40433 dremap.vmode = V4DImode;
40434 dremap.nelt = 4;
40435 dremap.target = gen_reg_rtx (V4DImode);
40436 dremap.op0 = gen_lowpart (V4DImode, d->op0);
40437 dremap.op1 = dremap.op0;
40438 dremap.one_operand_p = true;
40439 for (i = 0; i < 2; ++i)
40441 unsigned int cnt = 0;
40442 for (j = 0; j < 4; ++j)
40443 if ((contents[i] & (1u << j)) != 0)
40444 dremap.perm[2 * i + cnt++] = j;
40445 for (; cnt < 2; ++cnt)
40446 dremap.perm[2 * i + cnt] = 0;
40449 dfinal = *d;
40450 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
40451 dfinal.op1 = dfinal.op0;
40452 dfinal.one_operand_p = true;
40453 for (i = 0, j = 0; i < nelt; ++i)
40455 if (i == nelt2)
40456 j = 2;
40457 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
40458 if ((d->perm[i] / nelt4) == dremap.perm[j])
40460 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
40461 dfinal.perm[i] |= nelt4;
40462 else
40463 gcc_unreachable ();
40466 ok = expand_vec_perm_1 (&dremap);
40467 gcc_assert (ok);
40469 ok = expand_vec_perm_1 (&dfinal);
40470 gcc_assert (ok);
40472 return true;
40475 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
40476 a vector permutation using two instructions, vperm2f128 resp.
40477 vperm2i128 followed by any single in-lane permutation. */
40479 static bool
40480 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
40482 struct expand_vec_perm_d dfirst, dsecond;
40483 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
40484 bool ok;
40486 if (!TARGET_AVX
40487 || GET_MODE_SIZE (d->vmode) != 32
40488 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
40489 return false;
40491 dsecond = *d;
40492 dsecond.one_operand_p = false;
40493 dsecond.testing_p = true;
40495 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
40496 immediate. For perm < 16 the second permutation uses
40497 d->op0 as first operand, for perm >= 16 it uses d->op1
40498 as first operand. The second operand is the result of
40499 vperm2[fi]128. */
40500 for (perm = 0; perm < 32; perm++)
40502 /* Ignore permutations which do not move anything cross-lane. */
40503 if (perm < 16)
40505 /* The second shuffle for e.g. V4DFmode has
40506 0123 and ABCD operands.
40507 Ignore AB23, as 23 is already in the second lane
40508 of the first operand. */
40509 if ((perm & 0xc) == (1 << 2)) continue;
40510 /* And 01CD, as 01 is in the first lane of the first
40511 operand. */
40512 if ((perm & 3) == 0) continue;
40513 /* And 4567, as then the vperm2[fi]128 doesn't change
40514 anything on the original 4567 second operand. */
40515 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
40517 else
40519 /* The second shuffle for e.g. V4DFmode has
40520 4567 and ABCD operands.
40521 Ignore AB67, as 67 is already in the second lane
40522 of the first operand. */
40523 if ((perm & 0xc) == (3 << 2)) continue;
40524 /* And 45CD, as 45 is in the first lane of the first
40525 operand. */
40526 if ((perm & 3) == 2) continue;
40527 /* And 0123, as then the vperm2[fi]128 doesn't change
40528 anything on the original 0123 first operand. */
40529 if ((perm & 0xf) == (1 << 2)) continue;
40532 for (i = 0; i < nelt; i++)
40534 j = d->perm[i] / nelt2;
40535 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
40536 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
40537 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
40538 dsecond.perm[i] = d->perm[i] & (nelt - 1);
40539 else
40540 break;
40543 if (i == nelt)
40545 start_sequence ();
40546 ok = expand_vec_perm_1 (&dsecond);
40547 end_sequence ();
40549 else
40550 ok = false;
40552 if (ok)
40554 if (d->testing_p)
40555 return true;
40557 /* Found a usable second shuffle. dfirst will be
40558 vperm2f128 on d->op0 and d->op1. */
40559 dsecond.testing_p = false;
40560 dfirst = *d;
40561 dfirst.target = gen_reg_rtx (d->vmode);
40562 for (i = 0; i < nelt; i++)
40563 dfirst.perm[i] = (i & (nelt2 - 1))
40564 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
40566 ok = expand_vec_perm_1 (&dfirst);
40567 gcc_assert (ok);
40569 /* And dsecond is some single insn shuffle, taking
40570 d->op0 and result of vperm2f128 (if perm < 16) or
40571 d->op1 and result of vperm2f128 (otherwise). */
40572 dsecond.op1 = dfirst.target;
40573 if (perm >= 16)
40574 dsecond.op0 = dfirst.op1;
40576 ok = expand_vec_perm_1 (&dsecond);
40577 gcc_assert (ok);
40579 return true;
40582 /* For one operand, the only useful vperm2f128 permutation is 0x10. */
40583 if (d->one_operand_p)
40584 return false;
40587 return false;
40590 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
40591 a two vector permutation using 2 intra-lane interleave insns
40592 and cross-lane shuffle for 32-byte vectors. */
40594 static bool
40595 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
40597 unsigned i, nelt;
40598 rtx (*gen) (rtx, rtx, rtx);
40600 if (d->one_operand_p)
40601 return false;
40602 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
40604 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
40606 else
40607 return false;
40609 nelt = d->nelt;
40610 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
40611 return false;
40612 for (i = 0; i < nelt; i += 2)
40613 if (d->perm[i] != d->perm[0] + i / 2
40614 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
40615 return false;
40617 if (d->testing_p)
40618 return true;
40620 switch (d->vmode)
40622 case V32QImode:
40623 if (d->perm[0])
40624 gen = gen_vec_interleave_highv32qi;
40625 else
40626 gen = gen_vec_interleave_lowv32qi;
40627 break;
40628 case V16HImode:
40629 if (d->perm[0])
40630 gen = gen_vec_interleave_highv16hi;
40631 else
40632 gen = gen_vec_interleave_lowv16hi;
40633 break;
40634 case V8SImode:
40635 if (d->perm[0])
40636 gen = gen_vec_interleave_highv8si;
40637 else
40638 gen = gen_vec_interleave_lowv8si;
40639 break;
40640 case V4DImode:
40641 if (d->perm[0])
40642 gen = gen_vec_interleave_highv4di;
40643 else
40644 gen = gen_vec_interleave_lowv4di;
40645 break;
40646 case V8SFmode:
40647 if (d->perm[0])
40648 gen = gen_vec_interleave_highv8sf;
40649 else
40650 gen = gen_vec_interleave_lowv8sf;
40651 break;
40652 case V4DFmode:
40653 if (d->perm[0])
40654 gen = gen_vec_interleave_highv4df;
40655 else
40656 gen = gen_vec_interleave_lowv4df;
40657 break;
40658 default:
40659 gcc_unreachable ();
40662 emit_insn (gen (d->target, d->op0, d->op1));
40663 return true;
40666 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
40667 a single vector permutation using a single intra-lane vector
40668 permutation, vperm2f128 swapping the lanes and vblend* insn blending
40669 the non-swapped and swapped vectors together. */
40671 static bool
40672 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
40674 struct expand_vec_perm_d dfirst, dsecond;
40675 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
40676 rtx seq;
40677 bool ok;
40678 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
40680 if (!TARGET_AVX
40681 || TARGET_AVX2
40682 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
40683 || !d->one_operand_p)
40684 return false;
40686 dfirst = *d;
40687 for (i = 0; i < nelt; i++)
40688 dfirst.perm[i] = 0xff;
40689 for (i = 0, msk = 0; i < nelt; i++)
40691 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
40692 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
40693 return false;
40694 dfirst.perm[j] = d->perm[i];
40695 if (j != i)
40696 msk |= (1 << i);
40698 for (i = 0; i < nelt; i++)
40699 if (dfirst.perm[i] == 0xff)
40700 dfirst.perm[i] = i;
40702 if (!d->testing_p)
40703 dfirst.target = gen_reg_rtx (dfirst.vmode);
40705 start_sequence ();
40706 ok = expand_vec_perm_1 (&dfirst);
40707 seq = get_insns ();
40708 end_sequence ();
40710 if (!ok)
40711 return false;
40713 if (d->testing_p)
40714 return true;
40716 emit_insn (seq);
40718 dsecond = *d;
40719 dsecond.op0 = dfirst.target;
40720 dsecond.op1 = dfirst.target;
40721 dsecond.one_operand_p = true;
40722 dsecond.target = gen_reg_rtx (dsecond.vmode);
40723 for (i = 0; i < nelt; i++)
40724 dsecond.perm[i] = i ^ nelt2;
40726 ok = expand_vec_perm_1 (&dsecond);
40727 gcc_assert (ok);
40729 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
40730 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
40731 return true;
40734 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
40735 permutation using two vperm2f128, followed by a vshufpd insn blending
40736 the two vectors together. */
40738 static bool
40739 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
40741 struct expand_vec_perm_d dfirst, dsecond, dthird;
40742 bool ok;
40744 if (!TARGET_AVX || (d->vmode != V4DFmode))
40745 return false;
40747 if (d->testing_p)
40748 return true;
40750 dfirst = *d;
40751 dsecond = *d;
40752 dthird = *d;
40754 dfirst.perm[0] = (d->perm[0] & ~1);
40755 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
40756 dfirst.perm[2] = (d->perm[2] & ~1);
40757 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
40758 dsecond.perm[0] = (d->perm[1] & ~1);
40759 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
40760 dsecond.perm[2] = (d->perm[3] & ~1);
40761 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
40762 dthird.perm[0] = (d->perm[0] % 2);
40763 dthird.perm[1] = (d->perm[1] % 2) + 4;
40764 dthird.perm[2] = (d->perm[2] % 2) + 2;
40765 dthird.perm[3] = (d->perm[3] % 2) + 6;
40767 dfirst.target = gen_reg_rtx (dfirst.vmode);
40768 dsecond.target = gen_reg_rtx (dsecond.vmode);
40769 dthird.op0 = dfirst.target;
40770 dthird.op1 = dsecond.target;
40771 dthird.one_operand_p = false;
40773 canonicalize_perm (&dfirst);
40774 canonicalize_perm (&dsecond);
40776 ok = expand_vec_perm_1 (&dfirst)
40777 && expand_vec_perm_1 (&dsecond)
40778 && expand_vec_perm_1 (&dthird);
40780 gcc_assert (ok);
40782 return true;
40785 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
40786 permutation with two pshufb insns and an ior. We should have already
40787 failed all two instruction sequences. */
40789 static bool
40790 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
40792 rtx rperm[2][16], vperm, l, h, op, m128;
40793 unsigned int i, nelt, eltsz;
40795 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
40796 return false;
40797 gcc_assert (!d->one_operand_p);
40799 nelt = d->nelt;
40800 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40802 /* Generate two permutation masks. If the required element is within
40803 the given vector it is shuffled into the proper lane. If the required
40804 element is in the other vector, force a zero into the lane by setting
40805 bit 7 in the permutation mask. */
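/* For illustration: with V16QImode and d->perm beginning { 0, 17, ... },
   byte 0 comes from op0 and byte 1 from op1, so the first mask starts
   { 0, -128, ... } and the second { -128, 1, ... }.  pshufb zeroes the
   bytes whose mask entry has bit 7 set, and the final ior merges the
   two partial results.  */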
40806 m128 = GEN_INT (-128);
40807 for (i = 0; i < nelt; ++i)
40809 unsigned j, e = d->perm[i];
40810 unsigned which = (e >= nelt);
40811 if (e >= nelt)
40812 e -= nelt;
40814 for (j = 0; j < eltsz; ++j)
40816 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
40817 rperm[1-which][i*eltsz + j] = m128;
40821 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
40822 vperm = force_reg (V16QImode, vperm);
40824 l = gen_reg_rtx (V16QImode);
40825 op = gen_lowpart (V16QImode, d->op0);
40826 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
40828 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
40829 vperm = force_reg (V16QImode, vperm);
40831 h = gen_reg_rtx (V16QImode);
40832 op = gen_lowpart (V16QImode, d->op1);
40833 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
40835 op = d->target;
40836 if (d->vmode != V16QImode)
40837 op = gen_reg_rtx (V16QImode);
40838 emit_insn (gen_iorv16qi3 (op, l, h));
40839 if (op != d->target)
40840 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
40842 return true;
40845 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
40846 with two vpshufb insns, vpermq and vpor. We should have already failed
40847 all two or three instruction sequences. */
40849 static bool
40850 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
40852 rtx rperm[2][32], vperm, l, h, hp, op, m128;
40853 unsigned int i, nelt, eltsz;
40855 if (!TARGET_AVX2
40856 || !d->one_operand_p
40857 || (d->vmode != V32QImode && d->vmode != V16HImode))
40858 return false;
40860 if (d->testing_p)
40861 return true;
40863 nelt = d->nelt;
40864 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40866 /* Generate two permutation masks. If the required element is within
40867 the same lane, it is shuffled in. If the required element is from
40868 the other lane, force a zero by setting bit 7 in the permutation mask.
40869 In the other mask, an element requested from the other lane gets a
40870 non-negative index, but it is written into the opposite lane, so that
40871 the result of vpshufb can have its two V2TImode halves
40872 swapped. */
40873 m128 = GEN_INT (-128);
40874 for (i = 0; i < nelt; ++i)
40876 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
40877 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
40879 for (j = 0; j < eltsz; ++j)
40881 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
40882 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
40886 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
40887 vperm = force_reg (V32QImode, vperm);
40889 h = gen_reg_rtx (V32QImode);
40890 op = gen_lowpart (V32QImode, d->op0);
40891 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
40893 /* Swap the 128-bit lanes of h into hp. */
40894 hp = gen_reg_rtx (V4DImode);
40895 op = gen_lowpart (V4DImode, h);
40896 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
40897 const1_rtx));
40899 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
40900 vperm = force_reg (V32QImode, vperm);
40902 l = gen_reg_rtx (V32QImode);
40903 op = gen_lowpart (V32QImode, d->op0);
40904 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
40906 op = d->target;
40907 if (d->vmode != V32QImode)
40908 op = gen_reg_rtx (V32QImode);
40909 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
40910 if (op != d->target)
40911 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
40913 return true;
40916 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
40917 and extract-odd permutations of two V32QImode or V16HImode operands
40918 with two vpshufb insns, vpor and vpermq. We should have already
40919 failed all two or three instruction sequences. */
40921 static bool
40922 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
40924 rtx rperm[2][32], vperm, l, h, ior, op, m128;
40925 unsigned int i, nelt, eltsz;
40927 if (!TARGET_AVX2
40928 || d->one_operand_p
40929 || (d->vmode != V32QImode && d->vmode != V16HImode))
40930 return false;
40932 for (i = 0; i < d->nelt; ++i)
40933 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
40934 return false;
40936 if (d->testing_p)
40937 return true;
40939 nelt = d->nelt;
40940 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40942 /* Generate two permutation masks. In the first permutation mask
40943 the first quarter will contain indexes for the first half
40944 of the op0, the second quarter will contain bit 7 set, third quarter
40945 will contain indexes for the second half of the op0 and the
40946 last quarter bit 7 set. In the second permutation mask
40947 the first quarter will contain bit 7 set, the second quarter
40948 indexes for the first half of the op1, the third quarter bit 7 set
40949 and last quarter indexes for the second half of the op1.
40950 I.e. the first mask e.g. for V32QImode extract even will be:
40951 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
40952 (all values masked with 0xf except for -128) and second mask
40953 for extract even will be
40954 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
40955 m128 = GEN_INT (-128);
40956 for (i = 0; i < nelt; ++i)
40958 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
40959 unsigned which = d->perm[i] >= nelt;
40960 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
40962 for (j = 0; j < eltsz; ++j)
40964 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
40965 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
40969 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
40970 vperm = force_reg (V32QImode, vperm);
40972 l = gen_reg_rtx (V32QImode);
40973 op = gen_lowpart (V32QImode, d->op0);
40974 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
40976 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
40977 vperm = force_reg (V32QImode, vperm);
40979 h = gen_reg_rtx (V32QImode);
40980 op = gen_lowpart (V32QImode, d->op1);
40981 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
40983 ior = gen_reg_rtx (V32QImode);
40984 emit_insn (gen_iorv32qi3 (ior, l, h));
40986 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
40987 op = gen_reg_rtx (V4DImode);
40988 ior = gen_lowpart (V4DImode, ior);
40989 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
40990 const1_rtx, GEN_INT (3)));
40991 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
40993 return true;
40996 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
40997 and extract-odd permutations. */
40999 static bool
41000 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
41002 rtx t1, t2, t3, t4, t5;
41004 switch (d->vmode)
41006 case V4DFmode:
41007 t1 = gen_reg_rtx (V4DFmode);
41008 t2 = gen_reg_rtx (V4DFmode);
41010 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
41011 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
41012 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
41014 /* Now an unpck[lh]pd will produce the result required. */
41015 if (odd)
41016 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
41017 else
41018 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
41019 emit_insn (t3);
41020 break;
41022 case V8SFmode:
41024 int mask = odd ? 0xdd : 0x88;
41026 t1 = gen_reg_rtx (V8SFmode);
41027 t2 = gen_reg_rtx (V8SFmode);
41028 t3 = gen_reg_rtx (V8SFmode);
41030 /* Shuffle within the 128-bit lanes to produce:
41031 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
41032 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
41033 GEN_INT (mask)));
41035 /* Shuffle the lanes around to produce:
41036 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
41037 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
41038 GEN_INT (0x3)));
41040 /* Shuffle within the 128-bit lanes to produce:
41041 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
41042 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
41044 /* Shuffle within the 128-bit lanes to produce:
41045 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
41046 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
41048 /* Shuffle the lanes around to produce:
41049 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
41050 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
41051 GEN_INT (0x20)));
41053 break;
41055 case V2DFmode:
41056 case V4SFmode:
41057 case V2DImode:
41058 case V4SImode:
41059 /* These are always directly implementable by expand_vec_perm_1. */
41060 gcc_unreachable ();
41062 case V8HImode:
41063 if (TARGET_SSSE3)
41064 return expand_vec_perm_pshufb2 (d);
41065 else
41067 /* We need 2*log2(N)-1 operations to achieve odd/even
41068 with interleave. */
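/* Illustration: extracting the even elements of { a0 .. a7 } and
   { b0 .. b7 } takes 2*log2(8)-1 == 5 interleaves.  After the first
   pair the vectors are { a0 b0 a1 b1 a2 b2 a3 b3 } and
   { a4 b4 a5 b5 a6 b6 a7 b7 }; after the second pair
   { a0 a4 b0 b4 a1 a5 b1 b5 } and { a2 a6 b2 b6 a3 a7 b3 b7 };
   the final low interleave yields { a0 a2 a4 a6 b0 b2 b4 b6 }.  */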
41069 t1 = gen_reg_rtx (V8HImode);
41070 t2 = gen_reg_rtx (V8HImode);
41071 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
41072 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
41073 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
41074 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
41075 if (odd)
41076 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
41077 else
41078 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
41079 emit_insn (t3);
41081 break;
41083 case V16QImode:
41084 if (TARGET_SSSE3)
41085 return expand_vec_perm_pshufb2 (d);
41086 else
41088 t1 = gen_reg_rtx (V16QImode);
41089 t2 = gen_reg_rtx (V16QImode);
41090 t3 = gen_reg_rtx (V16QImode);
41091 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
41092 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
41093 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
41094 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
41095 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
41096 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
41097 if (odd)
41098 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
41099 else
41100 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
41101 emit_insn (t3);
41103 break;
41105 case V16HImode:
41106 case V32QImode:
41107 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
41109 case V4DImode:
41110 if (!TARGET_AVX2)
41112 struct expand_vec_perm_d d_copy = *d;
41113 d_copy.vmode = V4DFmode;
41114 d_copy.target = gen_reg_rtx (V4DFmode);
41115 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
41116 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
41117 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
41119 if (!d->testing_p)
41120 emit_move_insn (d->target,
41121 gen_lowpart (V4DImode, d_copy.target));
41122 return true;
41124 return false;
41127 t1 = gen_reg_rtx (V4DImode);
41128 t2 = gen_reg_rtx (V4DImode);
41130 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
41131 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
41132 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
41134 /* Now a vpunpck[lh]qdq will produce the result required. */
41135 if (odd)
41136 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
41137 else
41138 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
41139 emit_insn (t3);
41140 break;
41142 case V8SImode:
41143 if (!TARGET_AVX2)
41145 struct expand_vec_perm_d d_copy = *d;
41146 d_copy.vmode = V8SFmode;
41147 d_copy.target = gen_reg_rtx (V8SFmode);
41148 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
41149 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
41150 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
41152 if (!d->testing_p)
41153 emit_move_insn (d->target,
41154 gen_lowpart (V8SImode, d_copy.target));
41155 return true;
41157 return false;
41160 t1 = gen_reg_rtx (V8SImode);
41161 t2 = gen_reg_rtx (V8SImode);
41162 t3 = gen_reg_rtx (V4DImode);
41163 t4 = gen_reg_rtx (V4DImode);
41164 t5 = gen_reg_rtx (V4DImode);
41166 /* Shuffle the lanes around into
41167 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
41168 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
41169 gen_lowpart (V4DImode, d->op1),
41170 GEN_INT (0x20)));
41171 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
41172 gen_lowpart (V4DImode, d->op1),
41173 GEN_INT (0x31)));
41175 /* Swap the 2nd and 3rd position in each lane into
41176 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
41177 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
41178 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
41179 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
41180 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
41182 /* Now a vpunpck[lh]qdq will produce
41183 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
41184 if (odd)
41185 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
41186 gen_lowpart (V4DImode, t2));
41187 else
41188 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
41189 gen_lowpart (V4DImode, t2));
41190 emit_insn (t3);
41191 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
41192 break;
41194 default:
41195 gcc_unreachable ();
41198 return true;
41201 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
41202 extract-even and extract-odd permutations. */
41204 static bool
41205 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
41207 unsigned i, odd, nelt = d->nelt;
41209 odd = d->perm[0];
41210 if (odd != 0 && odd != 1)
41211 return false;
41213 for (i = 1; i < nelt; ++i)
41214 if (d->perm[i] != 2 * i + odd)
41215 return false;
41217 return expand_vec_perm_even_odd_1 (d, odd);
41220 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
41221 permutations. We assume that expand_vec_perm_1 has already failed. */
41223 static bool
41224 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
41226 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
41227 enum machine_mode vmode = d->vmode;
41228 unsigned char perm2[4];
41229 rtx op0 = d->op0, dest;
41230 bool ok;
41232 switch (vmode)
41234 case V4DFmode:
41235 case V8SFmode:
41236 /* These are special-cased in sse.md so that we can optionally
41237 use the vbroadcast instruction. They expand to two insns
41238 if the input happens to be in a register. */
41239 gcc_unreachable ();
41241 case V2DFmode:
41242 case V2DImode:
41243 case V4SFmode:
41244 case V4SImode:
41245 /* These are always implementable using standard shuffle patterns. */
41246 gcc_unreachable ();
41248 case V8HImode:
41249 case V16QImode:
41250 /* These can be implemented via interleave. We save one insn by
41251 stopping once we have promoted to V4SImode and then using pshufd. */
41254 rtx dest;
41255 rtx (*gen) (rtx, rtx, rtx)
41256 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
41257 : gen_vec_interleave_lowv8hi;
41259 if (elt >= nelt2)
41261 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
41262 : gen_vec_interleave_highv8hi;
41263 elt -= nelt2;
41265 nelt2 /= 2;
41267 dest = gen_reg_rtx (vmode);
41268 emit_insn (gen (dest, op0, op0));
41269 vmode = get_mode_wider_vector (vmode);
41270 op0 = gen_lowpart (vmode, dest);
41272 while (vmode != V4SImode);
41274 memset (perm2, elt, 4);
41275 dest = gen_reg_rtx (V4SImode);
41276 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
41277 gcc_assert (ok);
41278 if (!d->testing_p)
41279 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
41280 return true;
41282 case V32QImode:
41283 case V16HImode:
41284 case V8SImode:
41285 case V4DImode:
41286 /* For AVX2 broadcasts of the first element vpbroadcast* or
41287 vpermq should be used by expand_vec_perm_1. */
41288 gcc_assert (!TARGET_AVX2 || d->perm[0]);
41289 return false;
41291 default:
41292 gcc_unreachable ();
41296 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
41297 broadcast permutations. */
41299 static bool
41300 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
41302 unsigned i, elt, nelt = d->nelt;
41304 if (!d->one_operand_p)
41305 return false;
41307 elt = d->perm[0];
41308 for (i = 1; i < nelt; ++i)
41309 if (d->perm[i] != elt)
41310 return false;
41312 return expand_vec_perm_broadcast_1 (d);
41315 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
41316 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
41317 all the shorter instruction sequences. */
41319 static bool
41320 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
41322 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
41323 unsigned int i, nelt, eltsz;
41324 bool used[4];
41326 if (!TARGET_AVX2
41327 || d->one_operand_p
41328 || (d->vmode != V32QImode && d->vmode != V16HImode))
41329 return false;
41331 if (d->testing_p)
41332 return true;
41334 nelt = d->nelt;
41335 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
41337 /* Generate 4 permutation masks: a same-lane and a cross-lane mask
41338 for each operand. If the required element is within the same lane,
41339 it goes into that operand's same-lane mask. If it comes from the
41340 other lane, it goes into the cross-lane mask, but written into the
41341 opposite lane, so that swapping the two V2TImode halves of that
41342 vpshufb result puts it in place. Unused positions keep bit 7 set,
41343 which makes vpshufb write zero there. */
41344 m128 = GEN_INT (-128);
41345 for (i = 0; i < 32; ++i)
41347 rperm[0][i] = m128;
41348 rperm[1][i] = m128;
41349 rperm[2][i] = m128;
41350 rperm[3][i] = m128;
41352 used[0] = false;
41353 used[1] = false;
41354 used[2] = false;
41355 used[3] = false;
41356 for (i = 0; i < nelt; ++i)
41358 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
41359 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
41360 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
41362 for (j = 0; j < eltsz; ++j)
41363 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
41364 used[which] = true;
41367 for (i = 0; i < 2; ++i)
41369 if (!used[2 * i + 1])
41371 h[i] = NULL_RTX;
41372 continue;
41374 vperm = gen_rtx_CONST_VECTOR (V32QImode,
41375 gen_rtvec_v (32, rperm[2 * i + 1]));
41376 vperm = force_reg (V32QImode, vperm);
41377 h[i] = gen_reg_rtx (V32QImode);
41378 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
41379 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
41383 /* Swap the 128-bit lanes of h[X]. */
41383 for (i = 0; i < 2; ++i)
41385 if (h[i] == NULL_RTX)
41386 continue;
41387 op = gen_reg_rtx (V4DImode);
41388 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
41389 const2_rtx, GEN_INT (3), const0_rtx,
41390 const1_rtx));
41391 h[i] = gen_lowpart (V32QImode, op);
41394 for (i = 0; i < 2; ++i)
41396 if (!used[2 * i])
41398 l[i] = NULL_RTX;
41399 continue;
41401 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
41402 vperm = force_reg (V32QImode, vperm);
41403 l[i] = gen_reg_rtx (V32QImode);
41404 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
41405 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
41408 for (i = 0; i < 2; ++i)
41410 if (h[i] && l[i])
41412 op = gen_reg_rtx (V32QImode);
41413 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
41414 l[i] = op;
41416 else if (h[i])
41417 l[i] = h[i];
41420 gcc_assert (l[0] && l[1]);
41421 op = d->target;
41422 if (d->vmode != V32QImode)
41423 op = gen_reg_rtx (V32QImode);
41424 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
41425 if (op != d->target)
41426 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
41427 return true;
41430 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
41431 With all of the interface bits taken care of, perform the expansion
41432 in D and return true on success. */
41434 static bool
41435 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
41437 /* Try a single instruction expansion. */
41438 if (expand_vec_perm_1 (d))
41439 return true;
41441 /* Try sequences of two instructions. */
41443 if (expand_vec_perm_pshuflw_pshufhw (d))
41444 return true;
41446 if (expand_vec_perm_palignr (d))
41447 return true;
41449 if (expand_vec_perm_interleave2 (d))
41450 return true;
41452 if (expand_vec_perm_broadcast (d))
41453 return true;
41455 if (expand_vec_perm_vpermq_perm_1 (d))
41456 return true;
41458 if (expand_vec_perm_vperm2f128 (d))
41459 return true;
41461 /* Try sequences of three instructions. */
41463 if (expand_vec_perm_2vperm2f128_vshuf (d))
41464 return true;
41466 if (expand_vec_perm_pshufb2 (d))
41467 return true;
41469 if (expand_vec_perm_interleave3 (d))
41470 return true;
41472 if (expand_vec_perm_vperm2f128_vblend (d))
41473 return true;
41475 /* Try sequences of four instructions. */
41477 if (expand_vec_perm_vpshufb2_vpermq (d))
41478 return true;
41480 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
41481 return true;
41483 /* ??? Look for narrow permutations whose element orderings would
41484 allow the promotion to a wider mode. */
41486 /* ??? Look for sequences of interleave or a wider permute that place
41487 the data into the correct lanes for a half-vector shuffle like
41488 pshuf[lh]w or vpermilps. */
41490 /* ??? Look for sequences of interleave that produce the desired results.
41491 The combinatorics of punpck[lh] get pretty ugly... */
41493 if (expand_vec_perm_even_odd (d))
41494 return true;
41496 /* Even longer sequences. */
41497 if (expand_vec_perm_vpshufb4_vpermq2 (d))
41498 return true;
41500 return false;
41503 /* If a permutation only uses one operand, make it clear. Returns true
41504 if the permutation references both operands. */
41506 static bool
41507 canonicalize_perm (struct expand_vec_perm_d *d)
41509 int i, which, nelt = d->nelt;
41511 for (i = which = 0; i < nelt; ++i)
41512 which |= (d->perm[i] < nelt ? 1 : 2);
41514 d->one_operand_p = true;
41515 switch (which)
41517 default:
41518 gcc_unreachable();
41520 case 3:
41521 if (!rtx_equal_p (d->op0, d->op1))
41523 d->one_operand_p = false;
41524 break;
41526 /* The elements of PERM do not suggest that only the first operand
41527 is used, but both operands are identical. Allow easier matching
41528 of the permutation by folding the permutation into the single
41529 input vector. */
41530 /* FALLTHRU */
41532 case 2:
41533 for (i = 0; i < nelt; ++i)
41534 d->perm[i] &= nelt - 1;
41535 d->op0 = d->op1;
41536 break;
41538 case 1:
41539 d->op1 = d->op0;
41540 break;
41543 return (which == 3);
41546 bool
41547 ix86_expand_vec_perm_const (rtx operands[4])
41549 struct expand_vec_perm_d d;
41550 unsigned char perm[MAX_VECT_LEN];
41551 int i, nelt;
41552 bool two_args;
41553 rtx sel;
41555 d.target = operands[0];
41556 d.op0 = operands[1];
41557 d.op1 = operands[2];
41558 sel = operands[3];
41560 d.vmode = GET_MODE (d.target);
41561 gcc_assert (VECTOR_MODE_P (d.vmode));
41562 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41563 d.testing_p = false;
41565 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
41566 gcc_assert (XVECLEN (sel, 0) == nelt);
41567 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
41569 for (i = 0; i < nelt; ++i)
41571 rtx e = XVECEXP (sel, 0, i);
41572 int ei = INTVAL (e) & (2 * nelt - 1);
41573 d.perm[i] = ei;
41574 perm[i] = ei;
41577 two_args = canonicalize_perm (&d);
41579 if (ix86_expand_vec_perm_const_1 (&d))
41580 return true;
41582 /* If the selector says both arguments are needed, but the operands are the
41583 same, the above tried to expand with one_operand_p and flattened selector.
41584 If that didn't work, retry without one_operand_p; we succeeded with that
41585 during testing. */
41586 if (two_args && d.one_operand_p)
41588 d.one_operand_p = false;
41589 memcpy (d.perm, perm, sizeof (perm));
41590 return ix86_expand_vec_perm_const_1 (&d);
41593 return false;
41596 /* Implement targetm.vectorize.vec_perm_const_ok. */
41598 static bool
41599 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
41600 const unsigned char *sel)
41602 struct expand_vec_perm_d d;
41603 unsigned int i, nelt, which;
41604 bool ret;
41606 d.vmode = vmode;
41607 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41608 d.testing_p = true;
41610 /* Given sufficient ISA support we can just return true here
41611 for selected vector modes. */
41612 if (GET_MODE_SIZE (d.vmode) == 16)
41614 /* All implementable with a single vpperm insn. */
41615 if (TARGET_XOP)
41616 return true;
41617 /* All implementable with 2 pshufb + 1 ior. */
41618 if (TARGET_SSSE3)
41619 return true;
41620 /* All implementable with shufpd or unpck[lh]pd. */
41621 if (d.nelt == 2)
41622 return true;
41625 /* Extract the values from the vector CST into the permutation
41626 array in D. */
41627 memcpy (d.perm, sel, nelt);
41628 for (i = which = 0; i < nelt; ++i)
41630 unsigned char e = d.perm[i];
41631 gcc_assert (e < 2 * nelt);
41632 which |= (e < nelt ? 1 : 2);
41635 /* For all elements from the second vector, fold them to the first. */
41636 if (which == 2)
41637 for (i = 0; i < nelt; ++i)
41638 d.perm[i] -= nelt;
41640 /* Check whether the mask can be applied to the vector type. */
41641 d.one_operand_p = (which != 3);
41643 /* Implementable with shufps or pshufd. */
41644 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
41645 return true;
41647 /* Otherwise we have to go through the motions and see if we can
41648 figure out how to generate the requested permutation. */
41649 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
41650 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
41651 if (!d.one_operand_p)
41652 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
41654 start_sequence ();
41655 ret = ix86_expand_vec_perm_const_1 (&d);
41656 end_sequence ();
41658 return ret;
41661 void
41662 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
41664 struct expand_vec_perm_d d;
41665 unsigned i, nelt;
41667 d.target = targ;
41668 d.op0 = op0;
41669 d.op1 = op1;
41670 d.vmode = GET_MODE (targ);
41671 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41672 d.one_operand_p = false;
41673 d.testing_p = false;
41675 for (i = 0; i < nelt; ++i)
41676 d.perm[i] = i * 2 + odd;
41678 /* We'll either be able to implement the permutation directly... */
41679 if (expand_vec_perm_1 (&d))
41680 return;
41682 /* ... or we use the special-case patterns. */
41683 expand_vec_perm_even_odd_1 (&d, odd);
41686 static void
41687 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
41689 struct expand_vec_perm_d d;
41690 unsigned i, nelt, base;
41691 bool ok;
41693 d.target = targ;
41694 d.op0 = op0;
41695 d.op1 = op1;
41696 d.vmode = GET_MODE (targ);
41697 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41698 d.one_operand_p = false;
41699 d.testing_p = false;
41701 base = high_p ? nelt / 2 : 0;
41702 for (i = 0; i < nelt / 2; ++i)
41704 d.perm[i * 2] = i + base;
41705 d.perm[i * 2 + 1] = i + base + nelt;
41708 /* Note that for AVX this isn't one instruction. */
41709 ok = ix86_expand_vec_perm_const_1 (&d);
41710 gcc_assert (ok);
41714 /* Expand a vector operation CODE for a V*QImode in terms of the
41715 same operation on V*HImode. */
41717 void
41718 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
41720 enum machine_mode qimode = GET_MODE (dest);
41721 enum machine_mode himode;
41722 rtx (*gen_il) (rtx, rtx, rtx);
41723 rtx (*gen_ih) (rtx, rtx, rtx);
41724 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
41725 struct expand_vec_perm_d d;
41726 bool ok, full_interleave;
41727 bool uns_p = false;
41728 int i;
41730 switch (qimode)
41732 case V16QImode:
41733 himode = V8HImode;
41734 gen_il = gen_vec_interleave_lowv16qi;
41735 gen_ih = gen_vec_interleave_highv16qi;
41736 break;
41737 case V32QImode:
41738 himode = V16HImode;
41739 gen_il = gen_avx2_interleave_lowv32qi;
41740 gen_ih = gen_avx2_interleave_highv32qi;
41741 break;
41742 default:
41743 gcc_unreachable ();
41746 op2_l = op2_h = op2;
41747 switch (code)
41749 case MULT:
41750 /* Unpack data such that we've got a source byte in each low byte of
41751 each word. We don't care what goes into the high byte of each word.
41752 Rather than trying to get zero in there, most convenient is to let
41753 it be a copy of the low byte. */
41754 op2_l = gen_reg_rtx (qimode);
41755 op2_h = gen_reg_rtx (qimode);
41756 emit_insn (gen_il (op2_l, op2, op2));
41757 emit_insn (gen_ih (op2_h, op2, op2));
41758 /* FALLTHRU */
41760 op1_l = gen_reg_rtx (qimode);
41761 op1_h = gen_reg_rtx (qimode);
41762 emit_insn (gen_il (op1_l, op1, op1));
41763 emit_insn (gen_ih (op1_h, op1, op1));
41764 full_interleave = qimode == V16QImode;
41765 break;
41767 case ASHIFT:
41768 case LSHIFTRT:
41769 uns_p = true;
41770 /* FALLTHRU */
41771 case ASHIFTRT:
41772 op1_l = gen_reg_rtx (himode);
41773 op1_h = gen_reg_rtx (himode);
41774 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
41775 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
41776 full_interleave = true;
41777 break;
41778 default:
41779 gcc_unreachable ();
41782 /* Perform the operation. */
41783 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
41784 1, OPTAB_DIRECT);
41785 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
41786 1, OPTAB_DIRECT);
41787 gcc_assert (res_l && res_h);
41789 /* Merge the data back into the right place. */
41790 d.target = dest;
41791 d.op0 = gen_lowpart (qimode, res_l);
41792 d.op1 = gen_lowpart (qimode, res_h);
41793 d.vmode = qimode;
41794 d.nelt = GET_MODE_NUNITS (qimode);
41795 d.one_operand_p = false;
41796 d.testing_p = false;
41798 if (full_interleave)
41800 /* For SSE2, we used a full interleave, so the desired
41801 results are in the even elements. */
41802 for (i = 0; i < 32; ++i)
41803 d.perm[i] = i * 2;
41805 else
41807 /* For AVX, the interleave used above was not cross-lane. So the
41808 extraction picks even elements, with the second and third quarters
41809 swapped. Happily, that is even one insn shorter than even extraction. */
41810 for (i = 0; i < 32; ++i)
41811 d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
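/* For V32QImode the formula above gives (illustration)
   perm = { 0, 2, ..., 14,  32, 34, ..., 46,  16, 18, ..., 30,
   48, 50, ..., 62 }: the even bytes of res_l lane 0, res_h lane 0,
   res_l lane 1 and res_h lane 1, matching where the per-lane AVX2
   interleaves left each group of results.  */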
41814 ok = ix86_expand_vec_perm_const_1 (&d);
41815 gcc_assert (ok);
41817 set_unique_reg_note (get_last_insn (), REG_EQUAL,
41818 gen_rtx_fmt_ee (code, qimode, op1, op2));
41821 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
41822 if op is CONST_VECTOR with all odd elements equal to their
41823 preceding element. */
41825 static bool
41826 const_vector_equal_evenodd_p (rtx op)
41828 enum machine_mode mode = GET_MODE (op);
41829 int i, nunits = GET_MODE_NUNITS (mode);
41830 if (GET_CODE (op) != CONST_VECTOR
41831 || nunits != CONST_VECTOR_NUNITS (op))
41832 return false;
41833 for (i = 0; i < nunits; i += 2)
41834 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
41835 return false;
41836 return true;
41839 void
41840 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
41841 bool uns_p, bool odd_p)
41843 enum machine_mode mode = GET_MODE (op1);
41844 enum machine_mode wmode = GET_MODE (dest);
41845 rtx x;
41846 rtx orig_op1 = op1, orig_op2 = op2;
41848 if (!nonimmediate_operand (op1, mode))
41849 op1 = force_reg (mode, op1);
41850 if (!nonimmediate_operand (op2, mode))
41851 op2 = force_reg (mode, op2);
41853 /* We only play even/odd games with vectors of SImode. */
41854 gcc_assert (mode == V4SImode || mode == V8SImode);
41856 /* If we're looking for the odd results, shift those members down to
41857 the even slots. For some CPUs this is faster than a PSHUFD. */
41858 if (odd_p)
41860 /* For XOP use vpmacsdqh, but only for smult, as it is only
41861 signed. */
41862 if (TARGET_XOP && mode == V4SImode && !uns_p)
41864 x = force_reg (wmode, CONST0_RTX (wmode));
41865 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
41866 return;
41869 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
41870 if (!const_vector_equal_evenodd_p (orig_op1))
41871 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
41872 x, NULL, 1, OPTAB_DIRECT);
41873 if (!const_vector_equal_evenodd_p (orig_op2))
41874 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
41875 x, NULL, 1, OPTAB_DIRECT);
41876 op1 = gen_lowpart (mode, op1);
41877 op2 = gen_lowpart (mode, op2);
41880 if (mode == V8SImode)
41882 if (uns_p)
41883 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
41884 else
41885 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
41887 else if (uns_p)
41888 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
41889 else if (TARGET_SSE4_1)
41890 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
41891 else
41893 rtx s1, s2, t0, t1, t2;
41895 /* The easiest way to implement this without PMULDQ is to go through
41896 the motions as if we were performing a full 64-bit multiply, except
41897 that we need to do less shuffling of the elements. */
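/* Sketch of the identity used below (illustration): writing the
   32-bit inputs as a = ua - 2^32*sa and b = ub - 2^32*sb, where sa/sb
   are the sign bits, a*b == ua*ub - 2^32*(sa*ub + sb*ua) (mod 2^64).
   The compares below produce s1/s2 == 0xffffffff exactly when the
   sign bit is set, and ((s1*ub + s2*ua) << 32) == -2^32*(sa*ub + sb*ua)
   (mod 2^64), so adding it to the unsigned product t0 yields the
   signed product.  */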
41899 /* Compute the sign-extension, aka highparts, of the two operands. */
41900 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
41901 op1, pc_rtx, pc_rtx);
41902 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
41903 op2, pc_rtx, pc_rtx);
41905 /* Multiply LO(A) * HI(B), and vice-versa. */
41906 t1 = gen_reg_rtx (wmode);
41907 t2 = gen_reg_rtx (wmode);
41908 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
41909 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
41911 /* Multiply LO(A) * LO(B). */
41912 t0 = gen_reg_rtx (wmode);
41913 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
41915 /* Combine and shift the highparts into place. */
41916 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
41917 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
41918 1, OPTAB_DIRECT);
41920 /* Combine high and low parts. */
41921 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
41922 return;
41924 emit_insn (x);
41927 void
41928 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
41929 bool uns_p, bool high_p)
41931 enum machine_mode wmode = GET_MODE (dest);
41932 enum machine_mode mode = GET_MODE (op1);
41933 rtx t1, t2, t3, t4, mask;
41935 switch (mode)
41937 case V4SImode:
41938 t1 = gen_reg_rtx (mode);
41939 t2 = gen_reg_rtx (mode);
41940 if (TARGET_XOP && !uns_p)
41942 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
41943 shuffle the elements once so that all elements are in the right
41944 place for immediate use: { A C B D }. */
41945 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
41946 const1_rtx, GEN_INT (3)));
41947 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
41948 const1_rtx, GEN_INT (3)));
41950 else
41952 /* Put the elements into place for the multiply. */
41953 ix86_expand_vec_interleave (t1, op1, op1, high_p);
41954 ix86_expand_vec_interleave (t2, op2, op2, high_p);
41955 high_p = false;
41957 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
41958 break;
41960 case V8SImode:
41961 /* Shuffle the elements between the lanes. After this we
41962 have { A B E F | C D G H } for each operand. */
41963 t1 = gen_reg_rtx (V4DImode);
41964 t2 = gen_reg_rtx (V4DImode);
41965 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
41966 const0_rtx, const2_rtx,
41967 const1_rtx, GEN_INT (3)));
41968 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
41969 const0_rtx, const2_rtx,
41970 const1_rtx, GEN_INT (3)));
41972 /* Shuffle the elements within the lanes. After this we
41973 have { A A B B | C C D D } or { E E F F | G G H H }. */
41974 t3 = gen_reg_rtx (V8SImode);
41975 t4 = gen_reg_rtx (V8SImode);
41976 mask = GEN_INT (high_p
41977 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
41978 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
41979 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
41980 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
41982 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
41983 break;
41985 case V8HImode:
41986 case V16HImode:
41987 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
41988 uns_p, OPTAB_DIRECT);
41989 t2 = expand_binop (mode,
41990 uns_p ? umul_highpart_optab : smul_highpart_optab,
41991 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
41992 gcc_assert (t1 && t2);
41994 t3 = gen_reg_rtx (mode);
41995 ix86_expand_vec_interleave (t3, t1, t2, high_p);
41996 emit_move_insn (dest, gen_lowpart (wmode, t3));
41997 break;
41999 case V16QImode:
42000 case V32QImode:
42001 t1 = gen_reg_rtx (wmode);
42002 t2 = gen_reg_rtx (wmode);
42003 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
42004 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
42006 emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
42007 break;
42009 default:
42010 gcc_unreachable ();
42014 void
42015 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
42017 rtx res_1, res_2, res_3, res_4;
42019 res_1 = gen_reg_rtx (V4SImode);
42020 res_2 = gen_reg_rtx (V4SImode);
42021 res_3 = gen_reg_rtx (V2DImode);
42022 res_4 = gen_reg_rtx (V2DImode);
42023 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
42024 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
42026 /* Move the results in element 2 down to element 1; we don't care
42027 what goes in elements 2 and 3. Then we can merge the parts
42028 back together with an interleave.
42030 Note that two other sequences were tried:
42031 (1) Use interleaves at the start instead of psrldq, which allows
42032 us to use a single shufps to merge things back at the end.
42033 (2) Use shufps here to combine the two vectors, then pshufd to
42034 put the elements in the correct order.
42035 In both cases the cost of the reformatting stall was too high
42036 and the overall sequence slower. */
42038 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
42039 const0_rtx, const2_rtx,
42040 const0_rtx, const0_rtx));
42041 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
42042 const0_rtx, const2_rtx,
42043 const0_rtx, const0_rtx));
42044 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
42046 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
42049 void
42050 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
42052 enum machine_mode mode = GET_MODE (op0);
42053 rtx t1, t2, t3, t4, t5, t6;
42055 if (TARGET_XOP && mode == V2DImode)
42057 /* op1: A,B,C,D, op2: E,F,G,H */
42058 op1 = gen_lowpart (V4SImode, op1);
42059 op2 = gen_lowpart (V4SImode, op2);
42061 t1 = gen_reg_rtx (V4SImode);
42062 t2 = gen_reg_rtx (V4SImode);
42063 t3 = gen_reg_rtx (V2DImode);
42064 t4 = gen_reg_rtx (V2DImode);
42066 /* t1: B,A,D,C */
42067 emit_insn (gen_sse2_pshufd_1 (t1, op1,
42068 GEN_INT (1),
42069 GEN_INT (0),
42070 GEN_INT (3),
42071 GEN_INT (2)));
42073 /* t2: (B*E),(A*F),(D*G),(C*H) */
42074 emit_insn (gen_mulv4si3 (t2, t1, op2));
42076 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
42077 emit_insn (gen_xop_phadddq (t3, t2));
42079 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
42080 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
42082 /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
42083 emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
42085 else
42087 enum machine_mode nmode;
42088 rtx (*umul) (rtx, rtx, rtx);
42090 if (mode == V2DImode)
42092 umul = gen_vec_widen_umult_even_v4si;
42093 nmode = V4SImode;
42095 else if (mode == V4DImode)
42097 umul = gen_vec_widen_umult_even_v8si;
42098 nmode = V8SImode;
42100 else
42101 gcc_unreachable ();
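/* The sequence below relies on the identity (illustration):
   (2^32*h1 + l1) * (2^32*h2 + l2)
     == l1*l2 + 2^32*(h1*l2 + h2*l1)  (mod 2^64),
   since the h1*h2 term is shifted entirely out of the 64-bit result.  */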
42104 /* Multiply low parts. */
42105 t1 = gen_reg_rtx (mode);
42106 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
42108 /* Shift input vectors right 32 bits so we can multiply high parts. */
42109 t6 = GEN_INT (32);
42110 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
42111 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
42113 /* Multiply high parts by low parts. */
42114 t4 = gen_reg_rtx (mode);
42115 t5 = gen_reg_rtx (mode);
42116 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
42117 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
42119 /* Combine and shift the highparts back. */
42120 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
42121 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
42123 /* Combine high and low parts. */
42124 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
42127 set_unique_reg_note (get_last_insn (), REG_EQUAL,
42128 gen_rtx_MULT (mode, op1, op2));
42131 /* Return 1 if control transfer instruction INSN
42132 should be encoded with the bnd prefix.
42133 If insn is NULL then return 1 when control
42134 transfer instructions should be prefixed with
42135 bnd by default for the current function. */
42137 bool
42138 ix86_bnd_prefixed_insn_p (rtx insn ATTRIBUTE_UNUSED)
42140 return false;
42143 /* Calculate integer abs() using only SSE2 instructions. */
42145 void
42146 ix86_expand_sse2_abs (rtx target, rtx input)
42148 enum machine_mode mode = GET_MODE (target);
42149 rtx tmp0, tmp1, x;
42151 switch (mode)
42153 /* For 32-bit signed integer X, the best way to calculate the absolute
42154 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
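/* E.g. for X == -5: X >> 31 == -1 (all ones), (-1 ^ -5) == 4, and
   4 - (-1) == 5; for X >= 0 the shift yields 0 and the expression
   is just X.  */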
42155 case V4SImode:
42156 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
42157 GEN_INT (GET_MODE_BITSIZE
42158 (GET_MODE_INNER (mode)) - 1),
42159 NULL, 0, OPTAB_DIRECT);
42160 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
42161 NULL, 0, OPTAB_DIRECT);
42162 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
42163 target, 0, OPTAB_DIRECT);
42164 break;
42166 /* For 16-bit signed integer X, the best way to calculate the absolute
42167 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
42168 case V8HImode:
42169 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
42171 x = expand_simple_binop (mode, SMAX, tmp0, input,
42172 target, 0, OPTAB_DIRECT);
42173 break;
42175 /* For 8-bit signed integer X, the best way to calculate the absolute
42176 value of X is min ((unsigned char) X, (unsigned char) (-X)),
42177 as SSE2 provides the PMINUB insn. */
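/* E.g. for X == -5: (unsigned char) X == 251 and (unsigned char) -X == 5,
   so the minimum is 5; for X >= 0 the negated value wraps to 256 - X,
   which is never smaller, so the minimum is X itself.  */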
42178 case V16QImode:
42179 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
42181 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
42182 target, 0, OPTAB_DIRECT);
42183 break;
42185 default:
42186 gcc_unreachable ();
42189 if (x != target)
42190 emit_move_insn (target, x);
42193 /* Expand an insert into a vector register through pinsr insn.
42194 Return true if successful. */
42196 bool
42197 ix86_expand_pinsr (rtx *operands)
42199 rtx dst = operands[0];
42200 rtx src = operands[3];
42202 unsigned int size = INTVAL (operands[1]);
42203 unsigned int pos = INTVAL (operands[2]);
42205 if (GET_CODE (dst) == SUBREG)
42207 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
42208 dst = SUBREG_REG (dst);
42211 if (GET_CODE (src) == SUBREG)
42212 src = SUBREG_REG (src);
42214 switch (GET_MODE (dst))
42216 case V16QImode:
42217 case V8HImode:
42218 case V4SImode:
42219 case V2DImode:
42221 enum machine_mode srcmode, dstmode;
42222 rtx (*pinsr)(rtx, rtx, rtx, rtx);
42224 srcmode = mode_for_size (size, MODE_INT, 0);
42226 switch (srcmode)
42228 case QImode:
42229 if (!TARGET_SSE4_1)
42230 return false;
42231 dstmode = V16QImode;
42232 pinsr = gen_sse4_1_pinsrb;
42233 break;
42235 case HImode:
42236 if (!TARGET_SSE2)
42237 return false;
42238 dstmode = V8HImode;
42239 pinsr = gen_sse2_pinsrw;
42240 break;
42242 case SImode:
42243 if (!TARGET_SSE4_1)
42244 return false;
42245 dstmode = V4SImode;
42246 pinsr = gen_sse4_1_pinsrd;
42247 break;
42249 case DImode:
42250 gcc_assert (TARGET_64BIT);
42251 if (!TARGET_SSE4_1)
42252 return false;
42253 dstmode = V2DImode;
42254 pinsr = gen_sse4_1_pinsrq;
42255 break;
42257 default:
42258 return false;
42261 rtx d = dst;
42262 if (GET_MODE (dst) != dstmode)
42263 d = gen_reg_rtx (dstmode);
42264 src = gen_lowpart (srcmode, src);
42266 pos /= size;
42268 emit_insn (pinsr (d, gen_lowpart (dstmode, dst), src,
42269 GEN_INT (1 << pos)));
42270 if (d != dst)
42271 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
42272 return true;
42275 default:
42276 return false;
42280 /* This function returns the calling-ABI-specific va_list type node.
42281 It returns the FNDECL specific va_list type. */
42283 static tree
42284 ix86_fn_abi_va_list (tree fndecl)
42286 if (!TARGET_64BIT)
42287 return va_list_type_node;
42288 gcc_assert (fndecl != NULL_TREE);
42290 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
42291 return ms_va_list_type_node;
42292 else
42293 return sysv_va_list_type_node;
42296 /* Returns the canonical va_list type specified by TYPE. If there
42297 is no valid TYPE provided, it returns NULL_TREE. */
42299 static tree
42300 ix86_canonical_va_list_type (tree type)
42302 tree wtype, htype;
42304 /* Resolve references and pointers to va_list type. */
42305 if (TREE_CODE (type) == MEM_REF)
42306 type = TREE_TYPE (type);
42307 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
42308 type = TREE_TYPE (type);
42309 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
42310 type = TREE_TYPE (type);
42312 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
42314 wtype = va_list_type_node;
42315 gcc_assert (wtype != NULL_TREE);
42316 htype = type;
42317 if (TREE_CODE (wtype) == ARRAY_TYPE)
42319 /* If va_list is an array type, the argument may have decayed
42320 to a pointer type, e.g. by being passed to another function.
42321 In that case, unwrap both types so that we can compare the
42322 underlying records. */
42323 if (TREE_CODE (htype) == ARRAY_TYPE
42324 || POINTER_TYPE_P (htype))
42326 wtype = TREE_TYPE (wtype);
42327 htype = TREE_TYPE (htype);
42330 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
42331 return va_list_type_node;
42332 wtype = sysv_va_list_type_node;
42333 gcc_assert (wtype != NULL_TREE);
42334 htype = type;
42335 if (TREE_CODE (wtype) == ARRAY_TYPE)
42337 /* If va_list is an array type, the argument may have decayed
42338 to a pointer type, e.g. by being passed to another function.
42339 In that case, unwrap both types so that we can compare the
42340 underlying records. */
42341 if (TREE_CODE (htype) == ARRAY_TYPE
42342 || POINTER_TYPE_P (htype))
42344 wtype = TREE_TYPE (wtype);
42345 htype = TREE_TYPE (htype);
42348 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
42349 return sysv_va_list_type_node;
42350 wtype = ms_va_list_type_node;
42351 gcc_assert (wtype != NULL_TREE);
42352 htype = type;
42353 if (TREE_CODE (wtype) == ARRAY_TYPE)
42355 /* If va_list is an array type, the argument may have decayed
42356 to a pointer type, e.g. by being passed to another function.
42357 In that case, unwrap both types so that we can compare the
42358 underlying records. */
42359 if (TREE_CODE (htype) == ARRAY_TYPE
42360 || POINTER_TYPE_P (htype))
42362 wtype = TREE_TYPE (wtype);
42363 htype = TREE_TYPE (htype);
42366 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
42367 return ms_va_list_type_node;
42368 return NULL_TREE;
42370 return std_canonical_va_list_type (type);
42373 /* Iterate through the target-specific builtin types for va_list.
42374 IDX denotes the iterator, *PTREE is set to the result type of
42375 the va_list builtin, and *PNAME to its internal type.
42376 Returns zero if there is no element for this index, otherwise
42377 IDX should be increased upon the next call.
42378 Note, do not iterate a base builtin's name like __builtin_va_list.
42379 Used from c_common_nodes_and_builtins. */
42381 static int
42382 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
42384 if (TARGET_64BIT)
42386 switch (idx)
42388 default:
42389 break;
42391 case 0:
42392 *ptree = ms_va_list_type_node;
42393 *pname = "__builtin_ms_va_list";
42394 return 1;
42396 case 1:
42397 *ptree = sysv_va_list_type_node;
42398 *pname = "__builtin_sysv_va_list";
42399 return 1;
42403 return 0;
42406 #undef TARGET_SCHED_DISPATCH
42407 #define TARGET_SCHED_DISPATCH has_dispatch
42408 #undef TARGET_SCHED_DISPATCH_DO
42409 #define TARGET_SCHED_DISPATCH_DO do_dispatch
42410 #undef TARGET_SCHED_REASSOCIATION_WIDTH
42411 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
42412 #undef TARGET_SCHED_REORDER
42413 #define TARGET_SCHED_REORDER ix86_sched_reorder
42414 #undef TARGET_SCHED_ADJUST_PRIORITY
42415 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
42416 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
42417 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
42418 ix86_dependencies_evaluation_hook
42420 /* The size of the dispatch window is the total number of bytes of
42421 object code allowed in a window. */
42422 #define DISPATCH_WINDOW_SIZE 16
42424 /* Number of dispatch windows considered for scheduling. */
42425 #define MAX_DISPATCH_WINDOWS 3
42427 /* Maximum number of instructions in a window. */
42428 #define MAX_INSN 4
42430 /* Maximum number of immediate operands in a window. */
42431 #define MAX_IMM 4
42433 /* Maximum number of immediate bits allowed in a window. */
42434 #define MAX_IMM_SIZE 128
42436 /* Maximum number of 32 bit immediates allowed in a window. */
42437 #define MAX_IMM_32 4
42439 /* Maximum number of 64 bit immediates allowed in a window. */
42440 #define MAX_IMM_64 2
42442 /* Maximum total of loads or prefetches allowed in a window. */
42443 #define MAX_LOAD 2
42445 /* Maximum total of stores allowed in a window. */
42446 #define MAX_STORE 1
42448 #undef BIG
42449 #define BIG 100
42452 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
42453 enum dispatch_group {
42454 disp_no_group = 0,
42455 disp_load,
42456 disp_store,
42457 disp_load_store,
42458 disp_prefetch,
42459 disp_imm,
42460 disp_imm_32,
42461 disp_imm_64,
42462 disp_branch,
42463 disp_cmp,
42464 disp_jcc,
42465 disp_last
42468 /* Number of allowable groups in a dispatch window. It is an array
42469 indexed by dispatch_group enum. 100 is used as a big number,
42470 because the number of these kinds of operations does not have any
42471 effect in a dispatch window, but we need them for other reasons in
42472 the table. */
42473 static unsigned int num_allowable_groups[disp_last] = {
42474 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
42477 char group_name[disp_last + 1][16] = {
42478 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
42479 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
42480 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
42483 /* Instruction path. */
42484 enum insn_path {
42485 no_path = 0,
42486 path_single, /* Single micro op. */
42487 path_double, /* Double micro op. */
42488 path_multi, /* Instructions with more than 2 micro ops. */
42489 last_path
42492 /* sched_insn_info defines a window to the instructions scheduled in
42493 the basic block. It contains a pointer to the insn_info table and
42494 the instruction scheduled.
42496 Windows are allocated for each basic block and are linked
42497 together. */
42498 typedef struct sched_insn_info_s {
42499 rtx insn;
42500 enum dispatch_group group;
42501 enum insn_path path;
42502 int byte_len;
42503 int imm_bytes;
42504 } sched_insn_info;
42506 /* Linked list of dispatch windows. This is a two-way list of
42507 dispatch windows of a basic block. It contains information about
42508 the number of uops in the window and the total number of
42509 instructions and of bytes in the object code for this dispatch
42510 window. */
42511 typedef struct dispatch_windows_s {
42512 int num_insn; /* Number of insns in the window. */
42513 int num_uops; /* Number of uops in the window. */
42514 int window_size; /* Number of bytes in the window. */
42515 int window_num; /* Window number, either 0 or 1. */
42516 int num_imm; /* Number of immediates in an insn. */
42517 int num_imm_32; /* Number of 32 bit immediates in an insn. */
42518 int num_imm_64; /* Number of 64 bit immediates in an insn. */
42519 int imm_size; /* Total immediates in the window. */
42520 int num_loads; /* Total memory loads in the window. */
42521 int num_stores; /* Total memory stores in the window. */
42522 int violation; /* Violation exists in window. */
42523 sched_insn_info *window; /* Pointer to the window. */
42524 struct dispatch_windows_s *next;
42525 struct dispatch_windows_s *prev;
42526 } dispatch_windows;
42528 /* Immediate values used in an insn. */
42529 typedef struct imm_info_s
42531 int imm;
42532 int imm32;
42533 int imm64;
42534 } imm_info;
42536 static dispatch_windows *dispatch_window_list;
42537 static dispatch_windows *dispatch_window_list1;
42539 /* Get dispatch group of insn. */
42541 static enum dispatch_group
42542 get_mem_group (rtx insn)
42544 enum attr_memory memory;
42546 if (INSN_CODE (insn) < 0)
42547 return disp_no_group;
42548 memory = get_attr_memory (insn);
42549 if (memory == MEMORY_STORE)
42550 return disp_store;
42552 if (memory == MEMORY_LOAD)
42553 return disp_load;
42555 if (memory == MEMORY_BOTH)
42556 return disp_load_store;
42558 return disp_no_group;
42561 /* Return true if insn is a compare instruction. */
42563 static bool
42564 is_cmp (rtx insn)
42566 enum attr_type type;
42568 type = get_attr_type (insn);
42569 return (type == TYPE_TEST
42570 || type == TYPE_ICMP
42571 || type == TYPE_FCMP
42572 || GET_CODE (PATTERN (insn)) == COMPARE);
42575 /* Return true if a dispatch violation was encountered. */
42577 static bool
42578 dispatch_violation (void)
42580 if (dispatch_window_list->next)
42581 return dispatch_window_list->next->violation;
42582 return dispatch_window_list->violation;
42585 /* Return true if insn is a branch instruction. */
42587 static bool
42588 is_branch (rtx insn)
42590 return (CALL_P (insn) || JUMP_P (insn));
42593 /* Return true if insn is a prefetch instruction. */
42595 static bool
42596 is_prefetch (rtx insn)
42598 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
42601 /* This function initializes a dispatch window and the list container holding a
42602 pointer to the window. */
42604 static void
42605 init_window (int window_num)
42607 int i;
42608 dispatch_windows *new_list;
42610 if (window_num == 0)
42611 new_list = dispatch_window_list;
42612 else
42613 new_list = dispatch_window_list1;
42615 new_list->num_insn = 0;
42616 new_list->num_uops = 0;
42617 new_list->window_size = 0;
42618 new_list->next = NULL;
42619 new_list->prev = NULL;
42620 new_list->window_num = window_num;
42621 new_list->num_imm = 0;
42622 new_list->num_imm_32 = 0;
42623 new_list->num_imm_64 = 0;
42624 new_list->imm_size = 0;
42625 new_list->num_loads = 0;
42626 new_list->num_stores = 0;
42627 new_list->violation = false;
42629 for (i = 0; i < MAX_INSN; i++)
42631 new_list->window[i].insn = NULL;
42632 new_list->window[i].group = disp_no_group;
42633 new_list->window[i].path = no_path;
42634 new_list->window[i].byte_len = 0;
42635 new_list->window[i].imm_bytes = 0;
42637 return;
42640 /* This function allocates and initializes a dispatch window and the
42641 list container holding a pointer to the window. */
42643 static dispatch_windows *
42644 allocate_window (void)
42646 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
42647 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
42649 return new_list;
42652 /* This routine initializes the dispatch scheduling information. It
42653 initiates building dispatch scheduler tables and constructs the
42654 first dispatch window. */
42656 static void
42657 init_dispatch_sched (void)
42659 /* Allocate a dispatch list and a window. */
42660 dispatch_window_list = allocate_window ();
42661 dispatch_window_list1 = allocate_window ();
42662 init_window (0);
42663 init_window (1);
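/* Two window descriptors are kept alive at all times:
   dispatch_window_list models window 0 and dispatch_window_list1 models
   window 1; linked together they cover at most 48 bytes of instructions
   (see allocate_next_window and process_end_window).  */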
42666 /* This function returns true if a branch is detected. End of a basic block
42667 does not have to be a branch, but here we assume only branches end a
42668 window. */
42670 static bool
42671 is_end_basic_block (enum dispatch_group group)
42673 return group == disp_branch;
42676 /* This function is called when the end of a window processing is reached. */
42678 static void
42679 process_end_window (void)
42681 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
42682 if (dispatch_window_list->next)
42684 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
42685 gcc_assert (dispatch_window_list->window_size
42686 + dispatch_window_list1->window_size <= 48);
42687 init_window (1);
42689 init_window (0);
42692 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
42693 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
42694 for 48 bytes of instructions. Note that these windows are not dispatch
42695 windows whose sizes are DISPATCH_WINDOW_SIZE. */
42697 static dispatch_windows *
42698 allocate_next_window (int window_num)
42700 if (window_num == 0)
42702 if (dispatch_window_list->next)
42703 init_window (1);
42704 init_window (0);
42705 return dispatch_window_list;
42708 dispatch_window_list->next = dispatch_window_list1;
42709 dispatch_window_list1->prev = dispatch_window_list;
42711 return dispatch_window_list1;
42714 /* Increment the number of immediate operands of an instruction. */
42716 static int
42717 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
42719 if (*in_rtx == 0)
42720 return 0;
42722 switch (GET_CODE (*in_rtx))
42724 case CONST:
42725 case SYMBOL_REF:
42726 case CONST_INT:
42727 (imm_values->imm)++;
42728 if (x86_64_immediate_operand (*in_rtx, SImode))
42729 (imm_values->imm32)++;
42730 else
42731 (imm_values->imm64)++;
42732 break;
42734 case CONST_DOUBLE:
42735 (imm_values->imm)++;
42736 (imm_values->imm64)++;
42737 break;
42739 case CODE_LABEL:
42740 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
42742 (imm_values->imm)++;
42743 (imm_values->imm32)++;
42745 break;
42747 default:
42748 break;
42751 return 0;
42754 /* Compute number of immediate operands of an instruction. */
42756 static void
42757 find_constant (rtx in_rtx, imm_info *imm_values)
42759 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
42760 (rtx_function) find_constant_1, (void *) imm_values);
42763 /* Return total size of immediate operands of an instruction along with number
42764 of corresponding immediate-operands. It initializes its parameters to zero
42765 before calling FIND_CONSTANT.
42766 INSN is the input instruction. IMM is the total of immediates.
42767 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
42768 bit immediates. */
42770 static int
42771 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
42773 imm_info imm_values = {0, 0, 0};
42775 find_constant (insn, &imm_values);
42776 *imm = imm_values.imm;
42777 *imm32 = imm_values.imm32;
42778 *imm64 = imm_values.imm64;
42779 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
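/* For example, an insn with one 32-bit and one 64-bit immediate yields
   *imm = 2, *imm32 = 1, *imm64 = 1 and a return value of
   1 * 4 + 1 * 8 = 12 bytes.  */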
42782 /* This function indicates whether an instruction has any immediate
42783 operands. */
42785 static bool
42786 has_immediate (rtx insn)
42788 int num_imm_operand;
42789 int num_imm32_operand;
42790 int num_imm64_operand;
42792 if (insn)
42793 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
42794 &num_imm64_operand);
42795 return false;
42798 /* Return single or double path for instructions. */
42800 static enum insn_path
42801 get_insn_path (rtx insn)
42803 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
42805 if ((int)path == 0)
42806 return path_single;
42808 if ((int)path == 1)
42809 return path_double;
42811 return path_multi;
42814 /* Return insn dispatch group. */
42816 static enum dispatch_group
42817 get_insn_group (rtx insn)
42819 enum dispatch_group group = get_mem_group (insn);
42820 if (group)
42821 return group;
42823 if (is_branch (insn))
42824 return disp_branch;
42826 if (is_cmp (insn))
42827 return disp_cmp;
42829 if (has_immediate (insn))
42830 return disp_imm;
42832 if (is_prefetch (insn))
42833 return disp_prefetch;
42835 return disp_no_group;
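/* The classification above is ordered: memory groups take precedence over
   branches, branches over compares, compares over immediates and
   immediates over prefetches; anything else falls into disp_no_group.  */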
42838 /* Count number of GROUP restricted instructions in a dispatch
42839 window WINDOW_LIST. */
42841 static int
42842 count_num_restricted (rtx insn, dispatch_windows *window_list)
42844 enum dispatch_group group = get_insn_group (insn);
42845 int imm_size;
42846 int num_imm_operand;
42847 int num_imm32_operand;
42848 int num_imm64_operand;
42850 if (group == disp_no_group)
42851 return 0;
42853 if (group == disp_imm)
42855 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
42856 &num_imm64_operand);
42857 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
42858 || num_imm_operand + window_list->num_imm > MAX_IMM
42859 || (num_imm32_operand > 0
42860 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
42861 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
42862 || (num_imm64_operand > 0
42863 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
42864 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
42865 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
42866 && num_imm64_operand > 0
42867 && ((window_list->num_imm_64 > 0
42868 && window_list->num_insn >= 2)
42869 || window_list->num_insn >= 3)))
42870 return BIG;
42872 return 1;
42875 if ((group == disp_load_store
42876 && (window_list->num_loads >= MAX_LOAD
42877 || window_list->num_stores >= MAX_STORE))
42878 || ((group == disp_load
42879 || group == disp_prefetch)
42880 && window_list->num_loads >= MAX_LOAD)
42881 || (group == disp_store
42882 && window_list->num_stores >= MAX_STORE))
42883 return BIG;
42885 return 1;
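/* A return value of BIG makes the caller's check
   num_restrict > num_allowable_groups[group] reject the insn, since the
   load, store and immediate groups all have small allowances; a return
   value of 1 lets it through.  For example, a load arriving when the
   window already holds MAX_LOAD loads yields BIG, which exceeds the
   disp_load allowance of 2.  */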
42888 /* This function returns true if insn satisfies dispatch rules on the
42889 last window scheduled. */
42891 static bool
42892 fits_dispatch_window (rtx insn)
42894 dispatch_windows *window_list = dispatch_window_list;
42895 dispatch_windows *window_list_next = dispatch_window_list->next;
42896 unsigned int num_restrict;
42897 enum dispatch_group group = get_insn_group (insn);
42898 enum insn_path path = get_insn_path (insn);
42899 int sum;
42901 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
42902 instructions should be given the lowest priority in the
42903 scheduling process in the Haifa scheduler to make sure they will be
42904 scheduled in the same dispatch window as the reference to them. */
42905 if (group == disp_jcc || group == disp_cmp)
42906 return false;
42908 /* Check nonrestricted. */
42909 if (group == disp_no_group || group == disp_branch)
42910 return true;
42912 /* Get last dispatch window. */
42913 if (window_list_next)
42914 window_list = window_list_next;
42916 if (window_list->window_num == 1)
42918 sum = window_list->prev->window_size + window_list->window_size;
42920 if (sum == 32
42921 || (min_insn_size (insn) + sum) >= 48)
42922 /* Window 1 is full. Go for next window. */
42923 return true;
42926 num_restrict = count_num_restricted (insn, window_list);
42928 if (num_restrict > num_allowable_groups[group])
42929 return false;
42931 /* See if it fits in the first window. */
42932 if (window_list->window_num == 0)
42934 /* The first window should have only single- and double-path
42935 uops. */
42936 if (path == path_double
42937 && (window_list->num_uops + 2) > MAX_INSN)
42938 return false;
42939 else if (path != path_single)
42940 return false;
42942 return true;
42945 /* Add an instruction INSN with NUM_UOPS micro-operations to the
42946 dispatch window WINDOW_LIST. */
42948 static void
42949 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
42951 int byte_len = min_insn_size (insn);
42952 int num_insn = window_list->num_insn;
42953 int imm_size;
42954 sched_insn_info *window = window_list->window;
42955 enum dispatch_group group = get_insn_group (insn);
42956 enum insn_path path = get_insn_path (insn);
42957 int num_imm_operand;
42958 int num_imm32_operand;
42959 int num_imm64_operand;
42961 if (!window_list->violation && group != disp_cmp
42962 && !fits_dispatch_window (insn))
42963 window_list->violation = true;
42965 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
42966 &num_imm64_operand);
42968 /* Initialize window with new instruction. */
42969 window[num_insn].insn = insn;
42970 window[num_insn].byte_len = byte_len;
42971 window[num_insn].group = group;
42972 window[num_insn].path = path;
42973 window[num_insn].imm_bytes = imm_size;
42975 window_list->window_size += byte_len;
42976 window_list->num_insn = num_insn + 1;
42977 window_list->num_uops = window_list->num_uops + num_uops;
42978 window_list->imm_size += imm_size;
42979 window_list->num_imm += num_imm_operand;
42980 window_list->num_imm_32 += num_imm32_operand;
42981 window_list->num_imm_64 += num_imm64_operand;
42983 if (group == disp_store)
42984 window_list->num_stores += 1;
42985 else if (group == disp_load
42986 || group == disp_prefetch)
42987 window_list->num_loads += 1;
42988 else if (group == disp_load_store)
42990 window_list->num_stores += 1;
42991 window_list->num_loads += 1;
42995 /* Adds a scheduled instruction, INSN, to the current dispatch window.
42996 If the total bytes of instructions or the number of instructions in
42997 the window exceeds the allowable limits, it allocates a new window.
42999 static void
43000 add_to_dispatch_window (rtx insn)
43002 int byte_len;
43003 dispatch_windows *window_list;
43004 dispatch_windows *next_list;
43005 dispatch_windows *window0_list;
43006 enum insn_path path;
43007 enum dispatch_group insn_group;
43008 bool insn_fits;
43009 int num_insn;
43010 int num_uops;
43011 int window_num;
43012 int insn_num_uops;
43013 int sum;
43015 if (INSN_CODE (insn) < 0)
43016 return;
43018 byte_len = min_insn_size (insn);
43019 window_list = dispatch_window_list;
43020 next_list = window_list->next;
43021 path = get_insn_path (insn);
43022 insn_group = get_insn_group (insn);
43024 /* Get the last dispatch window. */
43025 if (next_list)
43026 window_list = dispatch_window_list->next;
43028 if (path == path_single)
43029 insn_num_uops = 1;
43030 else if (path == path_double)
43031 insn_num_uops = 2;
43032 else
43033 insn_num_uops = (int) path;
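/* Note that for path_multi the enum value itself (3) is used as the uop
   count; single- and double-path insns count as 1 and 2 uops.  */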
43035 /* If the current window is full, get a new window.
43036 Window number zero is full if MAX_INSN uops are scheduled in it.
43037 Window number one is full if window zero's bytes plus window
43038 one's bytes equal 32, or if adding the bytes of the new instruction
43039 makes the total 48 or more, or if it already has MAX_INSN
43040 instructions in it. */
43041 num_insn = window_list->num_insn;
43042 num_uops = window_list->num_uops;
43043 window_num = window_list->window_num;
43044 insn_fits = fits_dispatch_window (insn);
43046 if (num_insn >= MAX_INSN
43047 || num_uops + insn_num_uops > MAX_INSN
43048 || !(insn_fits))
43050 window_num = ~window_num & 1;
43051 window_list = allocate_next_window (window_num);
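/* The toggle ~window_num & 1 above switches between window 0 and
   window 1 whenever the current window cannot accept the insn.  */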
43054 if (window_num == 0)
43056 add_insn_window (insn, window_list, insn_num_uops);
43057 if (window_list->num_insn >= MAX_INSN
43058 && insn_group == disp_branch)
43060 process_end_window ();
43061 return;
43064 else if (window_num == 1)
43066 window0_list = window_list->prev;
43067 sum = window0_list->window_size + window_list->window_size;
43068 if (sum == 32
43069 || (byte_len + sum) >= 48)
43071 process_end_window ();
43072 window_list = dispatch_window_list;
43075 add_insn_window (insn, window_list, insn_num_uops);
43077 else
43078 gcc_unreachable ();
43080 if (is_end_basic_block (insn_group))
43082 /* End of basic block is reached; do end-of-basic-block processing. */
43083 process_end_window ();
43084 return;
43088 /* Print the dispatch window, WINDOW_NUM, to FILE. */
43090 DEBUG_FUNCTION static void
43091 debug_dispatch_window_file (FILE *file, int window_num)
43093 dispatch_windows *list;
43094 int i;
43096 if (window_num == 0)
43097 list = dispatch_window_list;
43098 else
43099 list = dispatch_window_list1;
43101 fprintf (file, "Window #%d:\n", list->window_num);
43102 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
43103 list->num_insn, list->num_uops, list->window_size);
43104 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
43105 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
43107 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
43108 list->num_stores);
43109 fprintf (file, " insn info:\n");
43111 for (i = 0; i < MAX_INSN; i++)
43113 if (!list->window[i].insn)
43114 break;
43115 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
43116 i, group_name[list->window[i].group],
43117 i, (void *)list->window[i].insn,
43118 i, list->window[i].path,
43119 i, list->window[i].byte_len,
43120 i, list->window[i].imm_bytes);
43124 /* Print to stdout a dispatch window. */
43126 DEBUG_FUNCTION void
43127 debug_dispatch_window (int window_num)
43129 debug_dispatch_window_file (stdout, window_num);
43132 /* Print INSN dispatch information to FILE. */
43134 DEBUG_FUNCTION static void
43135 debug_insn_dispatch_info_file (FILE *file, rtx insn)
43137 int byte_len;
43138 enum insn_path path;
43139 enum dispatch_group group;
43140 int imm_size;
43141 int num_imm_operand;
43142 int num_imm32_operand;
43143 int num_imm64_operand;
43145 if (INSN_CODE (insn) < 0)
43146 return;
43148 byte_len = min_insn_size (insn);
43149 path = get_insn_path (insn);
43150 group = get_insn_group (insn);
43151 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
43152 &num_imm64_operand);
43154 fprintf (file, " insn info:\n");
43155 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
43156 group_name[group], path, byte_len);
43157 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
43158 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
43161 /* Print to STDERR the status of the ready list with respect to
43162 dispatch windows. */
43164 DEBUG_FUNCTION void
43165 debug_ready_dispatch (void)
43167 int i;
43168 int no_ready = number_in_ready ();
43170 fprintf (stdout, "Number of ready: %d\n", no_ready);
43172 for (i = 0; i < no_ready; i++)
43173 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
43176 /* This routine is the driver of the dispatch scheduler. */
43178 static void
43179 do_dispatch (rtx insn, int mode)
43181 if (mode == DISPATCH_INIT)
43182 init_dispatch_sched ();
43183 else if (mode == ADD_TO_DISPATCH_WINDOW)
43184 add_to_dispatch_window (insn);
43187 /* Return TRUE if Dispatch Scheduling is supported. */
43189 static bool
43190 has_dispatch (rtx insn, int action)
43192 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3)
43193 && flag_dispatch_scheduler)
43194 switch (action)
43196 default:
43197 return false;
43199 case IS_DISPATCH_ON:
43200 return true;
43201 break;
43203 case IS_CMP:
43204 return is_cmp (insn);
43206 case DISPATCH_VIOLATION:
43207 return dispatch_violation ();
43209 case FITS_DISPATCH_WINDOW:
43210 return fits_dispatch_window (insn);
43213 return false;
43216 /* Implementation of reassociation_width target hook used by
43217 reassoc phase to identify parallelism level in reassociated
43218 tree. The statement's tree_code is passed in OPC. The arguments'
43219 type is passed in MODE.
43221 Currently parallel reassociation is enabled for Atom
43222 processors only and we set reassociation width to be 2
43223 because Atom may issue up to 2 instructions per cycle.
43225 Return value should be fixed if parallel reassociation is
43226 enabled for other processors. */
43228 static int
43229 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
43230 enum machine_mode mode)
43232 int res = 1;
43234 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
43235 res = 2;
43236 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
43237 res = 2;
43239 return res;
43242 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
43243 place emms and femms instructions. */
43245 static enum machine_mode
43246 ix86_preferred_simd_mode (enum machine_mode mode)
43248 if (!TARGET_SSE)
43249 return word_mode;
43251 switch (mode)
43253 case QImode:
43254 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
43255 case HImode:
43256 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
43257 case SImode:
43258 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
43259 case DImode:
43260 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
43262 case SFmode:
43263 if (TARGET_AVX && !TARGET_PREFER_AVX128)
43264 return V8SFmode;
43265 else
43266 return V4SFmode;
43268 case DFmode:
43269 if (!TARGET_VECTORIZE_DOUBLE)
43270 return word_mode;
43271 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
43272 return V4DFmode;
43273 else if (TARGET_SSE2)
43274 return V2DFmode;
43275 /* FALLTHRU */
43277 default:
43278 return word_mode;
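/* Examples of the mapping above: with TARGET_AVX and
   !TARGET_PREFER_AVX128, SImode vectorizes as V8SImode and DFmode as
   V4DFmode; with only SSE2 (and TARGET_VECTORIZE_DOUBLE), SImode yields
   V4SImode and DFmode V2DFmode; without SSE, word_mode is returned.  */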
43282 /* If AVX is enabled then try vectorizing with both 256-bit and 128-bit
43283 vectors. */
43285 static unsigned int
43286 ix86_autovectorize_vector_sizes (void)
43288 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
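/* The returned value is a bit mask of vector sizes in bytes: 32 | 16
   asks the vectorizer to try both 256-bit and 128-bit vectors, while 0
   means only the size implied by the preferred SIMD mode is used.  */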
43293 /* Return class of registers which could be used for pseudo of MODE
43294 and of class RCLASS for spilling instead of memory. Return NO_REGS
43295 if it is not possible or non-profitable. */
43296 static reg_class_t
43297 ix86_spill_class (reg_class_t rclass, enum machine_mode mode)
43299 if (TARGET_SSE && TARGET_GENERAL_REGS_SSE_SPILL && ! TARGET_MMX
43300 && (mode == SImode || (TARGET_64BIT && mode == DImode))
43301 && INTEGER_CLASS_P (rclass))
43302 return ALL_SSE_REGS;
43303 return NO_REGS;
43306 /* Implement targetm.vectorize.init_cost. */
43308 static void *
43309 ix86_init_cost (struct loop *loop_info ATTRIBUTE_UNUSED)
43311 unsigned *cost = XNEWVEC (unsigned, 3);
43312 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
43313 return cost;
43316 /* Implement targetm.vectorize.add_stmt_cost. */
43318 static unsigned
43319 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
43320 struct _stmt_vec_info *stmt_info, int misalign,
43321 enum vect_cost_model_location where)
43323 unsigned *cost = (unsigned *) data;
43324 unsigned retval = 0;
43326 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
43327 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
43329 /* Statements in an inner loop relative to the loop being
43330 vectorized are weighted more heavily. The value here is
43331 arbitrary and could potentially be improved with analysis. */
43332 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
43333 count *= 50; /* FIXME. */
43335 retval = (unsigned) (count * stmt_cost);
43336 cost[where] += retval;
43338 return retval;
43341 /* Implement targetm.vectorize.finish_cost. */
43343 static void
43344 ix86_finish_cost (void *data, unsigned *prologue_cost,
43345 unsigned *body_cost, unsigned *epilogue_cost)
43347 unsigned *cost = (unsigned *) data;
43348 *prologue_cost = cost[vect_prologue];
43349 *body_cost = cost[vect_body];
43350 *epilogue_cost = cost[vect_epilogue];
43353 /* Implement targetm.vectorize.destroy_cost_data. */
43355 static void
43356 ix86_destroy_cost_data (void *data)
43358 free (data);
43361 /* Validate target specific memory model bits in VAL. */
43363 static unsigned HOST_WIDE_INT
43364 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
43366 unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
43367 bool strong;
43369 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
43370 |MEMMODEL_MASK)
43371 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
43373 warning (OPT_Winvalid_memory_model,
43374 "Unknown architecture specific memory model");
43375 return MEMMODEL_SEQ_CST;
43377 strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
43378 if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
43380 warning (OPT_Winvalid_memory_model,
43381 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
43382 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
43384 if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
43386 warning (OPT_Winvalid_memory_model,
43387 "HLE_RELEASE not used with RELEASE or stronger memory model");
43388 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
43390 return val;
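/* For instance, MEMMODEL_ACQUIRE | IX86_HLE_ACQUIRE passes the checks
   above unchanged, whereas MEMMODEL_RELAXED | IX86_HLE_ACQUIRE draws a
   warning and is rewritten to MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE.  */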
43393 /* Initialize the GCC target structure. */
43394 #undef TARGET_RETURN_IN_MEMORY
43395 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
43397 #undef TARGET_LEGITIMIZE_ADDRESS
43398 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
43400 #undef TARGET_ATTRIBUTE_TABLE
43401 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
43402 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
43403 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
43404 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
43405 # undef TARGET_MERGE_DECL_ATTRIBUTES
43406 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
43407 #endif
43409 #undef TARGET_COMP_TYPE_ATTRIBUTES
43410 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
43412 #undef TARGET_INIT_BUILTINS
43413 #define TARGET_INIT_BUILTINS ix86_init_builtins
43414 #undef TARGET_BUILTIN_DECL
43415 #define TARGET_BUILTIN_DECL ix86_builtin_decl
43416 #undef TARGET_EXPAND_BUILTIN
43417 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
43419 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
43420 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
43421 ix86_builtin_vectorized_function
43423 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
43424 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
43426 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
43427 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
43429 #undef TARGET_VECTORIZE_BUILTIN_GATHER
43430 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
43432 #undef TARGET_BUILTIN_RECIPROCAL
43433 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
43435 #undef TARGET_ASM_FUNCTION_EPILOGUE
43436 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
43438 #undef TARGET_ENCODE_SECTION_INFO
43439 #ifndef SUBTARGET_ENCODE_SECTION_INFO
43440 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
43441 #else
43442 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
43443 #endif
43445 #undef TARGET_ASM_OPEN_PAREN
43446 #define TARGET_ASM_OPEN_PAREN ""
43447 #undef TARGET_ASM_CLOSE_PAREN
43448 #define TARGET_ASM_CLOSE_PAREN ""
43450 #undef TARGET_ASM_BYTE_OP
43451 #define TARGET_ASM_BYTE_OP ASM_BYTE
43453 #undef TARGET_ASM_ALIGNED_HI_OP
43454 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
43455 #undef TARGET_ASM_ALIGNED_SI_OP
43456 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
43457 #ifdef ASM_QUAD
43458 #undef TARGET_ASM_ALIGNED_DI_OP
43459 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
43460 #endif
43462 #undef TARGET_PROFILE_BEFORE_PROLOGUE
43463 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
43465 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
43466 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
43468 #undef TARGET_ASM_UNALIGNED_HI_OP
43469 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
43470 #undef TARGET_ASM_UNALIGNED_SI_OP
43471 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
43472 #undef TARGET_ASM_UNALIGNED_DI_OP
43473 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
43475 #undef TARGET_PRINT_OPERAND
43476 #define TARGET_PRINT_OPERAND ix86_print_operand
43477 #undef TARGET_PRINT_OPERAND_ADDRESS
43478 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
43479 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
43480 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
43481 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
43482 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
43484 #undef TARGET_SCHED_INIT_GLOBAL
43485 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
43486 #undef TARGET_SCHED_ADJUST_COST
43487 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
43488 #undef TARGET_SCHED_ISSUE_RATE
43489 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
43490 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
43491 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
43492 ia32_multipass_dfa_lookahead
43494 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
43495 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
43497 #undef TARGET_MEMMODEL_CHECK
43498 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
43500 #ifdef HAVE_AS_TLS
43501 #undef TARGET_HAVE_TLS
43502 #define TARGET_HAVE_TLS true
43503 #endif
43504 #undef TARGET_CANNOT_FORCE_CONST_MEM
43505 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
43506 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
43507 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
43509 #undef TARGET_DELEGITIMIZE_ADDRESS
43510 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
43512 #undef TARGET_MS_BITFIELD_LAYOUT_P
43513 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
43515 #if TARGET_MACHO
43516 #undef TARGET_BINDS_LOCAL_P
43517 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
43518 #endif
43519 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
43520 #undef TARGET_BINDS_LOCAL_P
43521 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
43522 #endif
43524 #undef TARGET_ASM_OUTPUT_MI_THUNK
43525 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
43526 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
43527 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
43529 #undef TARGET_ASM_FILE_START
43530 #define TARGET_ASM_FILE_START x86_file_start
43532 #undef TARGET_OPTION_OVERRIDE
43533 #define TARGET_OPTION_OVERRIDE ix86_option_override
43535 #undef TARGET_REGISTER_MOVE_COST
43536 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
43537 #undef TARGET_MEMORY_MOVE_COST
43538 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
43539 #undef TARGET_RTX_COSTS
43540 #define TARGET_RTX_COSTS ix86_rtx_costs
43541 #undef TARGET_ADDRESS_COST
43542 #define TARGET_ADDRESS_COST ix86_address_cost
43544 #undef TARGET_FIXED_CONDITION_CODE_REGS
43545 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
43546 #undef TARGET_CC_MODES_COMPATIBLE
43547 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
43549 #undef TARGET_MACHINE_DEPENDENT_REORG
43550 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
43552 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
43553 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
43555 #undef TARGET_BUILD_BUILTIN_VA_LIST
43556 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
43558 #undef TARGET_FOLD_BUILTIN
43559 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
43561 #undef TARGET_COMPARE_VERSION_PRIORITY
43562 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
43564 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
43565 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
43566 ix86_generate_version_dispatcher_body
43568 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
43569 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
43570 ix86_get_function_versions_dispatcher
43572 #undef TARGET_ENUM_VA_LIST_P
43573 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
43575 #undef TARGET_FN_ABI_VA_LIST
43576 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
43578 #undef TARGET_CANONICAL_VA_LIST_TYPE
43579 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
43581 #undef TARGET_EXPAND_BUILTIN_VA_START
43582 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
43584 #undef TARGET_MD_ASM_CLOBBERS
43585 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
43587 #undef TARGET_PROMOTE_PROTOTYPES
43588 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
43589 #undef TARGET_STRUCT_VALUE_RTX
43590 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
43591 #undef TARGET_SETUP_INCOMING_VARARGS
43592 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
43593 #undef TARGET_MUST_PASS_IN_STACK
43594 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
43595 #undef TARGET_FUNCTION_ARG_ADVANCE
43596 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
43597 #undef TARGET_FUNCTION_ARG
43598 #define TARGET_FUNCTION_ARG ix86_function_arg
43599 #undef TARGET_FUNCTION_ARG_BOUNDARY
43600 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
43601 #undef TARGET_PASS_BY_REFERENCE
43602 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
43603 #undef TARGET_INTERNAL_ARG_POINTER
43604 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
43605 #undef TARGET_UPDATE_STACK_BOUNDARY
43606 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
43607 #undef TARGET_GET_DRAP_RTX
43608 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
43609 #undef TARGET_STRICT_ARGUMENT_NAMING
43610 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
43611 #undef TARGET_STATIC_CHAIN
43612 #define TARGET_STATIC_CHAIN ix86_static_chain
43613 #undef TARGET_TRAMPOLINE_INIT
43614 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
43615 #undef TARGET_RETURN_POPS_ARGS
43616 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
43618 #undef TARGET_LEGITIMATE_COMBINED_INSN
43619 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
43621 #undef TARGET_ASAN_SHADOW_OFFSET
43622 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
43624 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
43625 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
43627 #undef TARGET_SCALAR_MODE_SUPPORTED_P
43628 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
43630 #undef TARGET_VECTOR_MODE_SUPPORTED_P
43631 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
43633 #undef TARGET_C_MODE_FOR_SUFFIX
43634 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
43636 #ifdef HAVE_AS_TLS
43637 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
43638 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
43639 #endif
43641 #ifdef SUBTARGET_INSERT_ATTRIBUTES
43642 #undef TARGET_INSERT_ATTRIBUTES
43643 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
43644 #endif
43646 #undef TARGET_MANGLE_TYPE
43647 #define TARGET_MANGLE_TYPE ix86_mangle_type
43649 #if !TARGET_MACHO
43650 #undef TARGET_STACK_PROTECT_FAIL
43651 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
43652 #endif
43654 #undef TARGET_FUNCTION_VALUE
43655 #define TARGET_FUNCTION_VALUE ix86_function_value
43657 #undef TARGET_FUNCTION_VALUE_REGNO_P
43658 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
43660 #undef TARGET_PROMOTE_FUNCTION_MODE
43661 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
43663 #undef TARGET_MEMBER_TYPE_FORCES_BLK
43664 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
43666 #undef TARGET_INSTANTIATE_DECLS
43667 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
43669 #undef TARGET_SECONDARY_RELOAD
43670 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
43672 #undef TARGET_CLASS_MAX_NREGS
43673 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
43675 #undef TARGET_PREFERRED_RELOAD_CLASS
43676 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
43677 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
43678 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
43679 #undef TARGET_CLASS_LIKELY_SPILLED_P
43680 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
43682 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
43683 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
43684 ix86_builtin_vectorization_cost
43685 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
43686 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
43687 ix86_vectorize_vec_perm_const_ok
43688 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
43689 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
43690 ix86_preferred_simd_mode
43691 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
43692 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
43693 ix86_autovectorize_vector_sizes
43694 #undef TARGET_VECTORIZE_INIT_COST
43695 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
43696 #undef TARGET_VECTORIZE_ADD_STMT_COST
43697 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
43698 #undef TARGET_VECTORIZE_FINISH_COST
43699 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
43700 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
43701 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
43703 #undef TARGET_SET_CURRENT_FUNCTION
43704 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
43706 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
43707 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
43709 #undef TARGET_OPTION_SAVE
43710 #define TARGET_OPTION_SAVE ix86_function_specific_save
43712 #undef TARGET_OPTION_RESTORE
43713 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
43715 #undef TARGET_OPTION_PRINT
43716 #define TARGET_OPTION_PRINT ix86_function_specific_print
43718 #undef TARGET_OPTION_FUNCTION_VERSIONS
43719 #define TARGET_OPTION_FUNCTION_VERSIONS ix86_function_versions
43721 #undef TARGET_CAN_INLINE_P
43722 #define TARGET_CAN_INLINE_P ix86_can_inline_p
43724 #undef TARGET_EXPAND_TO_RTL_HOOK
43725 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
43727 #undef TARGET_LEGITIMATE_ADDRESS_P
43728 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
43730 #undef TARGET_LRA_P
43731 #define TARGET_LRA_P hook_bool_void_true
43733 #undef TARGET_REGISTER_PRIORITY
43734 #define TARGET_REGISTER_PRIORITY ix86_register_priority
43736 #undef TARGET_REGISTER_USAGE_LEVELING_P
43737 #define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true
43739 #undef TARGET_LEGITIMATE_CONSTANT_P
43740 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
43742 #undef TARGET_FRAME_POINTER_REQUIRED
43743 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
43745 #undef TARGET_CAN_ELIMINATE
43746 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
43748 #undef TARGET_EXTRA_LIVE_ON_ENTRY
43749 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
43751 #undef TARGET_ASM_CODE_END
43752 #define TARGET_ASM_CODE_END ix86_code_end
43754 #undef TARGET_CONDITIONAL_REGISTER_USAGE
43755 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
43757 #if TARGET_MACHO
43758 #undef TARGET_INIT_LIBFUNCS
43759 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
43760 #endif
43762 #undef TARGET_SPILL_CLASS
43763 #define TARGET_SPILL_CLASS ix86_spill_class
43765 struct gcc_target targetm = TARGET_INITIALIZER;
43767 #include "gt-i386.h"