gcc/config/aarch64/aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2016 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #define INCLUDE_STRING
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "cfgloop.h"
32 #include "df.h"
33 #include "tm_p.h"
34 #include "stringpool.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "diagnostic.h"
40 #include "insn-attr.h"
41 #include "alias.h"
42 #include "fold-const.h"
43 #include "stor-layout.h"
44 #include "calls.h"
45 #include "varasm.h"
46 #include "output.h"
47 #include "flags.h"
48 #include "explow.h"
49 #include "expr.h"
50 #include "reload.h"
51 #include "langhooks.h"
52 #include "opts.h"
53 #include "params.h"
54 #include "gimplify.h"
55 #include "dwarf2.h"
56 #include "gimple-iterator.h"
57 #include "tree-vectorizer.h"
58 #include "aarch64-cost-tables.h"
59 #include "dumpfile.h"
60 #include "builtins.h"
61 #include "rtl-iter.h"
62 #include "tm-constrs.h"
63 #include "sched-int.h"
64 #include "cortex-a57-fma-steering.h"
65 #include "target-globals.h"
66 #include "common/common-target.h"
68 /* This file should be included last. */
69 #include "target-def.h"
71 /* Defined for convenience. */
72 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
74 /* Classifies an address.
76 ADDRESS_REG_IMM
77 A simple base register plus immediate offset.
79 ADDRESS_REG_WB
80 A base register indexed by immediate offset with writeback.
82 ADDRESS_REG_REG
83 A base register indexed by (optionally scaled) register.
85 ADDRESS_REG_UXTW
86 A base register indexed by (optionally scaled) zero-extended register.
88 ADDRESS_REG_SXTW
89 A base register indexed by (optionally scaled) sign-extended register.
91 ADDRESS_LO_SUM
92 A LO_SUM rtx with a base register and "LO12" symbol relocation.
94 ADDRESS_SYMBOLIC:
95 A constant symbolic address, in pc-relative literal pool. */
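/* For illustration only, typical (not exhaustive) assembly forms that fall
   into each of these classes are:

     ADDRESS_REG_IMM      ldr  x0, [x1, #16]
     ADDRESS_REG_WB       ldr  x0, [x1, #16]!    or    ldr  x0, [x1], #16
     ADDRESS_REG_REG      ldr  x0, [x1, x2, lsl #3]
     ADDRESS_REG_UXTW     ldr  x0, [x1, w2, uxtw #3]
     ADDRESS_REG_SXTW     ldr  x0, [x1, w2, sxtw #3]
     ADDRESS_LO_SUM       ldr  x0, [x1, #:lo12:sym]
     ADDRESS_SYMBOLIC     ldr  x0, .Lliteral_pool_entry

   Register and label names are purely illustrative.  */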
97 enum aarch64_address_type {
98 ADDRESS_REG_IMM,
99 ADDRESS_REG_WB,
100 ADDRESS_REG_REG,
101 ADDRESS_REG_UXTW,
102 ADDRESS_REG_SXTW,
103 ADDRESS_LO_SUM,
104 ADDRESS_SYMBOLIC
105 };
107 struct aarch64_address_info {
108 enum aarch64_address_type type;
109 rtx base;
110 rtx offset;
111 int shift;
112 enum aarch64_symbol_type symbol_type;
113 };
115 struct simd_immediate_info
116 {
117 rtx value;
118 int shift;
119 int element_width;
120 bool mvn;
121 bool msl;
122 };
124 /* The current code model. */
125 enum aarch64_code_model aarch64_cmodel;
127 #ifdef HAVE_AS_TLS
128 #undef TARGET_HAVE_TLS
129 #define TARGET_HAVE_TLS 1
130 #endif
132 static bool aarch64_composite_type_p (const_tree, machine_mode);
133 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
134 const_tree,
135 machine_mode *, int *,
136 bool *);
137 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
138 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
139 static void aarch64_override_options_after_change (void);
140 static bool aarch64_vector_mode_supported_p (machine_mode);
141 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
142 const unsigned char *sel);
143 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
145 /* Major revision number of the ARM Architecture implemented by the target. */
146 unsigned aarch64_architecture_version;
148 /* The processor for which instructions should be scheduled. */
149 enum aarch64_processor aarch64_tune = cortexa53;
151 /* Mask to specify which instruction scheduling options should be used. */
152 unsigned long aarch64_tune_flags = 0;
154 /* Global flag for PC relative loads. */
155 bool aarch64_pcrelative_literal_loads;
157 /* Support for command line parsing of boolean flags in the tuning
158 structures. */
159 struct aarch64_flag_desc
160 {
161 const char* name;
162 unsigned int flag;
163 };
165 #define AARCH64_FUSION_PAIR(name, internal_name) \
166 { name, AARCH64_FUSE_##internal_name },
167 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
168 {
169 { "none", AARCH64_FUSE_NOTHING },
170 #include "aarch64-fusion-pairs.def"
171 { "all", AARCH64_FUSE_ALL },
172 { NULL, AARCH64_FUSE_NOTHING }
173 };
174 #undef AARCH64_FUSION_PAIR
176 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
177 { name, AARCH64_EXTRA_TUNE_##internal_name },
178 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
179 {
180 { "none", AARCH64_EXTRA_TUNE_NONE },
181 #include "aarch64-tuning-flags.def"
182 { "all", AARCH64_EXTRA_TUNE_ALL },
183 { NULL, AARCH64_EXTRA_TUNE_NONE }
184 };
185 #undef AARCH64_EXTRA_TUNING_OPTION
187 /* Tuning parameters. */
189 static const struct cpu_addrcost_table generic_addrcost_table =
192 0, /* hi */
193 0, /* si */
194 0, /* di */
195 0, /* ti */
197 0, /* pre_modify */
198 0, /* post_modify */
199 0, /* register_offset */
200 0, /* register_sextend */
201 0, /* register_zextend */
202 0 /* imm_offset */
205 static const struct cpu_addrcost_table cortexa57_addrcost_table =
208 1, /* hi */
209 0, /* si */
210 0, /* di */
211 1, /* ti */
213 0, /* pre_modify */
214 0, /* post_modify */
215 0, /* register_offset */
216 0, /* register_sextend */
217 0, /* register_zextend */
218 0, /* imm_offset */
221 static const struct cpu_addrcost_table exynosm1_addrcost_table =
224 0, /* hi */
225 0, /* si */
226 0, /* di */
227 2, /* ti */
229 0, /* pre_modify */
230 0, /* post_modify */
231 1, /* register_offset */
232 1, /* register_sextend */
233 2, /* register_zextend */
234 0, /* imm_offset */
237 static const struct cpu_addrcost_table xgene1_addrcost_table =
240 1, /* hi */
241 0, /* si */
242 0, /* di */
243 1, /* ti */
245 1, /* pre_modify */
246 0, /* post_modify */
247 0, /* register_offset */
248 1, /* register_sextend */
249 1, /* register_zextend */
250 0, /* imm_offset */
253 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
256 1, /* hi */
257 0, /* si */
258 0, /* di */
259 1, /* ti */
261 0, /* pre_modify */
262 0, /* post_modify */
263 0, /* register_offset */
264 0, /* register_sextend */
265 0, /* register_zextend */
266 0 /* imm_offset */
269 static const struct cpu_addrcost_table vulcan_addrcost_table =
272 0, /* hi */
273 0, /* si */
274 0, /* di */
275 2, /* ti */
277 0, /* pre_modify */
278 0, /* post_modify */
279 2, /* register_offset */
280 3, /* register_sextend */
281 3, /* register_zextend */
282 0, /* imm_offset */
285 static const struct cpu_regmove_cost generic_regmove_cost =
287 1, /* GP2GP */
288 /* Avoid the use of slow int<->fp moves for spilling by setting
289 their cost higher than memmov_cost. */
290 5, /* GP2FP */
291 5, /* FP2GP */
292 2 /* FP2FP */
295 static const struct cpu_regmove_cost cortexa57_regmove_cost =
297 1, /* GP2GP */
298 /* Avoid the use of slow int<->fp moves for spilling by setting
299 their cost higher than memmov_cost. */
300 5, /* GP2FP */
301 5, /* FP2GP */
302 2 /* FP2FP */
305 static const struct cpu_regmove_cost cortexa53_regmove_cost =
307 1, /* GP2GP */
308 /* Avoid the use of slow int<->fp moves for spilling by setting
309 their cost higher than memmov_cost. */
310 5, /* GP2FP */
311 5, /* FP2GP */
312 2 /* FP2FP */
315 static const struct cpu_regmove_cost exynosm1_regmove_cost =
317 1, /* GP2GP */
318 /* Avoid the use of slow int<->fp moves for spilling by setting
319 their cost higher than memmov_cost (actual, 4 and 9). */
320 9, /* GP2FP */
321 9, /* FP2GP */
322 1 /* FP2FP */
325 static const struct cpu_regmove_cost thunderx_regmove_cost =
327 2, /* GP2GP */
328 2, /* GP2FP */
329 6, /* FP2GP */
330 4 /* FP2FP */
333 static const struct cpu_regmove_cost xgene1_regmove_cost =
335 1, /* GP2GP */
336 /* Avoid the use of slow int<->fp moves for spilling by setting
337 their cost higher than memmov_cost. */
338 8, /* GP2FP */
339 8, /* FP2GP */
340 2 /* FP2FP */
343 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
345 2, /* GP2GP */
346 /* Avoid the use of int<->fp moves for spilling. */
347 6, /* GP2FP */
348 6, /* FP2GP */
349 4 /* FP2FP */
352 static const struct cpu_regmove_cost vulcan_regmove_cost =
354 1, /* GP2GP */
355 /* Avoid the use of int<->fp moves for spilling. */
356 8, /* GP2FP */
357 8, /* FP2GP */
358 4 /* FP2FP */
361 /* Generic costs for vector insn classes. */
362 static const struct cpu_vector_cost generic_vector_cost =
364 1, /* scalar_stmt_cost */
365 1, /* scalar_load_cost */
366 1, /* scalar_store_cost */
367 1, /* vec_stmt_cost */
368 2, /* vec_permute_cost */
369 1, /* vec_to_scalar_cost */
370 1, /* scalar_to_vec_cost */
371 1, /* vec_align_load_cost */
372 1, /* vec_unalign_load_cost */
373 1, /* vec_unalign_store_cost */
374 1, /* vec_store_cost */
375 3, /* cond_taken_branch_cost */
376 1 /* cond_not_taken_branch_cost */
379 /* ThunderX costs for vector insn classes. */
380 static const struct cpu_vector_cost thunderx_vector_cost =
382 1, /* scalar_stmt_cost */
383 3, /* scalar_load_cost */
384 1, /* scalar_store_cost */
385 4, /* vec_stmt_cost */
386 4, /* vec_permute_cost */
387 2, /* vec_to_scalar_cost */
388 2, /* scalar_to_vec_cost */
389 3, /* vec_align_load_cost */
390 10, /* vec_unalign_load_cost */
391 10, /* vec_unalign_store_cost */
392 1, /* vec_store_cost */
393 3, /* cond_taken_branch_cost */
394 3 /* cond_not_taken_branch_cost */
397 /* Generic costs for vector insn classes. */
398 static const struct cpu_vector_cost cortexa57_vector_cost =
400 1, /* scalar_stmt_cost */
401 4, /* scalar_load_cost */
402 1, /* scalar_store_cost */
403 3, /* vec_stmt_cost */
404 3, /* vec_permute_cost */
405 8, /* vec_to_scalar_cost */
406 8, /* scalar_to_vec_cost */
407 5, /* vec_align_load_cost */
408 5, /* vec_unalign_load_cost */
409 1, /* vec_unalign_store_cost */
410 1, /* vec_store_cost */
411 1, /* cond_taken_branch_cost */
412 1 /* cond_not_taken_branch_cost */
415 static const struct cpu_vector_cost exynosm1_vector_cost =
417 1, /* scalar_stmt_cost */
418 5, /* scalar_load_cost */
419 1, /* scalar_store_cost */
420 3, /* vec_stmt_cost */
421 3, /* vec_permute_cost */
422 3, /* vec_to_scalar_cost */
423 3, /* scalar_to_vec_cost */
424 5, /* vec_align_load_cost */
425 5, /* vec_unalign_load_cost */
426 1, /* vec_unalign_store_cost */
427 1, /* vec_store_cost */
428 1, /* cond_taken_branch_cost */
429 1 /* cond_not_taken_branch_cost */
432 /* Generic costs for vector insn classes. */
433 static const struct cpu_vector_cost xgene1_vector_cost =
435 1, /* scalar_stmt_cost */
436 5, /* scalar_load_cost */
437 1, /* scalar_store_cost */
438 2, /* vec_stmt_cost */
439 2, /* vec_permute_cost */
440 4, /* vec_to_scalar_cost */
441 4, /* scalar_to_vec_cost */
442 10, /* vec_align_load_cost */
443 10, /* vec_unalign_load_cost */
444 2, /* vec_unalign_store_cost */
445 2, /* vec_store_cost */
446 2, /* cond_taken_branch_cost */
447 1 /* cond_not_taken_branch_cost */
450 /* Costs for vector insn classes for Vulcan. */
451 static const struct cpu_vector_cost vulcan_vector_cost =
453 6, /* scalar_stmt_cost */
454 4, /* scalar_load_cost */
455 1, /* scalar_store_cost */
456 6, /* vec_stmt_cost */
457 3, /* vec_permute_cost */
458 6, /* vec_to_scalar_cost */
459 5, /* scalar_to_vec_cost */
460 8, /* vec_align_load_cost */
461 8, /* vec_unalign_load_cost */
462 4, /* vec_unalign_store_cost */
463 4, /* vec_store_cost */
464 2, /* cond_taken_branch_cost */
465 1 /* cond_not_taken_branch_cost */
468 /* Generic costs for branch instructions. */
469 static const struct cpu_branch_cost generic_branch_cost =
471 2, /* Predictable. */
472 2 /* Unpredictable. */
475 /* Branch costs for Cortex-A57. */
476 static const struct cpu_branch_cost cortexa57_branch_cost =
478 1, /* Predictable. */
479 3 /* Unpredictable. */
482 /* Branch costs for Vulcan. */
483 static const struct cpu_branch_cost vulcan_branch_cost =
485 1, /* Predictable. */
486 3 /* Unpredictable. */
489 /* Generic approximation modes. */
490 static const cpu_approx_modes generic_approx_modes =
492 AARCH64_APPROX_NONE, /* division */
493 AARCH64_APPROX_NONE, /* sqrt */
494 AARCH64_APPROX_NONE /* recip_sqrt */
497 /* Approximation modes for Exynos M1. */
498 static const cpu_approx_modes exynosm1_approx_modes =
500 AARCH64_APPROX_NONE, /* division */
501 AARCH64_APPROX_ALL, /* sqrt */
502 AARCH64_APPROX_ALL /* recip_sqrt */
505 /* Approximation modes for X-Gene 1. */
506 static const cpu_approx_modes xgene1_approx_modes =
508 AARCH64_APPROX_NONE, /* division */
509 AARCH64_APPROX_NONE, /* sqrt */
510 AARCH64_APPROX_ALL /* recip_sqrt */
513 static const struct tune_params generic_tunings =
515 &cortexa57_extra_costs,
516 &generic_addrcost_table,
517 &generic_regmove_cost,
518 &generic_vector_cost,
519 &generic_branch_cost,
520 &generic_approx_modes,
521 4, /* memmov_cost */
522 2, /* issue_rate */
523 AARCH64_FUSE_NOTHING, /* fusible_ops */
524 8, /* function_align. */
525 8, /* jump_align. */
526 4, /* loop_align. */
527 2, /* int_reassoc_width. */
528 4, /* fp_reassoc_width. */
529 1, /* vec_reassoc_width. */
530 2, /* min_div_recip_mul_sf. */
531 2, /* min_div_recip_mul_df. */
532 0, /* max_case_values. */
533 0, /* cache_line_size. */
534 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
535 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
538 static const struct tune_params cortexa35_tunings =
540 &cortexa53_extra_costs,
541 &generic_addrcost_table,
542 &cortexa53_regmove_cost,
543 &generic_vector_cost,
544 &cortexa57_branch_cost,
545 &generic_approx_modes,
546 4, /* memmov_cost */
547 1, /* issue_rate */
548 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
549 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
550 16, /* function_align. */
551 8, /* jump_align. */
552 8, /* loop_align. */
553 2, /* int_reassoc_width. */
554 4, /* fp_reassoc_width. */
555 1, /* vec_reassoc_width. */
556 2, /* min_div_recip_mul_sf. */
557 2, /* min_div_recip_mul_df. */
558 0, /* max_case_values. */
559 0, /* cache_line_size. */
560 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
561 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
564 static const struct tune_params cortexa53_tunings =
566 &cortexa53_extra_costs,
567 &generic_addrcost_table,
568 &cortexa53_regmove_cost,
569 &generic_vector_cost,
570 &cortexa57_branch_cost,
571 &generic_approx_modes,
572 4, /* memmov_cost */
573 2, /* issue_rate */
574 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
575 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
576 16, /* function_align. */
577 8, /* jump_align. */
578 8, /* loop_align. */
579 2, /* int_reassoc_width. */
580 4, /* fp_reassoc_width. */
581 1, /* vec_reassoc_width. */
582 2, /* min_div_recip_mul_sf. */
583 2, /* min_div_recip_mul_df. */
584 0, /* max_case_values. */
585 0, /* cache_line_size. */
586 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
587 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
590 static const struct tune_params cortexa57_tunings =
592 &cortexa57_extra_costs,
593 &cortexa57_addrcost_table,
594 &cortexa57_regmove_cost,
595 &cortexa57_vector_cost,
596 &cortexa57_branch_cost,
597 &generic_approx_modes,
598 4, /* memmov_cost */
599 3, /* issue_rate */
600 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
601 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
602 16, /* function_align. */
603 8, /* jump_align. */
604 8, /* loop_align. */
605 2, /* int_reassoc_width. */
606 4, /* fp_reassoc_width. */
607 1, /* vec_reassoc_width. */
608 2, /* min_div_recip_mul_sf. */
609 2, /* min_div_recip_mul_df. */
610 0, /* max_case_values. */
611 0, /* cache_line_size. */
612 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
613 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags. */
616 static const struct tune_params cortexa72_tunings =
618 &cortexa57_extra_costs,
619 &cortexa57_addrcost_table,
620 &cortexa57_regmove_cost,
621 &cortexa57_vector_cost,
622 &cortexa57_branch_cost,
623 &generic_approx_modes,
624 4, /* memmov_cost */
625 3, /* issue_rate */
626 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
627 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
628 16, /* function_align. */
629 8, /* jump_align. */
630 8, /* loop_align. */
631 2, /* int_reassoc_width. */
632 4, /* fp_reassoc_width. */
633 1, /* vec_reassoc_width. */
634 2, /* min_div_recip_mul_sf. */
635 2, /* min_div_recip_mul_df. */
636 0, /* max_case_values. */
637 0, /* cache_line_size. */
638 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
639 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
642 static const struct tune_params cortexa73_tunings =
644 &cortexa57_extra_costs,
645 &cortexa57_addrcost_table,
646 &cortexa57_regmove_cost,
647 &cortexa57_vector_cost,
648 &cortexa57_branch_cost,
649 &generic_approx_modes,
650 4, /* memmov_cost. */
651 2, /* issue_rate. */
652 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
653 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
654 16, /* function_align. */
655 8, /* jump_align. */
656 8, /* loop_align. */
657 2, /* int_reassoc_width. */
658 4, /* fp_reassoc_width. */
659 1, /* vec_reassoc_width. */
660 2, /* min_div_recip_mul_sf. */
661 2, /* min_div_recip_mul_df. */
662 0, /* max_case_values. */
663 0, /* cache_line_size. */
664 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
665 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
668 static const struct tune_params exynosm1_tunings =
670 &exynosm1_extra_costs,
671 &exynosm1_addrcost_table,
672 &exynosm1_regmove_cost,
673 &exynosm1_vector_cost,
674 &generic_branch_cost,
675 &exynosm1_approx_modes,
676 4, /* memmov_cost */
677 3, /* issue_rate */
678 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
679 4, /* function_align. */
680 4, /* jump_align. */
681 4, /* loop_align. */
682 2, /* int_reassoc_width. */
683 4, /* fp_reassoc_width. */
684 1, /* vec_reassoc_width. */
685 2, /* min_div_recip_mul_sf. */
686 2, /* min_div_recip_mul_df. */
687 48, /* max_case_values. */
688 64, /* cache_line_size. */
689 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
690 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
693 static const struct tune_params thunderx_tunings =
695 &thunderx_extra_costs,
696 &generic_addrcost_table,
697 &thunderx_regmove_cost,
698 &thunderx_vector_cost,
699 &generic_branch_cost,
700 &generic_approx_modes,
701 6, /* memmov_cost */
702 2, /* issue_rate */
703 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
704 8, /* function_align. */
705 8, /* jump_align. */
706 8, /* loop_align. */
707 2, /* int_reassoc_width. */
708 4, /* fp_reassoc_width. */
709 1, /* vec_reassoc_width. */
710 2, /* min_div_recip_mul_sf. */
711 2, /* min_div_recip_mul_df. */
712 0, /* max_case_values. */
713 0, /* cache_line_size. */
714 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
715 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
718 static const struct tune_params xgene1_tunings =
720 &xgene1_extra_costs,
721 &xgene1_addrcost_table,
722 &xgene1_regmove_cost,
723 &xgene1_vector_cost,
724 &generic_branch_cost,
725 &xgene1_approx_modes,
726 6, /* memmov_cost */
727 4, /* issue_rate */
728 AARCH64_FUSE_NOTHING, /* fusible_ops */
729 16, /* function_align. */
730 8, /* jump_align. */
731 16, /* loop_align. */
732 2, /* int_reassoc_width. */
733 4, /* fp_reassoc_width. */
734 1, /* vec_reassoc_width. */
735 2, /* min_div_recip_mul_sf. */
736 2, /* min_div_recip_mul_df. */
737 0, /* max_case_values. */
738 0, /* cache_line_size. */
739 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
740 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
743 static const struct tune_params qdf24xx_tunings =
745 &qdf24xx_extra_costs,
746 &qdf24xx_addrcost_table,
747 &qdf24xx_regmove_cost,
748 &generic_vector_cost,
749 &generic_branch_cost,
750 &generic_approx_modes,
751 4, /* memmov_cost */
752 4, /* issue_rate */
753 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
754 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
755 16, /* function_align. */
756 8, /* jump_align. */
757 16, /* loop_align. */
758 2, /* int_reassoc_width. */
759 4, /* fp_reassoc_width. */
760 1, /* vec_reassoc_width. */
761 2, /* min_div_recip_mul_sf. */
762 2, /* min_div_recip_mul_df. */
763 0, /* max_case_values. */
764 64, /* cache_line_size. */
765 tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
766 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
769 static const struct tune_params vulcan_tunings =
771 &vulcan_extra_costs,
772 &vulcan_addrcost_table,
773 &vulcan_regmove_cost,
774 &vulcan_vector_cost,
775 &vulcan_branch_cost,
776 &generic_approx_modes,
777 4, /* memmov_cost. */
778 4, /* issue_rate. */
779 AARCH64_FUSE_NOTHING, /* fusible_ops. */
780 16, /* function_align. */
781 8, /* jump_align. */
782 16, /* loop_align. */
783 3, /* int_reassoc_width. */
784 2, /* fp_reassoc_width. */
785 2, /* vec_reassoc_width. */
786 2, /* min_div_recip_mul_sf. */
787 2, /* min_div_recip_mul_df. */
788 0, /* max_case_values. */
789 64, /* cache_line_size. */
790 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
791 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
794 /* Support for fine-grained override of the tuning structures. */
795 struct aarch64_tuning_override_function
797 const char* name;
798 void (*parse_override)(const char*, struct tune_params*);
801 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
802 static void aarch64_parse_tune_string (const char*, struct tune_params*);
804 static const struct aarch64_tuning_override_function
805 aarch64_tuning_override_functions[] =
807 { "fuse", aarch64_parse_fuse_string },
808 { "tune", aarch64_parse_tune_string },
809 { NULL, NULL }
812 /* A processor implementing AArch64. */
813 struct processor
815 const char *const name;
816 enum aarch64_processor ident;
817 enum aarch64_processor sched_core;
818 enum aarch64_arch arch;
819 unsigned architecture_version;
820 const unsigned long flags;
821 const struct tune_params *const tune;
824 /* Architectures implementing AArch64. */
825 static const struct processor all_architectures[] =
827 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
828 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
829 #include "aarch64-arches.def"
830 #undef AARCH64_ARCH
831 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
834 /* Processor cores implementing AArch64. */
835 static const struct processor all_cores[] =
837 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
838 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
839 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
840 FLAGS, &COSTS##_tunings},
841 #include "aarch64-cores.def"
842 #undef AARCH64_CORE
843 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
844 AARCH64_FL_FOR_ARCH8, &generic_tunings},
845 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
849 /* Target specification. These are populated by the -march, -mtune, -mcpu
850 handling code or by target attributes. */
851 static const struct processor *selected_arch;
852 static const struct processor *selected_cpu;
853 static const struct processor *selected_tune;
855 /* The current tuning set. */
856 struct tune_params aarch64_tune_params = generic_tunings;
858 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
860 /* An ISA extension in the co-processor and main instruction set space. */
861 struct aarch64_option_extension
863 const char *const name;
864 const unsigned long flags_on;
865 const unsigned long flags_off;
868 typedef enum aarch64_cond_code
870 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
871 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
872 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
874 aarch64_cc;
876 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
878 /* The condition codes of the processor, and the inverse function. */
879 static const char * const aarch64_condition_codes[] =
881 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
882 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
885 /* Generate code to enable conditional branches in functions over 1 MiB. */
886 const char *
887 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
888 const char * branch_format)
890 rtx_code_label * tmp_label = gen_label_rtx ();
891 char label_buf[256];
892 char buffer[128];
893 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
894 CODE_LABEL_NUMBER (tmp_label));
895 const char *label_ptr = targetm.strip_name_encoding (label_buf);
896 rtx dest_label = operands[pos_label];
897 operands[pos_label] = tmp_label;
899 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
900 output_asm_insn (buffer, operands);
902 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
903 operands[pos_label] = dest_label;
904 output_asm_insn (buffer, operands);
905 return "";
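/* A sketch of the resulting code (label names purely illustrative): a
   conditional branch whose target lies outside the +/-1 MiB range of B.cond
   is emitted as an inverted short branch around an unconditional branch,
   the caller supplying the already-inverted mnemonic via BRANCH_FORMAT:

     b.eq  .Lfar_target      becomes      b.ne  .Lbcond42
                                          b     .Lfar_target
                                      .Lbcond42:                      */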
908 void
909 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
911 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
912 if (TARGET_GENERAL_REGS_ONLY)
913 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
914 else
915 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
918 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
919 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
920 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
921 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
922 cost (in this case the best class is the lowest cost one). Using ALL_REGS
923 irrespective of its cost results in bad allocations with many redundant
924 int<->FP moves which are expensive on various cores.
925 To avoid this we don't allow ALL_REGS as the allocno class, but force a
926 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
927 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
928 Otherwise set the allocno class depending on the mode.
929 The result of this is that it is no longer inefficient to have a higher
930 memory move cost than the register move cost.
933 static reg_class_t
934 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
935 reg_class_t best_class)
937 enum machine_mode mode;
939 if (allocno_class != ALL_REGS)
940 return allocno_class;
942 if (best_class != ALL_REGS)
943 return best_class;
945 mode = PSEUDO_REGNO_MODE (regno);
946 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
949 static unsigned int
950 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
952 if (GET_MODE_UNIT_SIZE (mode) == 4)
953 return aarch64_tune_params.min_div_recip_mul_sf;
954 return aarch64_tune_params.min_div_recip_mul_df;
957 static int
958 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
959 enum machine_mode mode)
961 if (VECTOR_MODE_P (mode))
962 return aarch64_tune_params.vec_reassoc_width;
963 if (INTEGRAL_MODE_P (mode))
964 return aarch64_tune_params.int_reassoc_width;
965 if (FLOAT_MODE_P (mode))
966 return aarch64_tune_params.fp_reassoc_width;
967 return 1;
970 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
971 unsigned
972 aarch64_dbx_register_number (unsigned regno)
974 if (GP_REGNUM_P (regno))
975 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
976 else if (regno == SP_REGNUM)
977 return AARCH64_DWARF_SP;
978 else if (FP_REGNUM_P (regno))
979 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
981 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
982 equivalent DWARF register. */
983 return DWARF_FRAME_REGISTERS;
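/* For example, under the AArch64 DWARF register numbering this maps x0-x30
   to 0-30, sp to 31 and v0-v31 to 64-95, so x5 -> 5, sp -> 31 and v3 -> 67;
   anything else (e.g. the condition flags register) gets the out-of-range
   value DWARF_FRAME_REGISTERS.  */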
986 /* Return TRUE if MODE is any of the large INT modes. */
987 static bool
988 aarch64_vect_struct_mode_p (machine_mode mode)
990 return mode == OImode || mode == CImode || mode == XImode;
993 /* Return TRUE if MODE is any of the vector modes. */
994 static bool
995 aarch64_vector_mode_p (machine_mode mode)
997 return aarch64_vector_mode_supported_p (mode)
998 || aarch64_vect_struct_mode_p (mode);
1001 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1002 static bool
1003 aarch64_array_mode_supported_p (machine_mode mode,
1004 unsigned HOST_WIDE_INT nelems)
1006 if (TARGET_SIMD
1007 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1008 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1009 && (nelems >= 2 && nelems <= 4))
1010 return true;
1012 return false;
1015 /* Implement HARD_REGNO_NREGS. */
1018 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1020 switch (aarch64_regno_regclass (regno))
1022 case FP_REGS:
1023 case FP_LO_REGS:
1024 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1025 default:
1026 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1028 gcc_unreachable ();
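/* A worked example, assuming the usual AArch64 values UNITS_PER_WORD == 8
   and UNITS_PER_VREG == 16: TImode (16 bytes) needs 2 general registers but
   only 1 FP/SIMD register, while OImode (a 32-byte two-vector structure
   mode) needs 2 FP/SIMD registers.  */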
1031 /* Implement HARD_REGNO_MODE_OK. */
1034 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1036 if (GET_MODE_CLASS (mode) == MODE_CC)
1037 return regno == CC_REGNUM;
1039 if (regno == SP_REGNUM)
1040 /* The purpose of comparing with ptr_mode is to support the
1041 global register variable associated with the stack pointer
1042 register via the syntax of asm ("wsp") in ILP32. */
1043 return mode == Pmode || mode == ptr_mode;
1045 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1046 return mode == Pmode;
1048 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1049 return 1;
1051 if (FP_REGNUM_P (regno))
1053 if (aarch64_vect_struct_mode_p (mode))
1054 return
1055 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
1056 else
1057 return 1;
1060 return 0;
1063 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1064 machine_mode
1065 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1066 machine_mode mode)
1068 /* Handle modes that fit within single registers. */
1069 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1071 if (GET_MODE_SIZE (mode) >= 4)
1072 return mode;
1073 else
1074 return SImode;
1076 /* Fall back to generic for multi-reg and very large modes. */
1077 else
1078 return choose_hard_reg_mode (regno, nregs, false);
1081 /* Return true if calls to DECL should be treated as
1082 long-calls (ie called via a register). */
1083 static bool
1084 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1086 return false;
1089 /* Return true if calls to symbol-ref SYM should be treated as
1090 long-calls (ie called via a register). */
1091 bool
1092 aarch64_is_long_call_p (rtx sym)
1094 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1097 /* Return true if calls to symbol-ref SYM should not go through
1098 plt stubs. */
1100 bool
1101 aarch64_is_noplt_call_p (rtx sym)
1103 const_tree decl = SYMBOL_REF_DECL (sym);
1105 if (flag_pic
1106 && decl
1107 && (!flag_plt
1108 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1109 && !targetm.binds_local_p (decl))
1110 return true;
1112 return false;
1115 /* Return true if the offsets to a zero/sign-extract operation
1116 represent an expression that matches an extend operation. The
1117 operands represent the parameters from
1119 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1120 bool
1121 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
1122 rtx extract_imm)
1124 HOST_WIDE_INT mult_val, extract_val;
1126 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1127 return false;
1129 mult_val = INTVAL (mult_imm);
1130 extract_val = INTVAL (extract_imm);
1132 if (extract_val > 8
1133 && extract_val < GET_MODE_BITSIZE (mode)
1134 && exact_log2 (extract_val & ~7) > 0
1135 && (extract_val & 7) <= 4
1136 && mult_val == (1 << (extract_val & 7)))
1137 return true;
1139 return false;
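/* A worked example: MULT_IMM == 4 and EXTRACT_IMM == 34 in DImode passes
   every check above (34 > 8, 34 < 64, exact_log2 (32) > 0, (34 & 7) == 2
   which is <= 4, and 4 == 1 << 2).  Extracting the low 34 bits of
   (reg * 4) is equivalent to (zero_extend:DI (reg:SI)) << 2, i.e. the
   extended-register operand form "uxtw #2", as in

     add  x0, x1, w2, uxtw #2

   (register names illustrative).  */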
1142 /* Emit an insn that's a simple single-set. Both the operands must be
1143 known to be valid. */
1144 inline static rtx
1145 emit_set_insn (rtx x, rtx y)
1147 return emit_insn (gen_rtx_SET (x, y));
1150 /* X and Y are two things to compare using CODE. Emit the compare insn and
1151 return the rtx for register 0 in the proper mode. */
1153 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1155 machine_mode mode = SELECT_CC_MODE (code, x, y);
1156 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1158 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1159 return cc_reg;
1162 /* Build the SYMBOL_REF for __tls_get_addr. */
1164 static GTY(()) rtx tls_get_addr_libfunc;
1167 aarch64_tls_get_addr (void)
1169 if (!tls_get_addr_libfunc)
1170 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1171 return tls_get_addr_libfunc;
1174 /* Return the TLS model to use for ADDR. */
1176 static enum tls_model
1177 tls_symbolic_operand_type (rtx addr)
1179 enum tls_model tls_kind = TLS_MODEL_NONE;
1180 rtx sym, addend;
1182 if (GET_CODE (addr) == CONST)
1184 split_const (addr, &sym, &addend);
1185 if (GET_CODE (sym) == SYMBOL_REF)
1186 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1188 else if (GET_CODE (addr) == SYMBOL_REF)
1189 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1191 return tls_kind;
1194 /* We allow LO_SUMs in our legitimate addresses so that combine can
1195 take care of combining addresses where necessary, but for generation
1196 purposes we generate the address as:
1198 RTL Absolute
1199 tmp = hi (symbol_ref); adrp x1, foo
1200 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1203 PIC TLS
1204 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1205 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1206 bl __tls_get_addr
1209 Load TLS symbol, depending on TLS mechanism and TLS access model.
1211 Global Dynamic - Traditional TLS:
1212 adrp tmp, :tlsgd:imm
1213 add dest, tmp, #:tlsgd_lo12:imm
1214 bl __tls_get_addr
1216 Global Dynamic - TLS Descriptors:
1217 adrp dest, :tlsdesc:imm
1218 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1219 add dest, dest, #:tlsdesc_lo12:imm
1220 blr tmp
1221 mrs tp, tpidr_el0
1222 add dest, dest, tp
1224 Initial Exec:
1225 mrs tp, tpidr_el0
1226 adrp tmp, :gottprel:imm
1227 ldr dest, [tmp, #:gottprel_lo12:imm]
1228 add dest, dest, tp
1230 Local Exec:
1231 mrs tp, tpidr_el0
1232 add t0, tp, #:tprel_hi12:imm, lsl #12
1233 add t0, t0, #:tprel_lo12_nc:imm
1236 static void
1237 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1238 enum aarch64_symbol_type type)
1240 switch (type)
1242 case SYMBOL_SMALL_ABSOLUTE:
1244 /* In ILP32, the mode of dest can be either SImode or DImode. */
1245 rtx tmp_reg = dest;
1246 machine_mode mode = GET_MODE (dest);
1248 gcc_assert (mode == Pmode || mode == ptr_mode);
1250 if (can_create_pseudo_p ())
1251 tmp_reg = gen_reg_rtx (mode);
1253 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1254 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1255 return;
1258 case SYMBOL_TINY_ABSOLUTE:
1259 emit_insn (gen_rtx_SET (dest, imm));
1260 return;
1262 case SYMBOL_SMALL_GOT_28K:
1264 machine_mode mode = GET_MODE (dest);
1265 rtx gp_rtx = pic_offset_table_rtx;
1266 rtx insn;
1267 rtx mem;
1269 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1270 here before RTL expansion. Tree IVOPTs generates RTL patterns to
1271 compute rtx costs, in which case pic_offset_table_rtx is not yet
1272 initialized. In that case there is no need to generate the first
1273 adrp instruction, as the final cost of a global variable access is
1274 one instruction.
1275 if (gp_rtx != NULL)
1277 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1278 use the page base as the GOT base, the first page may be wasted;
1279 in the worst case only 28K of space is left for the GOT).
1281 The instruction sequence generated for accessing a global variable is:
1284 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1286 Only one instruction is needed, but we must initialize
1287 pic_offset_table_rtx properly.  We generate an initialization insn for
1288 every global access and rely on CSE to remove the redundant copies.
1290 The final instruction sequence for multiple global variable
1291 accesses will look like:
1293 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1295 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1296 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1297 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1298 ... */
1300 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1301 crtl->uses_pic_offset_table = 1;
1302 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1304 if (mode != GET_MODE (gp_rtx))
1305 gp_rtx = simplify_gen_subreg (mode, gp_rtx, GET_MODE (gp_rtx), 0);
1308 if (mode == ptr_mode)
1310 if (mode == DImode)
1311 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1312 else
1313 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1315 mem = XVECEXP (SET_SRC (insn), 0, 0);
1317 else
1319 gcc_assert (mode == Pmode);
1321 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1322 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1325 /* The operand is expected to be MEM. Whenever the related insn
1326 pattern changed, above code which calculate mem should be
1327 updated. */
1328 gcc_assert (GET_CODE (mem) == MEM);
1329 MEM_READONLY_P (mem) = 1;
1330 MEM_NOTRAP_P (mem) = 1;
1331 emit_insn (insn);
1332 return;
1335 case SYMBOL_SMALL_GOT_4G:
1337 /* In ILP32, the mode of dest can be either SImode or DImode,
1338 while the got entry is always of SImode size. The mode of
1339 dest depends on how dest is used: if dest is assigned to a
1340 pointer (e.g. in the memory), it has SImode; it may have
1341 DImode if dest is dereferenced to access the memory.
1342 This is why we have to handle three different ldr_got_small
1343 patterns here (two patterns for ILP32). */
1345 rtx insn;
1346 rtx mem;
1347 rtx tmp_reg = dest;
1348 machine_mode mode = GET_MODE (dest);
1350 if (can_create_pseudo_p ())
1351 tmp_reg = gen_reg_rtx (mode);
1353 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1354 if (mode == ptr_mode)
1356 if (mode == DImode)
1357 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1358 else
1359 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1361 mem = XVECEXP (SET_SRC (insn), 0, 0);
1363 else
1365 gcc_assert (mode == Pmode);
1367 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1368 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1371 gcc_assert (GET_CODE (mem) == MEM);
1372 MEM_READONLY_P (mem) = 1;
1373 MEM_NOTRAP_P (mem) = 1;
1374 emit_insn (insn);
1375 return;
1378 case SYMBOL_SMALL_TLSGD:
1380 rtx_insn *insns;
1381 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
1383 start_sequence ();
1384 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
1385 insns = get_insns ();
1386 end_sequence ();
1388 RTL_CONST_CALL_P (insns) = 1;
1389 emit_libcall_block (insns, dest, result, imm);
1390 return;
1393 case SYMBOL_SMALL_TLSDESC:
1395 machine_mode mode = GET_MODE (dest);
1396 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1397 rtx tp;
1399 gcc_assert (mode == Pmode || mode == ptr_mode);
1401 /* In ILP32, the got entry is always of SImode size. Unlike
1402 small GOT, the dest is fixed at reg 0. */
1403 if (TARGET_ILP32)
1404 emit_insn (gen_tlsdesc_small_si (imm));
1405 else
1406 emit_insn (gen_tlsdesc_small_di (imm));
1407 tp = aarch64_load_tp (NULL);
1409 if (mode != Pmode)
1410 tp = gen_lowpart (mode, tp);
1412 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1413 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1414 return;
1417 case SYMBOL_SMALL_TLSIE:
1419 /* In ILP32, the mode of dest can be either SImode or DImode,
1420 while the got entry is always of SImode size. The mode of
1421 dest depends on how dest is used: if dest is assigned to a
1422 pointer (e.g. in the memory), it has SImode; it may have
1423 DImode if dest is dereferenced to access the memory.
1424 This is why we have to handle three different tlsie_small
1425 patterns here (two patterns for ILP32). */
1426 machine_mode mode = GET_MODE (dest);
1427 rtx tmp_reg = gen_reg_rtx (mode);
1428 rtx tp = aarch64_load_tp (NULL);
1430 if (mode == ptr_mode)
1432 if (mode == DImode)
1433 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1434 else
1436 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1437 tp = gen_lowpart (mode, tp);
1440 else
1442 gcc_assert (mode == Pmode);
1443 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1446 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1447 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1448 return;
1451 case SYMBOL_TLSLE12:
1452 case SYMBOL_TLSLE24:
1453 case SYMBOL_TLSLE32:
1454 case SYMBOL_TLSLE48:
1456 machine_mode mode = GET_MODE (dest);
1457 rtx tp = aarch64_load_tp (NULL);
1459 if (mode != Pmode)
1460 tp = gen_lowpart (mode, tp);
1462 switch (type)
1464 case SYMBOL_TLSLE12:
1465 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1466 (dest, tp, imm));
1467 break;
1468 case SYMBOL_TLSLE24:
1469 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1470 (dest, tp, imm));
1471 break;
1472 case SYMBOL_TLSLE32:
1473 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1474 (dest, imm));
1475 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1476 (dest, dest, tp));
1477 break;
1478 case SYMBOL_TLSLE48:
1479 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1480 (dest, imm));
1481 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1482 (dest, dest, tp));
1483 break;
1484 default:
1485 gcc_unreachable ();
1488 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1489 return;
1492 case SYMBOL_TINY_GOT:
1493 emit_insn (gen_ldr_got_tiny (dest, imm));
1494 return;
1496 case SYMBOL_TINY_TLSIE:
1498 machine_mode mode = GET_MODE (dest);
1499 rtx tp = aarch64_load_tp (NULL);
1501 if (mode == ptr_mode)
1503 if (mode == DImode)
1504 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1505 else
1507 tp = gen_lowpart (mode, tp);
1508 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1511 else
1513 gcc_assert (mode == Pmode);
1514 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1517 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1518 return;
1521 default:
1522 gcc_unreachable ();
1526 /* Emit a move from SRC to DEST. Assume that the move expanders can
1527 handle all moves if !can_create_pseudo_p (). The distinction is
1528 important because, unlike emit_move_insn, the move expanders know
1529 how to force Pmode objects into the constant pool even when the
1530 constant pool address is not itself legitimate. */
1531 static rtx
1532 aarch64_emit_move (rtx dest, rtx src)
1534 return (can_create_pseudo_p ()
1535 ? emit_move_insn (dest, src)
1536 : emit_move_insn_1 (dest, src));
1539 /* Split a 128-bit move operation into two 64-bit move operations,
1540 taking care to handle partial overlap of register to register
1541 copies. Special cases are needed when moving between GP regs and
1542 FP regs. SRC can be a register, constant or memory; DST a register
1543 or memory. If either operand is memory it must not have any side
1544 effects. */
1545 void
1546 aarch64_split_128bit_move (rtx dst, rtx src)
1548 rtx dst_lo, dst_hi;
1549 rtx src_lo, src_hi;
1551 machine_mode mode = GET_MODE (dst);
1553 gcc_assert (mode == TImode || mode == TFmode);
1554 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1555 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1557 if (REG_P (dst) && REG_P (src))
1559 int src_regno = REGNO (src);
1560 int dst_regno = REGNO (dst);
1562 /* Handle FP <-> GP regs. */
1563 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1565 src_lo = gen_lowpart (word_mode, src);
1566 src_hi = gen_highpart (word_mode, src);
1568 if (mode == TImode)
1570 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1571 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1573 else
1575 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1576 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1578 return;
1580 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1582 dst_lo = gen_lowpart (word_mode, dst);
1583 dst_hi = gen_highpart (word_mode, dst);
1585 if (mode == TImode)
1587 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1588 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1590 else
1592 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1593 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1595 return;
1599 dst_lo = gen_lowpart (word_mode, dst);
1600 dst_hi = gen_highpart (word_mode, dst);
1601 src_lo = gen_lowpart (word_mode, src);
1602 src_hi = gen_highpart_mode (word_mode, mode, src);
1604 /* At most one pairing may overlap. */
1605 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1607 aarch64_emit_move (dst_hi, src_hi);
1608 aarch64_emit_move (dst_lo, src_lo);
1610 else
1612 aarch64_emit_move (dst_lo, src_lo);
1613 aarch64_emit_move (dst_hi, src_hi);
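/* As an illustration of the overlap handling above, consider copying a
   TImode value from x0/x1 (low part in x0) into x1/x2: dst_lo (x1) overlaps
   src_hi (x1), so the high halves are moved first (x2 = x1) and the low
   halves second (x1 = x0); doing it in the other order would clobber x1
   before its value had been copied.  */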
1617 bool
1618 aarch64_split_128bit_move_p (rtx dst, rtx src)
1620 return (! REG_P (src)
1621 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1624 /* Split a complex SIMD combine. */
1626 void
1627 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1629 machine_mode src_mode = GET_MODE (src1);
1630 machine_mode dst_mode = GET_MODE (dst);
1632 gcc_assert (VECTOR_MODE_P (dst_mode));
1634 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1636 rtx (*gen) (rtx, rtx, rtx);
1638 switch (src_mode)
1640 case V8QImode:
1641 gen = gen_aarch64_simd_combinev8qi;
1642 break;
1643 case V4HImode:
1644 gen = gen_aarch64_simd_combinev4hi;
1645 break;
1646 case V2SImode:
1647 gen = gen_aarch64_simd_combinev2si;
1648 break;
1649 case V4HFmode:
1650 gen = gen_aarch64_simd_combinev4hf;
1651 break;
1652 case V2SFmode:
1653 gen = gen_aarch64_simd_combinev2sf;
1654 break;
1655 case DImode:
1656 gen = gen_aarch64_simd_combinedi;
1657 break;
1658 case DFmode:
1659 gen = gen_aarch64_simd_combinedf;
1660 break;
1661 default:
1662 gcc_unreachable ();
1665 emit_insn (gen (dst, src1, src2));
1666 return;
1670 /* Split a complex SIMD move. */
1672 void
1673 aarch64_split_simd_move (rtx dst, rtx src)
1675 machine_mode src_mode = GET_MODE (src);
1676 machine_mode dst_mode = GET_MODE (dst);
1678 gcc_assert (VECTOR_MODE_P (dst_mode));
1680 if (REG_P (dst) && REG_P (src))
1682 rtx (*gen) (rtx, rtx);
1684 gcc_assert (VECTOR_MODE_P (src_mode));
1686 switch (src_mode)
1688 case V16QImode:
1689 gen = gen_aarch64_split_simd_movv16qi;
1690 break;
1691 case V8HImode:
1692 gen = gen_aarch64_split_simd_movv8hi;
1693 break;
1694 case V4SImode:
1695 gen = gen_aarch64_split_simd_movv4si;
1696 break;
1697 case V2DImode:
1698 gen = gen_aarch64_split_simd_movv2di;
1699 break;
1700 case V8HFmode:
1701 gen = gen_aarch64_split_simd_movv8hf;
1702 break;
1703 case V4SFmode:
1704 gen = gen_aarch64_split_simd_movv4sf;
1705 break;
1706 case V2DFmode:
1707 gen = gen_aarch64_split_simd_movv2df;
1708 break;
1709 default:
1710 gcc_unreachable ();
1713 emit_insn (gen (dst, src));
1714 return;
1718 bool
1719 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1720 machine_mode ymode, rtx y)
1722 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1723 gcc_assert (r != NULL);
1724 return rtx_equal_p (x, r);
1728 static rtx
1729 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1731 if (can_create_pseudo_p ())
1732 return force_reg (mode, value);
1733 else
1735 x = aarch64_emit_move (x, value);
1736 return x;
1741 static rtx
1742 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1744 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1746 rtx high;
1747 /* Load the full offset into a register. This
1748 might be improvable in the future. */
1749 high = GEN_INT (offset);
1750 offset = 0;
1751 high = aarch64_force_temporary (mode, temp, high);
1752 reg = aarch64_force_temporary (mode, temp,
1753 gen_rtx_PLUS (mode, high, reg));
1755 return plus_constant (mode, reg, offset);
1758 static int
1759 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1760 machine_mode mode)
1762 int i;
1763 unsigned HOST_WIDE_INT val, val2, mask;
1764 int one_match, zero_match;
1765 int num_insns;
1767 val = INTVAL (imm);
1769 if (aarch64_move_imm (val, mode))
1771 if (generate)
1772 emit_insn (gen_rtx_SET (dest, imm));
1773 return 1;
1776 if ((val >> 32) == 0 || mode == SImode)
1778 if (generate)
1780 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1781 if (mode == SImode)
1782 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1783 GEN_INT ((val >> 16) & 0xffff)));
1784 else
1785 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1786 GEN_INT ((val >> 16) & 0xffff)));
1788 return 2;
1791 /* Remaining cases are all for DImode. */
1793 mask = 0xffff;
1794 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1795 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1796 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1797 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1799 if (zero_match != 2 && one_match != 2)
1801 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1802 For a 64-bit bitmask try whether changing 16 bits to all ones or
1803 zeroes creates a valid bitmask. To check any repeated bitmask,
1804 try using 16 bits from the other 32-bit half of val. */
1806 for (i = 0; i < 64; i += 16, mask <<= 16)
1808 val2 = val & ~mask;
1809 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1810 break;
1811 val2 = val | mask;
1812 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1813 break;
1814 val2 = val2 & ~mask;
1815 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1816 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1817 break;
1819 if (i != 64)
1821 if (generate)
1823 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1824 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1825 GEN_INT ((val >> i) & 0xffff)));
1827 return 2;
1831 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1832 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1833 otherwise skip zero bits. */
1835 num_insns = 1;
1836 mask = 0xffff;
1837 val2 = one_match > zero_match ? ~val : val;
1838 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1840 if (generate)
1841 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1842 ? (val | ~(mask << i))
1843 : (val & (mask << i)))));
1844 for (i += 16; i < 64; i += 16)
1846 if ((val2 & (mask << i)) == 0)
1847 continue;
1848 if (generate)
1849 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1850 GEN_INT ((val >> i) & 0xffff)));
1851 num_insns ++;
1854 return num_insns;
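/* A worked example (destination register chosen arbitrarily): for the
   DImode constant 0x0000cafe0000beef two of the four 16-bit halfwords are
   zero, so the code above emits one MOV for the lowest non-zero halfword
   followed by one MOVK for the remaining one:

     mov   x0, #0xbeef
     movk  x0, #0xcafe, lsl #32

   giving a count of 2 instructions.  */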
1858 void
1859 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1861 machine_mode mode = GET_MODE (dest);
1863 gcc_assert (mode == SImode || mode == DImode);
1865 /* Check on what type of symbol it is. */
1866 if (GET_CODE (imm) == SYMBOL_REF
1867 || GET_CODE (imm) == LABEL_REF
1868 || GET_CODE (imm) == CONST)
1870 rtx mem, base, offset;
1871 enum aarch64_symbol_type sty;
1873 /* If we have (const (plus symbol offset)), separate out the offset
1874 before we start classifying the symbol. */
1875 split_const (imm, &base, &offset);
1877 sty = aarch64_classify_symbol (base, offset);
1878 switch (sty)
1880 case SYMBOL_FORCE_TO_MEM:
1881 if (offset != const0_rtx
1882 && targetm.cannot_force_const_mem (mode, imm))
1884 gcc_assert (can_create_pseudo_p ());
1885 base = aarch64_force_temporary (mode, dest, base);
1886 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1887 aarch64_emit_move (dest, base);
1888 return;
1891 mem = force_const_mem (ptr_mode, imm);
1892 gcc_assert (mem);
1894 /* If we aren't generating PC relative literals, then
1895 we need to expand the literal pool access carefully.
1896 This is something that needs to be done in a number
1897 of places, so could well live as a separate function. */
1898 if (!aarch64_pcrelative_literal_loads)
1900 gcc_assert (can_create_pseudo_p ());
1901 base = gen_reg_rtx (ptr_mode);
1902 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1903 mem = gen_rtx_MEM (ptr_mode, base);
1906 if (mode != ptr_mode)
1907 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1909 emit_insn (gen_rtx_SET (dest, mem));
1911 return;
1913 case SYMBOL_SMALL_TLSGD:
1914 case SYMBOL_SMALL_TLSDESC:
1915 case SYMBOL_SMALL_TLSIE:
1916 case SYMBOL_SMALL_GOT_28K:
1917 case SYMBOL_SMALL_GOT_4G:
1918 case SYMBOL_TINY_GOT:
1919 case SYMBOL_TINY_TLSIE:
1920 if (offset != const0_rtx)
1922 gcc_assert(can_create_pseudo_p ());
1923 base = aarch64_force_temporary (mode, dest, base);
1924 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1925 aarch64_emit_move (dest, base);
1926 return;
1928 /* FALLTHRU */
1930 case SYMBOL_SMALL_ABSOLUTE:
1931 case SYMBOL_TINY_ABSOLUTE:
1932 case SYMBOL_TLSLE12:
1933 case SYMBOL_TLSLE24:
1934 case SYMBOL_TLSLE32:
1935 case SYMBOL_TLSLE48:
1936 aarch64_load_symref_appropriately (dest, imm, sty);
1937 return;
1939 default:
1940 gcc_unreachable ();
1944 if (!CONST_INT_P (imm))
1946 if (GET_CODE (imm) == HIGH)
1947 emit_insn (gen_rtx_SET (dest, imm));
1948 else
1950 rtx mem = force_const_mem (mode, imm);
1951 gcc_assert (mem);
1952 emit_insn (gen_rtx_SET (dest, mem));
1955 return;
1958 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1961 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold
1962 an intermediate value if necessary.
1964 This function is sometimes used to adjust the stack pointer, so we must
1965 ensure that it can never cause transient stack deallocation by writing an
1966 invalid value into REGNUM. */
1968 static void
1969 aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
1970 HOST_WIDE_INT delta, bool frame_related_p)
1972 HOST_WIDE_INT mdelta = abs_hwi (delta);
1973 rtx this_rtx = gen_rtx_REG (mode, regnum);
1974 rtx_insn *insn;
1976 /* Do nothing if mdelta is zero. */
1977 if (!mdelta)
1978 return;
1980 /* We only need a single instruction if the offset fits into add/sub. */
1981 if (aarch64_uimm12_shift (mdelta))
1983 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
1984 RTX_FRAME_RELATED_P (insn) = frame_related_p;
1985 return;
1988 /* We need two add/sub instructions, each one performing part of the
1989 calculation. Don't do this if the addend can be loaded into a register
1990 with a single instruction; in that case we prefer a move to a scratch
1991 register followed by an addition. */
1992 if (mdelta < 0x1000000 && !aarch64_move_imm (delta, mode))
1994 HOST_WIDE_INT low_off = mdelta & 0xfff;
1996 low_off = delta < 0 ? -low_off : low_off;
1997 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
1998 RTX_FRAME_RELATED_P (insn) = frame_related_p;
1999 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2000 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2001 return;
2004 /* Otherwise use generic function to handle all other situations. */
2005 rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2006 aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (delta), true, mode);
2007 insn = emit_insn (gen_add2_insn (this_rtx, scratch_rtx));
2008 if (frame_related_p)
2010 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2011 rtx adj = plus_constant (mode, this_rtx, delta);
2012 add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
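/* A worked example: DELTA == 0x123456 does not fit a (possibly shifted)
   12-bit immediate and cannot be built with a single MOV, but it is below
   0x1000000, so the two-addition path above is used:

     add  x0, x0, #0x456
     add  x0, x0, #0x123, lsl #12

   (register chosen arbitrarily).  Because both additions have the same
   sign, every intermediate value of the register stays between the old and
   new values, which is what keeps stack-pointer adjustments safe.  */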
2016 static bool
2017 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2018 tree exp ATTRIBUTE_UNUSED)
2020 /* Currently, always true. */
2021 return true;
2024 /* Implement TARGET_PASS_BY_REFERENCE. */
2026 static bool
2027 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2028 machine_mode mode,
2029 const_tree type,
2030 bool named ATTRIBUTE_UNUSED)
2032 HOST_WIDE_INT size;
2033 machine_mode dummymode;
2034 int nregs;
2036 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2037 size = (mode == BLKmode && type)
2038 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2040 /* Aggregates are passed by reference based on their size. */
2041 if (type && AGGREGATE_TYPE_P (type))
2043 size = int_size_in_bytes (type);
2046 /* Variable sized arguments are always passed by reference. */
2047 if (size < 0)
2048 return true;
2050 /* Can this be a candidate to be passed in fp/simd register(s)? */
2051 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2052 &dummymode, &nregs,
2053 NULL))
2054 return false;
2056 /* Arguments which are variable sized or larger than 2 registers are
2057      passed by reference unless they are a homogeneous floating-point
2058 aggregate. */
2059 return size > 2 * UNITS_PER_WORD;
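/* For example: a struct of four doubles (32 bytes) is an HFA and hence a
   SIMD/FP candidate, so the function above returns false and it is passed by
   value in vector registers despite being larger than two GP registers; a
   struct of three 64-bit integers (24 bytes) is not a candidate and exceeds
   2 * UNITS_PER_WORD, so it is passed by reference.  */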
2062 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2063 static bool
2064 aarch64_return_in_msb (const_tree valtype)
2066 machine_mode dummy_mode;
2067 int dummy_int;
2069 /* Never happens in little-endian mode. */
2070 if (!BYTES_BIG_ENDIAN)
2071 return false;
2073 /* Only composite types smaller than or equal to 16 bytes can
2074 be potentially returned in registers. */
2075 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2076 || int_size_in_bytes (valtype) <= 0
2077 || int_size_in_bytes (valtype) > 16)
2078 return false;
2080 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2081 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2082 is always passed/returned in the least significant bits of fp/simd
2083 register(s). */
2084 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2085 &dummy_mode, &dummy_int, NULL))
2086 return false;
2088 return true;
2091 /* Implement TARGET_FUNCTION_VALUE.
2092 Define how to find the value returned by a function. */
2094 static rtx
2095 aarch64_function_value (const_tree type, const_tree func,
2096 bool outgoing ATTRIBUTE_UNUSED)
2098 machine_mode mode;
2099 int unsignedp;
2100 int count;
2101 machine_mode ag_mode;
2103 mode = TYPE_MODE (type);
2104 if (INTEGRAL_TYPE_P (type))
2105 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2107 if (aarch64_return_in_msb (type))
2109 HOST_WIDE_INT size = int_size_in_bytes (type);
2111 if (size % UNITS_PER_WORD != 0)
2113 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2114 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
2118 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2119 &ag_mode, &count, NULL))
2121 if (!aarch64_composite_type_p (type, mode))
2123 gcc_assert (count == 1 && mode == ag_mode);
2124 return gen_rtx_REG (mode, V0_REGNUM);
2126 else
2128 int i;
2129 rtx par;
2131 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2132 for (i = 0; i < count; i++)
2134 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2135 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2136 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2137 XVECEXP (par, 0, i) = tmp;
2139 return par;
2142 else
2143 return gen_rtx_REG (mode, R0_REGNUM);
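/* For instance, a return type of "struct { float a, b; }" is an HFA with
   count == 2 and ag_mode == SFmode, so the composite branch above builds a
   PARALLEL of (reg:SF v0) at offset 0 and (reg:SF v1) at offset 4, while a
   plain "long" is promoted and returned directly in x0.  */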
2146 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2147 Return true if REGNO is the number of a hard register in which the values
2148 of called function may come back. */
2150 static bool
2151 aarch64_function_value_regno_p (const unsigned int regno)
2153   /* A maximum of 16 bytes can be returned in the general registers.  Examples
2154 of 16-byte return values are: 128-bit integers and 16-byte small
2155 structures (excluding homogeneous floating-point aggregates). */
2156 if (regno == R0_REGNUM || regno == R1_REGNUM)
2157 return true;
2159 /* Up to four fp/simd registers can return a function value, e.g. a
2160 homogeneous floating-point aggregate having four members. */
2161 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2162 return TARGET_FLOAT;
2164 return false;
2167 /* Implement TARGET_RETURN_IN_MEMORY.
2169 If the type T of the result of a function is such that
2170 void func (T arg)
2171 would require that arg be passed as a value in a register (or set of
2172 registers) according to the parameter passing rules, then the result
2173 is returned in the same registers as would be used for such an
2174 argument. */
2176 static bool
2177 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2179 HOST_WIDE_INT size;
2180 machine_mode ag_mode;
2181 int count;
2183 if (!AGGREGATE_TYPE_P (type)
2184 && TREE_CODE (type) != COMPLEX_TYPE
2185 && TREE_CODE (type) != VECTOR_TYPE)
2186     /* Simple scalar types are always returned in registers.  */
2187 return false;
2189 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2190 type,
2191 &ag_mode,
2192 &count,
2193 NULL))
2194 return false;
2196   /* Types larger than 2 registers are returned in memory.  */
2197 size = int_size_in_bytes (type);
2198 return (size < 0 || size > 2 * UNITS_PER_WORD);
2201 static bool
2202 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2203 const_tree type, int *nregs)
2205 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2206 return aarch64_vfp_is_call_or_return_candidate (mode,
2207 type,
2208 &pcum->aapcs_vfp_rmode,
2209 nregs,
2210 NULL);
2213 /* Given MODE and TYPE of a function argument, return the alignment in
2214 bits. The idea is to suppress any stronger alignment requested by
2215 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2216 This is a helper function for local use only. */
2218 static unsigned int
2219 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2221 if (!type)
2222 return GET_MODE_ALIGNMENT (mode);
2223 if (integer_zerop (TYPE_SIZE (type)))
2224 return 0;
2226 gcc_assert (TYPE_MODE (type) == mode);
2228 if (!AGGREGATE_TYPE_P (type))
2229 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2231 if (TREE_CODE (type) == ARRAY_TYPE)
2232 return TYPE_ALIGN (TREE_TYPE (type));
2234 unsigned int alignment = 0;
2236 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2237 alignment = std::max (alignment, DECL_ALIGN (field));
2239 return alignment;
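/* For example, "struct { int a; double b; }" yields 64 here (the alignment of
   the double member), whereas a type containing an __int128 member yields
   128; the latter is the 16-byte-alignment case that triggers rule C.8 in
   aarch64_layout_arg below.  */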
2242 /* Layout a function argument according to the AAPCS64 rules. The rule
2243 numbers refer to the rule numbers in the AAPCS64. */
2245 static void
2246 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2247 const_tree type,
2248 bool named ATTRIBUTE_UNUSED)
2250 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2251 int ncrn, nvrn, nregs;
2252 bool allocate_ncrn, allocate_nvrn;
2253 HOST_WIDE_INT size;
2255 /* We need to do this once per argument. */
2256 if (pcum->aapcs_arg_processed)
2257 return;
2259 pcum->aapcs_arg_processed = true;
2261 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
2262 size
2263 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2264 UNITS_PER_WORD);
2266 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2267 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2268 mode,
2269 type,
2270 &nregs);
2272   /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
2273 The following code thus handles passing by SIMD/FP registers first. */
2275 nvrn = pcum->aapcs_nvrn;
2277   /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
2278      and homogeneous short-vector aggregates (HVA).  */
2279 if (allocate_nvrn)
2281 if (!TARGET_FLOAT)
2282 aarch64_err_no_fpadvsimd (mode, "argument");
2284 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2286 pcum->aapcs_nextnvrn = nvrn + nregs;
2287 if (!aarch64_composite_type_p (type, mode))
2289 gcc_assert (nregs == 1);
2290 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2292 else
2294 rtx par;
2295 int i;
2296 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2297 for (i = 0; i < nregs; i++)
2299 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2300 V0_REGNUM + nvrn + i);
2301 tmp = gen_rtx_EXPR_LIST
2302 (VOIDmode, tmp,
2303 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2304 XVECEXP (par, 0, i) = tmp;
2306 pcum->aapcs_reg = par;
2308 return;
2310 else
2312 /* C.3 NSRN is set to 8. */
2313 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2314 goto on_stack;
2318 ncrn = pcum->aapcs_ncrn;
2319 nregs = size / UNITS_PER_WORD;
2321   /* C6 - C9, though the sign and zero extension semantics are
2322      handled elsewhere.  This is the case where the argument fits
2323      entirely in general registers.  */
2324 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2326 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2328 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2330 /* C.8 if the argument has an alignment of 16 then the NGRN is
2331 rounded up to the next even number. */
2332 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
2334 ++ncrn;
2335 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2337 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2338 A reg is still generated for it, but the caller should be smart
2339 enough not to use it. */
2340 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2342 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2344 else
2346 rtx par;
2347 int i;
2349 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2350 for (i = 0; i < nregs; i++)
2352 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2353 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2354 GEN_INT (i * UNITS_PER_WORD));
2355 XVECEXP (par, 0, i) = tmp;
2357 pcum->aapcs_reg = par;
2360 pcum->aapcs_nextncrn = ncrn + nregs;
2361 return;
2364 /* C.11 */
2365 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2367   /* The argument is passed on the stack; record the needed number of words for
2368 this argument and align the total size if necessary. */
2369 on_stack:
2370 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2371 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2372 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2373 16 / UNITS_PER_WORD);
2374 return;
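/* As an example of rule C.8 above: for a call such as
   "void f (int a, __int128 b)" the int occupies w0, leaving NGRN == 1; since
   __int128 requires 16-byte alignment, NGRN is rounded up to 2, so B is
   passed in the register pair x2/x3 and x1 is left unused.  */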
2377 /* Implement TARGET_FUNCTION_ARG. */
2379 static rtx
2380 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2381 const_tree type, bool named)
2383 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2384 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2386 if (mode == VOIDmode)
2387 return NULL_RTX;
2389 aarch64_layout_arg (pcum_v, mode, type, named);
2390 return pcum->aapcs_reg;
2393 void
2394 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2395 const_tree fntype ATTRIBUTE_UNUSED,
2396 rtx libname ATTRIBUTE_UNUSED,
2397 const_tree fndecl ATTRIBUTE_UNUSED,
2398 unsigned n_named ATTRIBUTE_UNUSED)
2400 pcum->aapcs_ncrn = 0;
2401 pcum->aapcs_nvrn = 0;
2402 pcum->aapcs_nextncrn = 0;
2403 pcum->aapcs_nextnvrn = 0;
2404 pcum->pcs_variant = ARM_PCS_AAPCS64;
2405 pcum->aapcs_reg = NULL_RTX;
2406 pcum->aapcs_arg_processed = false;
2407 pcum->aapcs_stack_words = 0;
2408 pcum->aapcs_stack_size = 0;
2410 if (!TARGET_FLOAT
2411 && fndecl && TREE_PUBLIC (fndecl)
2412 && fntype && fntype != error_mark_node)
2414 const_tree type = TREE_TYPE (fntype);
2415 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2416 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2417 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2418 &mode, &nregs, NULL))
2419 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2421 return;
2424 static void
2425 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2426 machine_mode mode,
2427 const_tree type,
2428 bool named)
2430 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2431 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2433 aarch64_layout_arg (pcum_v, mode, type, named);
2434 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2435 != (pcum->aapcs_stack_words != 0));
2436 pcum->aapcs_arg_processed = false;
2437 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2438 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2439 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2440 pcum->aapcs_stack_words = 0;
2441 pcum->aapcs_reg = NULL_RTX;
2445 bool
2446 aarch64_function_arg_regno_p (unsigned regno)
2448 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2449 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2452 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2453 PARM_BOUNDARY bits of alignment, but will be given anything up
2454 to STACK_BOUNDARY bits if the type requires it. This makes sure
2455 that both before and after the layout of each argument, the Next
2456 Stacked Argument Address (NSAA) will have a minimum alignment of
2457 8 bytes. */
2459 static unsigned int
2460 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2462 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2464 if (alignment < PARM_BOUNDARY)
2465 alignment = PARM_BOUNDARY;
2466 if (alignment > STACK_BOUNDARY)
2467 alignment = STACK_BOUNDARY;
2468 return alignment;
2471 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2473 Return true if an argument passed on the stack should be padded upwards,
2474 i.e. if the least-significant byte of the stack slot has useful data.
2476 Small aggregate types are placed in the lowest memory address.
2478 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2480 bool
2481 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2483 /* On little-endian targets, the least significant byte of every stack
2484 argument is passed at the lowest byte address of the stack slot. */
2485 if (!BYTES_BIG_ENDIAN)
2486 return true;
2488 /* Otherwise, integral, floating-point and pointer types are padded downward:
2489 the least significant byte of a stack argument is passed at the highest
2490 byte address of the stack slot. */
2491 if (type
2492 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2493 || POINTER_TYPE_P (type))
2494 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2495 return false;
2497 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2498 return true;
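/* For example, on big-endian a 3-byte struct passed on the stack is padded
   upward (its bytes occupy the low addresses of the 8-byte slot), whereas a
   stack-passed "int" is padded downward (its 4 bytes occupy the high
   addresses of the slot).  On little-endian everything is padded upward.  */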
2501 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2503 It specifies padding for the last (may also be the only)
2504    element of a block move between registers and memory.  Assuming
2505    the block is in memory, padding upward means that
2506    the last element is padded after its most significant byte,
2507    while with downward padding the last element is padded at
2508    its least significant byte side.
2510 Small aggregates and small complex types are always padded
2511 upwards.
2513 We don't need to worry about homogeneous floating-point or
2514 short-vector aggregates; their move is not affected by the
2515 padding direction determined here. Regardless of endianness,
2516 each element of such an aggregate is put in the least
2517 significant bits of a fp/simd register.
2519 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2520 register has useful data, and return the opposite if the most
2521 significant byte does. */
2523 bool
2524 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2525 bool first ATTRIBUTE_UNUSED)
2528 /* Small composite types are always padded upward. */
2529 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2531 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2532 : GET_MODE_SIZE (mode));
2533 if (size < 2 * UNITS_PER_WORD)
2534 return true;
2537 /* Otherwise, use the default padding. */
2538 return !BYTES_BIG_ENDIAN;
2541 static machine_mode
2542 aarch64_libgcc_cmp_return_mode (void)
2544 return SImode;
2547 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2549 /* We use the 12-bit shifted immediate arithmetic instructions so values
2550    must be a multiple of (1 << 12), i.e. 4096.  */
2551 #define ARITH_FACTOR 4096
2553 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2554 #error Cannot use simple address calculation for stack probing
2555 #endif
2557 /* The pair of scratch registers used for stack probing. */
2558 #define PROBE_STACK_FIRST_REG 9
2559 #define PROBE_STACK_SECOND_REG 10
2561 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2562 inclusive. These are offsets from the current stack pointer. */
2564 static void
2565 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2567 rtx reg1 = gen_rtx_REG (ptr_mode, PROBE_STACK_FIRST_REG);
2569 /* See the same assertion on PROBE_INTERVAL above. */
2570 gcc_assert ((first % ARITH_FACTOR) == 0);
2572 /* See if we have a constant small number of probes to generate. If so,
2573 that's the easy case. */
2574 if (size <= PROBE_INTERVAL)
2576 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2578 emit_set_insn (reg1,
2579 plus_constant (ptr_mode,
2580 stack_pointer_rtx, -(first + base)));
2581 emit_stack_probe (plus_constant (ptr_mode, reg1, base - size));
2584 /* The run-time loop is made up of 8 insns in the generic case while the
2585      compile-time loop is made up of 4+2*(n-2) insns for n intervals.  */
2586 else if (size <= 4 * PROBE_INTERVAL)
2588 HOST_WIDE_INT i, rem;
2590 emit_set_insn (reg1,
2591 plus_constant (ptr_mode,
2592 stack_pointer_rtx,
2593 -(first + PROBE_INTERVAL)));
2594 emit_stack_probe (reg1);
2596 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2597 it exceeds SIZE. If only two probes are needed, this will not
2598 generate any code. Then probe at FIRST + SIZE. */
2599 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2601 emit_set_insn (reg1,
2602 plus_constant (ptr_mode, reg1, -PROBE_INTERVAL));
2603 emit_stack_probe (reg1);
2606 rem = size - (i - PROBE_INTERVAL);
2607 if (rem > 256)
2609 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2611 emit_set_insn (reg1, plus_constant (ptr_mode, reg1, -base));
2612 emit_stack_probe (plus_constant (ptr_mode, reg1, base - rem));
2614 else
2615 emit_stack_probe (plus_constant (ptr_mode, reg1, -rem));
2618 /* Otherwise, do the same as above, but in a loop. Note that we must be
2619 extra careful with variables wrapping around because we might be at
2620 the very top (or the very bottom) of the address space and we have
2621 to be able to handle this case properly; in particular, we use an
2622 equality test for the loop condition. */
2623 else
2625 rtx reg2 = gen_rtx_REG (ptr_mode, PROBE_STACK_SECOND_REG);
2627 /* Step 1: round SIZE to the previous multiple of the interval. */
2629 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2632 /* Step 2: compute initial and final value of the loop counter. */
2634 /* TEST_ADDR = SP + FIRST. */
2635 emit_set_insn (reg1,
2636 plus_constant (ptr_mode, stack_pointer_rtx, -first));
2638 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2639 emit_set_insn (reg2,
2640 plus_constant (ptr_mode, stack_pointer_rtx,
2641 -(first + rounded_size)));
2644 /* Step 3: the loop
2648 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2649 probe at TEST_ADDR
2651 while (TEST_ADDR != LAST_ADDR)
2653 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2654 until it is equal to ROUNDED_SIZE. */
2656 if (ptr_mode == DImode)
2657 emit_insn (gen_probe_stack_range_di (reg1, reg1, reg2));
2658 else
2659 emit_insn (gen_probe_stack_range_si (reg1, reg1, reg2));
2662 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2663 that SIZE is equal to ROUNDED_SIZE. */
2665 if (size != rounded_size)
2667 HOST_WIDE_INT rem = size - rounded_size;
2669 if (rem > 256)
2671 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2673 emit_set_insn (reg2, plus_constant (ptr_mode, reg2, -base));
2674 emit_stack_probe (plus_constant (ptr_mode, reg2, base - rem));
2676 else
2677 emit_stack_probe (plus_constant (ptr_mode, reg2, -rem));
2681 /* Make sure nothing is scheduled before we are done. */
2682 emit_insn (gen_blockage ());
2685 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2686 absolute addresses. */
2688 const char *
2689 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2691 static int labelno = 0;
2692 char loop_lab[32];
2693 rtx xops[2];
2695 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2697 /* Loop. */
2698 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2700 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2701 xops[0] = reg1;
2702 xops[1] = GEN_INT (PROBE_INTERVAL);
2703 output_asm_insn ("sub\t%0, %0, %1", xops);
2705 /* Probe at TEST_ADDR. */
2706 output_asm_insn ("str\txzr, [%0]", xops);
2708 /* Test if TEST_ADDR == LAST_ADDR. */
2709 xops[1] = reg2;
2710 output_asm_insn ("cmp\t%0, %1", xops);
2712 /* Branch. */
2713 fputs ("\tb.ne\t", asm_out_file);
2714 assemble_name_raw (asm_out_file, loop_lab);
2715 fputc ('\n', asm_out_file);
2717 return "";
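/* Assuming the default PROBE_INTERVAL of 4096 and the x9/x10 scratch pair
   defined above, the routine therefore emits a probe loop of roughly this
   form:

   .LPSRL0:
       sub     x9, x9, #4096
       str     xzr, [x9]
       cmp     x9, x10
       b.ne    .LPSRL0
   */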
2720 static bool
2721 aarch64_frame_pointer_required (void)
2723 /* In aarch64_override_options_after_change
2724 flag_omit_leaf_frame_pointer turns off the frame pointer by
2725 default. Turn it back on now if we've not got a leaf
2726 function. */
2727 if (flag_omit_leaf_frame_pointer
2728 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2729 return true;
2731 return false;
2734 /* Mark the registers that need to be saved by the callee and calculate
2735 the size of the callee-saved registers area and frame record (both FP
2736 and LR may be omitted). */
2737 static void
2738 aarch64_layout_frame (void)
2740 HOST_WIDE_INT offset = 0;
2741 int regno;
2743 if (reload_completed && cfun->machine->frame.laid_out)
2744 return;
2746 #define SLOT_NOT_REQUIRED (-2)
2747 #define SLOT_REQUIRED (-1)
2749 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2750 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2752 /* First mark all the registers that really need to be saved... */
2753 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2754 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2756 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2757 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2759 /* ... that includes the eh data registers (if needed)... */
2760 if (crtl->calls_eh_return)
2761 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2762 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2763 = SLOT_REQUIRED;
2765 /* ... and any callee saved register that dataflow says is live. */
2766 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2767 if (df_regs_ever_live_p (regno)
2768 && (regno == R30_REGNUM
2769 || !call_used_regs[regno]))
2770 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2772 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2773 if (df_regs_ever_live_p (regno)
2774 && !call_used_regs[regno])
2775 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2777 if (frame_pointer_needed)
2779 /* FP and LR are placed in the linkage record. */
2780 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2781 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2782 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2783 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2784 offset += 2 * UNITS_PER_WORD;
2787 /* Now assign stack slots for them. */
2788 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2789 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2791 cfun->machine->frame.reg_offset[regno] = offset;
2792 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2793 cfun->machine->frame.wb_candidate1 = regno;
2794 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2795 cfun->machine->frame.wb_candidate2 = regno;
2796 offset += UNITS_PER_WORD;
2799 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2800 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2802 cfun->machine->frame.reg_offset[regno] = offset;
2803 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2804 cfun->machine->frame.wb_candidate1 = regno;
2805 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2806 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2807 cfun->machine->frame.wb_candidate2 = regno;
2808 offset += UNITS_PER_WORD;
2811 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2813 cfun->machine->frame.saved_regs_size = offset;
2815 HOST_WIDE_INT varargs_and_saved_regs_size
2816 = offset + cfun->machine->frame.saved_varargs_size;
2818 cfun->machine->frame.hard_fp_offset
2819 = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2820 STACK_BOUNDARY / BITS_PER_UNIT);
2822 cfun->machine->frame.frame_size
2823 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2824 + crtl->outgoing_args_size,
2825 STACK_BOUNDARY / BITS_PER_UNIT);
2827 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2829 cfun->machine->frame.initial_adjust = 0;
2830 cfun->machine->frame.final_adjust = 0;
2831 cfun->machine->frame.callee_adjust = 0;
2832 cfun->machine->frame.callee_offset = 0;
2834 HOST_WIDE_INT max_push_offset = 0;
2835 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2836 max_push_offset = 512;
2837 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2838 max_push_offset = 256;
2840 if (cfun->machine->frame.frame_size < max_push_offset
2841 && crtl->outgoing_args_size == 0)
2843 /* Simple, small frame with no outgoing arguments:
2844 stp reg1, reg2, [sp, -frame_size]!
2845 stp reg3, reg4, [sp, 16] */
2846 cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2848 else if ((crtl->outgoing_args_size
2849 + cfun->machine->frame.saved_regs_size < 512)
2850 && !(cfun->calls_alloca
2851 && cfun->machine->frame.hard_fp_offset < max_push_offset))
2853 /* Frame with small outgoing arguments:
2854 sub sp, sp, frame_size
2855 stp reg1, reg2, [sp, outgoing_args_size]
2856 stp reg3, reg4, [sp, outgoing_args_size + 16] */
2857 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
2858 cfun->machine->frame.callee_offset
2859 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
2861 else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
2863 /* Frame with large outgoing arguments but a small local area:
2864 stp reg1, reg2, [sp, -hard_fp_offset]!
2865 stp reg3, reg4, [sp, 16]
2866 sub sp, sp, outgoing_args_size */
2867 cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
2868 cfun->machine->frame.final_adjust
2869 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2871 else if (!frame_pointer_needed
2872 && varargs_and_saved_regs_size < max_push_offset)
2874 /* Frame with large local area and outgoing arguments (this pushes the
2875 callee-saves first, followed by the locals and outgoing area):
2876 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
2877 stp reg3, reg4, [sp, 16]
2878 sub sp, sp, frame_size - varargs_and_saved_regs_size */
2879 cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
2880 cfun->machine->frame.final_adjust
2881 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2882 cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
2883 cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
2885 else
2887 /* Frame with large local area and outgoing arguments using frame pointer:
2888 sub sp, sp, hard_fp_offset
2889 stp x29, x30, [sp, 0]
2890 add x29, sp, 0
2891 stp reg3, reg4, [sp, 16]
2892 sub sp, sp, outgoing_args_size */
2893 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
2894 cfun->machine->frame.final_adjust
2895 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
2898 cfun->machine->frame.laid_out = true;
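/* A concrete instance of the first ("simple, small frame") case above: a
   function that only needs to save x29/x30 and has 16 bytes of locals and no
   outgoing arguments ends up with frame_size == 32 < 512, so callee_adjust is
   set to 32 and the whole stack adjustment is folded into a single
       stp x29, x30, [sp, -32]!
   with no separate "sub sp, sp, ...".  */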
2901 static bool
2902 aarch64_register_saved_on_entry (int regno)
2904 return cfun->machine->frame.reg_offset[regno] >= 0;
2907 static unsigned
2908 aarch64_next_callee_save (unsigned regno, unsigned limit)
2910 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2911 regno ++;
2912 return regno;
2915 static void
2916 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2917 HOST_WIDE_INT adjustment)
2919 rtx base_rtx = stack_pointer_rtx;
2920 rtx insn, reg, mem;
2922 reg = gen_rtx_REG (mode, regno);
2923 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2924 plus_constant (Pmode, base_rtx, -adjustment));
2925 mem = gen_rtx_MEM (mode, mem);
2927 insn = emit_move_insn (mem, reg);
2928 RTX_FRAME_RELATED_P (insn) = 1;
2931 static rtx
2932 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2933 HOST_WIDE_INT adjustment)
2935 switch (mode)
2937 case DImode:
2938 return gen_storewb_pairdi_di (base, base, reg, reg2,
2939 GEN_INT (-adjustment),
2940 GEN_INT (UNITS_PER_WORD - adjustment));
2941 case DFmode:
2942 return gen_storewb_pairdf_di (base, base, reg, reg2,
2943 GEN_INT (-adjustment),
2944 GEN_INT (UNITS_PER_WORD - adjustment));
2945 default:
2946 gcc_unreachable ();
2950 static void
2951 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
2953 rtx_insn *insn;
2954 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
2956 if (regno2 == INVALID_REGNUM)
2957 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
2959 rtx reg1 = gen_rtx_REG (mode, regno1);
2960 rtx reg2 = gen_rtx_REG (mode, regno2);
2962 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2963 reg2, adjustment));
2964 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2965 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2966 RTX_FRAME_RELATED_P (insn) = 1;
2969 static rtx
2970 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2971 HOST_WIDE_INT adjustment)
2973 switch (mode)
2975 case DImode:
2976 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2977 GEN_INT (UNITS_PER_WORD));
2978 case DFmode:
2979 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2980 GEN_INT (UNITS_PER_WORD));
2981 default:
2982 gcc_unreachable ();
2986 static void
2987 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
2988 rtx *cfi_ops)
2990 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
2991 rtx reg1 = gen_rtx_REG (mode, regno1);
2993 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
2995 if (regno2 == INVALID_REGNUM)
2997 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
2998 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2999 emit_move_insn (reg1, gen_rtx_MEM (mode, mem));
3001 else
3003 rtx reg2 = gen_rtx_REG (mode, regno2);
3004 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3005 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3006 reg2, adjustment));
3010 static rtx
3011 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3012 rtx reg2)
3014 switch (mode)
3016 case DImode:
3017 return gen_store_pairdi (mem1, reg1, mem2, reg2);
3019 case DFmode:
3020 return gen_store_pairdf (mem1, reg1, mem2, reg2);
3022 default:
3023 gcc_unreachable ();
3027 static rtx
3028 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3029 rtx mem2)
3031 switch (mode)
3033 case DImode:
3034 return gen_load_pairdi (reg1, mem1, reg2, mem2);
3036 case DFmode:
3037 return gen_load_pairdf (reg1, mem1, reg2, mem2);
3039 default:
3040 gcc_unreachable ();
3045 static void
3046 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3047 unsigned start, unsigned limit, bool skip_wb)
3049 rtx_insn *insn;
3050 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3051 ? gen_frame_mem : gen_rtx_MEM);
3052 unsigned regno;
3053 unsigned regno2;
3055 for (regno = aarch64_next_callee_save (start, limit);
3056 regno <= limit;
3057 regno = aarch64_next_callee_save (regno + 1, limit))
3059 rtx reg, mem;
3060 HOST_WIDE_INT offset;
3062 if (skip_wb
3063 && (regno == cfun->machine->frame.wb_candidate1
3064 || regno == cfun->machine->frame.wb_candidate2))
3065 continue;
3067 reg = gen_rtx_REG (mode, regno);
3068 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3069 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3070 offset));
3072 regno2 = aarch64_next_callee_save (regno + 1, limit);
3074 if (regno2 <= limit
3075 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3076 == cfun->machine->frame.reg_offset[regno2]))
3079 rtx reg2 = gen_rtx_REG (mode, regno2);
3080 rtx mem2;
3082 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3083 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3084 offset));
3085 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3086 reg2));
3088 /* The first part of a frame-related parallel insn is
3089 always assumed to be relevant to the frame
3090 	     calculations; subsequent parts are only
3091 frame-related if explicitly marked. */
3092 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3093 regno = regno2;
3095 else
3096 insn = emit_move_insn (mem, reg);
3098 RTX_FRAME_RELATED_P (insn) = 1;
3102 static void
3103 aarch64_restore_callee_saves (machine_mode mode,
3104 HOST_WIDE_INT start_offset, unsigned start,
3105 unsigned limit, bool skip_wb, rtx *cfi_ops)
3107 rtx base_rtx = stack_pointer_rtx;
3108 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3109 ? gen_frame_mem : gen_rtx_MEM);
3110 unsigned regno;
3111 unsigned regno2;
3112 HOST_WIDE_INT offset;
3114 for (regno = aarch64_next_callee_save (start, limit);
3115 regno <= limit;
3116 regno = aarch64_next_callee_save (regno + 1, limit))
3118 rtx reg, mem;
3120 if (skip_wb
3121 && (regno == cfun->machine->frame.wb_candidate1
3122 || regno == cfun->machine->frame.wb_candidate2))
3123 continue;
3125 reg = gen_rtx_REG (mode, regno);
3126 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3127 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3129 regno2 = aarch64_next_callee_save (regno + 1, limit);
3131 if (regno2 <= limit
3132 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3133 == cfun->machine->frame.reg_offset[regno2]))
3135 rtx reg2 = gen_rtx_REG (mode, regno2);
3136 rtx mem2;
3138 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3139 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3140 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3142 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3143 regno = regno2;
3145 else
3146 emit_move_insn (reg, mem);
3147 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3151 /* AArch64 stack frames generated by this compiler look like:
3153 +-------------------------------+
3155 | incoming stack arguments |
3157 +-------------------------------+
3158 | | <-- incoming stack pointer (aligned)
3159 | callee-allocated save area |
3160 | for register varargs |
3162 +-------------------------------+
3163 | local variables | <-- frame_pointer_rtx
3165 +-------------------------------+
3166 | padding0 | \
3167 +-------------------------------+ |
3168 | callee-saved registers | | frame.saved_regs_size
3169 +-------------------------------+ |
3170 | LR' | |
3171 +-------------------------------+ |
3172 | FP' | / <- hard_frame_pointer_rtx (aligned)
3173 +-------------------------------+
3174 | dynamic allocation |
3175 +-------------------------------+
3176 | padding |
3177 +-------------------------------+
3178 | outgoing stack arguments | <-- arg_pointer
3180 +-------------------------------+
3181 | | <-- stack_pointer_rtx (aligned)
3183 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3184 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3185 unchanged. */
3187 /* Generate the prologue instructions for entry into a function.
3188 Establish the stack frame by decreasing the stack pointer with a
3189 properly calculated size and, if necessary, create a frame record
3190 filled with the values of LR and previous frame pointer. The
3191 current FP is also set up if it is in use. */
3193 void
3194 aarch64_expand_prologue (void)
3196 aarch64_layout_frame ();
3198 HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3199 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3200 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3201 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3202 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3203 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3204 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3205 rtx_insn *insn;
3207 if (flag_stack_usage_info)
3208 current_function_static_stack_size = frame_size;
3210 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3212 if (crtl->is_leaf && !cfun->calls_alloca)
3214 if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3215 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3216 frame_size - STACK_CHECK_PROTECT);
3218 else if (frame_size > 0)
3219 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3222 aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, -initial_adjust, true);
3224 if (callee_adjust != 0)
3225 aarch64_push_regs (reg1, reg2, callee_adjust);
3227 if (frame_pointer_needed)
3229 if (callee_adjust == 0)
3230 aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3231 R30_REGNUM, false);
3232 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3233 stack_pointer_rtx,
3234 GEN_INT (callee_offset)));
3235 RTX_FRAME_RELATED_P (insn) = 1;
3236 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3239 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3240 callee_adjust != 0 || frame_pointer_needed);
3241 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3242 callee_adjust != 0 || frame_pointer_needed);
3243 aarch64_add_constant (Pmode, SP_REGNUM, IP1_REGNUM, -final_adjust,
3244 !frame_pointer_needed);
3247 /* Return TRUE if we can use a simple_return insn.
3249 This function checks whether the callee saved stack is empty, which
3250    means no restore actions are needed.  The pro_and_epilogue pass uses
3251    this to check whether the shrink-wrapping optimization is feasible.  */
3253 bool
3254 aarch64_use_return_insn_p (void)
3256 if (!reload_completed)
3257 return false;
3259 if (crtl->profile)
3260 return false;
3262 aarch64_layout_frame ();
3264 return cfun->machine->frame.frame_size == 0;
3267 /* Generate the epilogue instructions for returning from a function.
3268 This is almost exactly the reverse of the prolog sequence, except
3269 that we need to insert barriers to avoid scheduling loads that read
3270 from a deallocated stack, and we optimize the unwind records by
3271 emitting them all together if possible. */
3272 void
3273 aarch64_expand_epilogue (bool for_sibcall)
3275 aarch64_layout_frame ();
3277 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3278 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3279 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3280 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3281 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3282 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3283 rtx cfi_ops = NULL;
3284 rtx_insn *insn;
3286   /* We need to add a memory barrier to prevent reads from the deallocated stack.  */
3287 bool need_barrier_p = (get_frame_size ()
3288 + cfun->machine->frame.saved_varargs_size) != 0;
3290 /* Emit a barrier to prevent loads from a deallocated stack. */
3291 if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca)
3293 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3294 need_barrier_p = false;
3297 /* Restore the stack pointer from the frame pointer if it may not
3298 be the same as the stack pointer. */
3299 if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3301 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3302 hard_frame_pointer_rtx,
3303 GEN_INT (-callee_offset)));
3304 /* If writeback is used when restoring callee-saves, the CFA
3305 is restored on the instruction doing the writeback. */
3306 RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3308 else
3309 aarch64_add_constant (Pmode, SP_REGNUM, IP1_REGNUM, final_adjust, true);
3311 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3312 callee_adjust != 0, &cfi_ops);
3313 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3314 callee_adjust != 0, &cfi_ops);
3316 if (need_barrier_p)
3317 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3319 if (callee_adjust != 0)
3320 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3322 if (callee_adjust != 0 || initial_adjust > 65536)
3324 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3325 insn = get_last_insn ();
3326 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3327 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3328 RTX_FRAME_RELATED_P (insn) = 1;
3329 cfi_ops = NULL;
3332 aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, initial_adjust, true);
3334 if (cfi_ops)
3336 /* Emit delayed restores and reset the CFA to be SP. */
3337 insn = get_last_insn ();
3338 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3339 REG_NOTES (insn) = cfi_ops;
3340 RTX_FRAME_RELATED_P (insn) = 1;
3343 /* Stack adjustment for exception handler. */
3344 if (crtl->calls_eh_return)
3346 /* We need to unwind the stack by the offset computed by
3347 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3348 to be SP; letting the CFA move during this adjustment
3349 is just as correct as retaining the CFA from the body
3350 of the function. Therefore, do nothing special. */
3351 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3354 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3355 if (!for_sibcall)
3356 emit_jump_insn (ret_rtx);
3359 /* Return the place to copy the exception unwinding return address to.
3360    This will probably be a stack slot, but could (in theory) be the
3361    return register.  */
3363 aarch64_final_eh_return_addr (void)
3365 HOST_WIDE_INT fp_offset;
3367 aarch64_layout_frame ();
3369 fp_offset = cfun->machine->frame.frame_size
3370 - cfun->machine->frame.hard_fp_offset;
3372 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
3373 return gen_rtx_REG (DImode, LR_REGNUM);
3375 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
3376 result in a store to save LR introduced by builtin_eh_return () being
3377 incorrectly deleted because the alias is not detected.
3378 So in the calculation of the address to copy the exception unwinding
3379 return address to, we note 2 cases.
3380 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
3381 we return a SP-relative location since all the addresses are SP-relative
3382 in this case. This prevents the store from being optimized away.
3383 If the fp_offset is not 0, then the addresses will be FP-relative and
3384 therefore we return a FP-relative location. */
3386 if (frame_pointer_needed)
3388 if (fp_offset)
3389 return gen_frame_mem (DImode,
3390 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3391 else
3392 return gen_frame_mem (DImode,
3393 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
3396 /* If FP is not needed, we calculate the location of LR, which would be
3397 at the top of the saved registers block. */
3399 return gen_frame_mem (DImode,
3400 plus_constant (Pmode,
3401 stack_pointer_rtx,
3402 fp_offset
3403 + cfun->machine->frame.saved_regs_size
3404 - 2 * UNITS_PER_WORD));
3407 /* Output code to add DELTA to the first argument, and then jump
3408 to FUNCTION. Used for C++ multiple inheritance. */
3409 static void
3410 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3411 HOST_WIDE_INT delta,
3412 HOST_WIDE_INT vcall_offset,
3413 tree function)
3415 /* The this pointer is always in x0. Note that this differs from
3416      Arm where the this pointer may be bumped to r1 if r0 is required
3417 to return a pointer to an aggregate. On AArch64 a result value
3418 pointer will be in x8. */
3419 int this_regno = R0_REGNUM;
3420 rtx this_rtx, temp0, temp1, addr, funexp;
3421 rtx_insn *insn;
3423 reload_completed = 1;
3424 emit_note (NOTE_INSN_PROLOGUE_END);
3426 if (vcall_offset == 0)
3427 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta, false);
3428 else
3430 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3432 this_rtx = gen_rtx_REG (Pmode, this_regno);
3433 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3434 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3436 addr = this_rtx;
3437 if (delta != 0)
3439 if (delta >= -256 && delta < 256)
3440 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3441 plus_constant (Pmode, this_rtx, delta));
3442 else
3443 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta, false);
3446 if (Pmode == ptr_mode)
3447 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3448 else
3449 aarch64_emit_move (temp0,
3450 gen_rtx_ZERO_EXTEND (Pmode,
3451 gen_rtx_MEM (ptr_mode, addr)));
3453 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3454 addr = plus_constant (Pmode, temp0, vcall_offset);
3455 else
3457 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3458 Pmode);
3459 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3462 if (Pmode == ptr_mode)
3463 	aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
3464 else
3465 aarch64_emit_move (temp1,
3466 gen_rtx_SIGN_EXTEND (Pmode,
3467 gen_rtx_MEM (ptr_mode, addr)));
3469 emit_insn (gen_add2_insn (this_rtx, temp1));
3472 /* Generate a tail call to the target function. */
3473 if (!TREE_USED (function))
3475 assemble_external (function);
3476 TREE_USED (function) = 1;
3478 funexp = XEXP (DECL_RTL (function), 0);
3479 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3480 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3481 SIBLING_CALL_P (insn) = 1;
3483 insn = get_insns ();
3484 shorten_branches (insn);
3485 final_start_function (insn, file, 1);
3486 final (insn, file, 1);
3487 final_end_function ();
3489 /* Stop pretending to be a post-reload pass. */
3490 reload_completed = 0;
3493 static bool
3494 aarch64_tls_referenced_p (rtx x)
3496 if (!TARGET_HAVE_TLS)
3497 return false;
3498 subrtx_iterator::array_type array;
3499 FOR_EACH_SUBRTX (iter, array, x, ALL)
3501 const_rtx x = *iter;
3502 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3503 return true;
3504 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3505 TLS offsets, not real symbol references. */
3506 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3507 iter.skip_subrtxes ();
3509 return false;
3513 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3514 a left shift of 0 or 12 bits. */
3515 bool
3516 aarch64_uimm12_shift (HOST_WIDE_INT val)
3518 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3519 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
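/* For example, 0xabc and 0xabc000 are both accepted by the predicate above
   (shift 0 and shift 12 respectively), while 0x1001 is rejected because its
   set bits straddle the two 12-bit fields.  */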
3524 /* Return true if val is an immediate that can be loaded into a
3525 register by a MOVZ instruction. */
3526 static bool
3527 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3529 if (GET_MODE_SIZE (mode) > 4)
3531 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3532 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3533 return 1;
3535 else
3537 /* Ignore sign extension. */
3538 val &= (HOST_WIDE_INT) 0xffffffff;
3540 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3541 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
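/* For example, 0x12340000 is a MOVZ immediate (0x1234 << 16), and for 64-bit
   modes so is 0xabcd00000000 (0xabcd << 32); 0x12345 is not, since its set
   bits do not fit within a single 16-bit field.  */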
3544 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
3546 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
3548 0x0000000100000001ull,
3549 0x0001000100010001ull,
3550 0x0101010101010101ull,
3551 0x1111111111111111ull,
3552 0x5555555555555555ull,
3556 /* Return true if val is a valid bitmask immediate. */
3558 bool
3559 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
3561 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
3562 int bits;
3564 /* Check for a single sequence of one bits and return quickly if so.
3565      The special cases of all ones and all zeroes return false.  */
3566 val = (unsigned HOST_WIDE_INT) val_in;
3567 tmp = val + (val & -val);
3569 if (tmp == (tmp & -tmp))
3570 return (val + 1) > 1;
3572 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
3573 if (mode == SImode)
3574 val = (val << 32) | (val & 0xffffffff);
3576 /* Invert if the immediate doesn't start with a zero bit - this means we
3577 only need to search for sequences of one bits. */
3578 if (val & 1)
3579 val = ~val;
3581 /* Find the first set bit and set tmp to val with the first sequence of one
3582 bits removed. Return success if there is a single sequence of ones. */
3583 first_one = val & -val;
3584 tmp = val & (val + first_one);
3586 if (tmp == 0)
3587 return true;
3589 /* Find the next set bit and compute the difference in bit position. */
3590 next_one = tmp & -tmp;
3591 bits = clz_hwi (first_one) - clz_hwi (next_one);
3592 mask = val ^ tmp;
3594   /* Check that the bit position difference is a power of 2, and that the first
3595 sequence of one bits fits within 'bits' bits. */
3596 if ((mask >> bits) != 0 || bits != (bits & -bits))
3597 return false;
3599 /* Check the sequence of one bits is repeated 64/bits times. */
3600 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
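/* Worked examples for the predicate above: a single run of ones such as
   0x0000000000fff000 is accepted by the quick check at the top, while 0 and
   ~0 are rejected there.  A repeating pattern such as 0xff00ff00ff00ff00 is
   accepted by the general path: consecutive runs start 16 bits apart, the
   extracted run (mask == 0xff00) fits in 16 bits, and
   mask * bitmask_imm_mul[__builtin_clz (16) - 26], i.e.
   0xff00 * 0x0001000100010001ull, reproduces the value.  By contrast,
   0x00ff00ff00000000 fails the final comparison because the pattern does not
   repeat across all 64 bits.  */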
3604 /* Return true if val is an immediate that can be loaded into a
3605 register in a single instruction. */
3606 bool
3607 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3609 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3610 return 1;
3611 return aarch64_bitmask_imm (val, mode);
3614 static bool
3615 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3617 rtx base, offset;
3619 if (GET_CODE (x) == HIGH)
3620 return true;
3622 split_const (x, &base, &offset);
3623 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3625 if (aarch64_classify_symbol (base, offset)
3626 != SYMBOL_FORCE_TO_MEM)
3627 return true;
3628 else
3629 /* Avoid generating a 64-bit relocation in ILP32; leave
3630 to aarch64_expand_mov_immediate to handle it properly. */
3631 return mode != ptr_mode;
3634 return aarch64_tls_referenced_p (x);
3637 /* Implement TARGET_CASE_VALUES_THRESHOLD.
3638 The expansion for a table switch is quite expensive due to the number
3639    of instructions, the table lookup and the hard-to-predict indirect jump.
3640 When optimizing for speed, and -O3 enabled, use the per-core tuning if
3641 set, otherwise use tables for > 16 cases as a tradeoff between size and
3642 performance. When optimizing for size, use the default setting. */
3644 static unsigned int
3645 aarch64_case_values_threshold (void)
3647 /* Use the specified limit for the number of cases before using jump
3648 tables at higher optimization levels. */
3649 if (optimize > 2
3650 && selected_cpu->tune->max_case_values != 0)
3651 return selected_cpu->tune->max_case_values;
3652 else
3653 return optimize_size ? default_case_values_threshold () : 17;
3656 /* Return true if register REGNO is a valid index register.
3657 STRICT_P is true if REG_OK_STRICT is in effect. */
3659 bool
3660 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3662 if (!HARD_REGISTER_NUM_P (regno))
3664 if (!strict_p)
3665 return true;
3667 if (!reg_renumber)
3668 return false;
3670 regno = reg_renumber[regno];
3672 return GP_REGNUM_P (regno);
3675 /* Return true if register REGNO is a valid base register.
3676 STRICT_P is true if REG_OK_STRICT is in effect. */
3678 bool
3679 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3681 if (!HARD_REGISTER_NUM_P (regno))
3683 if (!strict_p)
3684 return true;
3686 if (!reg_renumber)
3687 return false;
3689 regno = reg_renumber[regno];
3692 /* The fake registers will be eliminated to either the stack or
3693 hard frame pointer, both of which are usually valid base registers.
3694 Reload deals with the cases where the eliminated form isn't valid. */
3695 return (GP_REGNUM_P (regno)
3696 || regno == SP_REGNUM
3697 || regno == FRAME_POINTER_REGNUM
3698 || regno == ARG_POINTER_REGNUM);
3701 /* Return true if X is a valid base register.
3702 STRICT_P is true if REG_OK_STRICT is in effect. */
3704 static bool
3705 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3707 if (!strict_p && GET_CODE (x) == SUBREG)
3708 x = SUBREG_REG (x);
3710 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3713 /* Return true if address offset is a valid index. If it is, fill in INFO
3714 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3716 static bool
3717 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3718 machine_mode mode, bool strict_p)
3720 enum aarch64_address_type type;
3721 rtx index;
3722 int shift;
3724 /* (reg:P) */
3725 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3726 && GET_MODE (x) == Pmode)
3728 type = ADDRESS_REG_REG;
3729 index = x;
3730 shift = 0;
3732 /* (sign_extend:DI (reg:SI)) */
3733 else if ((GET_CODE (x) == SIGN_EXTEND
3734 || GET_CODE (x) == ZERO_EXTEND)
3735 && GET_MODE (x) == DImode
3736 && GET_MODE (XEXP (x, 0)) == SImode)
3738 type = (GET_CODE (x) == SIGN_EXTEND)
3739 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3740 index = XEXP (x, 0);
3741 shift = 0;
3743 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3744 else if (GET_CODE (x) == MULT
3745 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3746 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3747 && GET_MODE (XEXP (x, 0)) == DImode
3748 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3749 && CONST_INT_P (XEXP (x, 1)))
3751 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3752 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3753 index = XEXP (XEXP (x, 0), 0);
3754 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3756 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3757 else if (GET_CODE (x) == ASHIFT
3758 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3759 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3760 && GET_MODE (XEXP (x, 0)) == DImode
3761 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3762 && CONST_INT_P (XEXP (x, 1)))
3764 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3765 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3766 index = XEXP (XEXP (x, 0), 0);
3767 shift = INTVAL (XEXP (x, 1));
3769 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3770 else if ((GET_CODE (x) == SIGN_EXTRACT
3771 || GET_CODE (x) == ZERO_EXTRACT)
3772 && GET_MODE (x) == DImode
3773 && GET_CODE (XEXP (x, 0)) == MULT
3774 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3775 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3777 type = (GET_CODE (x) == SIGN_EXTRACT)
3778 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3779 index = XEXP (XEXP (x, 0), 0);
3780 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3781 if (INTVAL (XEXP (x, 1)) != 32 + shift
3782 || INTVAL (XEXP (x, 2)) != 0)
3783 shift = -1;
3785 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3786 (const_int 0xffffffff<<shift)) */
3787 else if (GET_CODE (x) == AND
3788 && GET_MODE (x) == DImode
3789 && GET_CODE (XEXP (x, 0)) == MULT
3790 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3791 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3792 && CONST_INT_P (XEXP (x, 1)))
3794 type = ADDRESS_REG_UXTW;
3795 index = XEXP (XEXP (x, 0), 0);
3796 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3797 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3798 shift = -1;
3800 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3801 else if ((GET_CODE (x) == SIGN_EXTRACT
3802 || GET_CODE (x) == ZERO_EXTRACT)
3803 && GET_MODE (x) == DImode
3804 && GET_CODE (XEXP (x, 0)) == ASHIFT
3805 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3806 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3808 type = (GET_CODE (x) == SIGN_EXTRACT)
3809 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3810 index = XEXP (XEXP (x, 0), 0);
3811 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3812 if (INTVAL (XEXP (x, 1)) != 32 + shift
3813 || INTVAL (XEXP (x, 2)) != 0)
3814 shift = -1;
3816 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3817 (const_int 0xffffffff<<shift)) */
3818 else if (GET_CODE (x) == AND
3819 && GET_MODE (x) == DImode
3820 && GET_CODE (XEXP (x, 0)) == ASHIFT
3821 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3822 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3823 && CONST_INT_P (XEXP (x, 1)))
3825 type = ADDRESS_REG_UXTW;
3826 index = XEXP (XEXP (x, 0), 0);
3827 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3828 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3829 shift = -1;
3831 /* (mult:P (reg:P) (const_int scale)) */
3832 else if (GET_CODE (x) == MULT
3833 && GET_MODE (x) == Pmode
3834 && GET_MODE (XEXP (x, 0)) == Pmode
3835 && CONST_INT_P (XEXP (x, 1)))
3837 type = ADDRESS_REG_REG;
3838 index = XEXP (x, 0);
3839 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3841 /* (ashift:P (reg:P) (const_int shift)) */
3842 else if (GET_CODE (x) == ASHIFT
3843 && GET_MODE (x) == Pmode
3844 && GET_MODE (XEXP (x, 0)) == Pmode
3845 && CONST_INT_P (XEXP (x, 1)))
3847 type = ADDRESS_REG_REG;
3848 index = XEXP (x, 0);
3849 shift = INTVAL (XEXP (x, 1));
3851 else
3852 return false;
3854 if (GET_CODE (index) == SUBREG)
3855 index = SUBREG_REG (index);
3857 if ((shift == 0 ||
3858 (shift > 0 && shift <= 3
3859 && (1 << shift) == GET_MODE_SIZE (mode)))
3860 && REG_P (index)
3861 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3863 info->type = type;
3864 info->offset = index;
3865 info->shift = shift;
3866 return true;
3869 return false;
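/* The extended-index forms recognised above correspond to addressing modes
   such as

       ldr     w0, [x1, w2, sxtw #2]    // (mult (sign_extend reg) 4), SImode
       ldr     x0, [x1, x2, lsl #3]     // (ashift reg 3), DImode

   where the shift amount must match the access size, as checked at the end
   of the function.  */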
3872 bool
3873 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3875 return (offset >= -64 * GET_MODE_SIZE (mode)
3876 && offset < 64 * GET_MODE_SIZE (mode)
3877 && offset % GET_MODE_SIZE (mode) == 0);
3880 static inline bool
3881 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3882 HOST_WIDE_INT offset)
3884 return offset >= -256 && offset < 256;
3887 static inline bool
3888 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3890 return (offset >= 0
3891 && offset < 4096 * GET_MODE_SIZE (mode)
3892 && offset % GET_MODE_SIZE (mode) == 0);
3895 /* Return true if MODE is one of the modes for which we
3896 support LDP/STP operations. */
3898 static bool
3899 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
3901 return mode == SImode || mode == DImode
3902 || mode == SFmode || mode == DFmode
3903 || (aarch64_vector_mode_supported_p (mode)
3904 && GET_MODE_SIZE (mode) == 8);
3907 /* Return true if REGNO is a virtual pointer register, or an eliminable
3908 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
3909 include stack_pointer or hard_frame_pointer. */
3910 static bool
3911 virt_or_elim_regno_p (unsigned regno)
3913 return ((regno >= FIRST_VIRTUAL_REGISTER
3914 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
3915 || regno == FRAME_POINTER_REGNUM
3916 || regno == ARG_POINTER_REGNUM);
3919 /* Return true if X is a valid address for machine mode MODE. If it is,
3920 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3921 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3923 static bool
3924 aarch64_classify_address (struct aarch64_address_info *info,
3925 rtx x, machine_mode mode,
3926 RTX_CODE outer_code, bool strict_p)
3928 enum rtx_code code = GET_CODE (x);
3929 rtx op0, op1;
3931 /* On BE, we use load/store pair for all large int mode load/stores. */
3932 bool load_store_pair_p = (outer_code == PARALLEL
3933 || (BYTES_BIG_ENDIAN
3934 && aarch64_vect_struct_mode_p (mode)));
3936 bool allow_reg_index_p =
3937 !load_store_pair_p
3938 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3939 && !aarch64_vect_struct_mode_p (mode);
3941 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3942 REG addressing. */
3943 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3944 && (code != POST_INC && code != REG))
3945 return false;
3947 switch (code)
3949 case REG:
3950 case SUBREG:
3951 info->type = ADDRESS_REG_IMM;
3952 info->base = x;
3953 info->offset = const0_rtx;
3954 return aarch64_base_register_rtx_p (x, strict_p);
3956 case PLUS:
3957 op0 = XEXP (x, 0);
3958 op1 = XEXP (x, 1);
3960 if (! strict_p
3961 && REG_P (op0)
3962 && virt_or_elim_regno_p (REGNO (op0))
3963 && CONST_INT_P (op1))
3965 info->type = ADDRESS_REG_IMM;
3966 info->base = op0;
3967 info->offset = op1;
3969 return true;
3972 if (GET_MODE_SIZE (mode) != 0
3973 && CONST_INT_P (op1)
3974 && aarch64_base_register_rtx_p (op0, strict_p))
3976 HOST_WIDE_INT offset = INTVAL (op1);
3978 info->type = ADDRESS_REG_IMM;
3979 info->base = op0;
3980 info->offset = op1;
3982 /* TImode and TFmode values are allowed in both pairs of X
3983 registers and individual Q registers. The available
3984 address modes are:
3985 X,X: 7-bit signed scaled offset
3986 Q: 9-bit signed offset
3987 We conservatively require an offset representable in either mode.
3988 When performing the check for pairs of X registers, i.e. LDP/STP,
3989 pass down DImode since that is the natural size of the LDP/STP
3990 instruction memory accesses. */
3991 if (mode == TImode || mode == TFmode)
3992 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
3993 && offset_9bit_signed_unscaled_p (mode, offset));
3995 /* A 7-bit offset check because OImode will emit an ldp/stp
3996 instruction (only big endian will get here).
3997 For ldp/stp instructions, the offset is scaled for the size of a
3998 single element of the pair. */
3999 if (mode == OImode)
4000 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4002 /* Three 9/12-bit offset checks because CImode will emit three
4003 ldr/str instructions (only big endian will get here). */
4004 if (mode == CImode)
4005 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4006 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4007 || offset_12bit_unsigned_scaled_p (V16QImode,
4008 offset + 32)));
4010 /* Two 7-bit offset checks because XImode will emit two ldp/stp
4011 instructions (only big endian will get here). */
4012 if (mode == XImode)
4013 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4014 && aarch64_offset_7bit_signed_scaled_p (TImode,
4015 offset + 32));
4017 if (load_store_pair_p)
4018 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4019 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4020 else
4021 return (offset_9bit_signed_unscaled_p (mode, offset)
4022 || offset_12bit_unsigned_scaled_p (mode, offset));
4025 if (allow_reg_index_p)
4027 /* Look for base + (scaled/extended) index register. */
4028 if (aarch64_base_register_rtx_p (op0, strict_p)
4029 && aarch64_classify_index (info, op1, mode, strict_p))
4031 info->base = op0;
4032 return true;
4034 if (aarch64_base_register_rtx_p (op1, strict_p)
4035 && aarch64_classify_index (info, op0, mode, strict_p))
4037 info->base = op1;
4038 return true;
4042 return false;
4044 case POST_INC:
4045 case POST_DEC:
4046 case PRE_INC:
4047 case PRE_DEC:
4048 info->type = ADDRESS_REG_WB;
4049 info->base = XEXP (x, 0);
4050 info->offset = NULL_RTX;
4051 return aarch64_base_register_rtx_p (info->base, strict_p);
4053 case POST_MODIFY:
4054 case PRE_MODIFY:
4055 info->type = ADDRESS_REG_WB;
4056 info->base = XEXP (x, 0);
4057 if (GET_CODE (XEXP (x, 1)) == PLUS
4058 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4059 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4060 && aarch64_base_register_rtx_p (info->base, strict_p))
4062 HOST_WIDE_INT offset;
4063 info->offset = XEXP (XEXP (x, 1), 1);
4064 offset = INTVAL (info->offset);
4066 /* TImode and TFmode values are allowed in both pairs of X
4067 registers and individual Q registers. The available
4068 address modes are:
4069 X,X: 7-bit signed scaled offset
4070 Q: 9-bit signed offset
4071 We conservatively require an offset representable in either mode.  */
4073 if (mode == TImode || mode == TFmode)
4074 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4075 && offset_9bit_signed_unscaled_p (mode, offset));
4077 if (load_store_pair_p)
4078 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4079 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4080 else
4081 return offset_9bit_signed_unscaled_p (mode, offset);
4083 return false;
4085 case CONST:
4086 case SYMBOL_REF:
4087 case LABEL_REF:
4088 /* load literal: pc-relative constant pool entry. Only supported
4089 for SI mode or larger. */
4090 info->type = ADDRESS_SYMBOLIC;
4092 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4094 rtx sym, addend;
4096 split_const (x, &sym, &addend);
4097 return ((GET_CODE (sym) == LABEL_REF
4098 || (GET_CODE (sym) == SYMBOL_REF
4099 && CONSTANT_POOL_ADDRESS_P (sym)
4100 && aarch64_pcrelative_literal_loads)));
4102 return false;
4104 case LO_SUM:
4105 info->type = ADDRESS_LO_SUM;
4106 info->base = XEXP (x, 0);
4107 info->offset = XEXP (x, 1);
4108 if (allow_reg_index_p
4109 && aarch64_base_register_rtx_p (info->base, strict_p))
4111 rtx sym, offs;
4112 split_const (info->offset, &sym, &offs);
4113 if (GET_CODE (sym) == SYMBOL_REF
4114 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4116 /* The symbol and offset must be aligned to the access size. */
4117 unsigned int align;
4118 unsigned int ref_size;
4120 if (CONSTANT_POOL_ADDRESS_P (sym))
4121 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4122 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4124 tree exp = SYMBOL_REF_DECL (sym);
4125 align = TYPE_ALIGN (TREE_TYPE (exp));
4126 align = CONSTANT_ALIGNMENT (exp, align);
4128 else if (SYMBOL_REF_DECL (sym))
4129 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4130 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4131 && SYMBOL_REF_BLOCK (sym) != NULL)
4132 align = SYMBOL_REF_BLOCK (sym)->alignment;
4133 else
4134 align = BITS_PER_UNIT;
4136 ref_size = GET_MODE_SIZE (mode);
4137 if (ref_size == 0)
4138 ref_size = GET_MODE_SIZE (DImode);
4140 return ((INTVAL (offs) & (ref_size - 1)) == 0
4141 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4144 return false;
4146 default:
4147 return false;
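/* Return true if X is a symbolic address: a SYMBOL_REF or LABEL_REF,
   possibly with a constant offset.  */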
4151 bool
4152 aarch64_symbolic_address_p (rtx x)
4154 rtx offset;
4156 split_const (x, &x, &offset);
4157 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4160 /* Classify the base of symbolic expression X. */
4162 enum aarch64_symbol_type
4163 aarch64_classify_symbolic_expression (rtx x)
4165 rtx offset;
4167 split_const (x, &x, &offset);
4168 return aarch64_classify_symbol (x, offset);
4172 /* Return TRUE if X is a legitimate address for accessing memory in
4173 mode MODE. */
4174 static bool
4175 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4177 struct aarch64_address_info addr;
4179 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4182 /* Return TRUE if X is a legitimate address for accessing memory in
4183 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4184 pair operation. */
4185 bool
4186 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4187 RTX_CODE outer_code, bool strict_p)
4189 struct aarch64_address_info addr;
4191 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4194 /* Return TRUE if rtx X is immediate constant 0.0 */
4195 bool
4196 aarch64_float_const_zero_rtx_p (rtx x)
4198 if (GET_MODE (x) == VOIDmode)
4199 return false;
4201 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4202 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4203 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4206 /* Return the fixed registers used for condition codes. */
4208 static bool
4209 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4211 *p1 = CC_REGNUM;
4212 *p2 = INVALID_REGNUM;
4213 return true;
4216 /* Emit call insn with PAT and do aarch64-specific handling. */
4218 void
4219 aarch64_emit_call_insn (rtx pat)
4221 rtx insn = emit_call_insn (pat);
4223 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4224 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4225 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
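/* Return the mode for the condition-code register used to compare X
   with Y using comparison code CODE.  */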
4228 machine_mode
4229 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4231 /* All floating point compares return CCFP if it is an equality
4232 comparison, and CCFPE otherwise. */
4233 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4235 switch (code)
4237 case EQ:
4238 case NE:
4239 case UNORDERED:
4240 case ORDERED:
4241 case UNLT:
4242 case UNLE:
4243 case UNGT:
4244 case UNGE:
4245 case UNEQ:
4246 case LTGT:
4247 return CCFPmode;
4249 case LT:
4250 case LE:
4251 case GT:
4252 case GE:
4253 return CCFPEmode;
4255 default:
4256 gcc_unreachable ();
4260 /* Equality comparisons of short modes against zero can be performed
4261 using the TST instruction with the appropriate bitmask. */
4262 if (y == const0_rtx && REG_P (x)
4263 && (code == EQ || code == NE)
4264 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4265 return CC_NZmode;
4267 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4268 && y == const0_rtx
4269 && (code == EQ || code == NE || code == LT || code == GE)
4270 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4271 || GET_CODE (x) == NEG
4272 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4273 && CONST_INT_P (XEXP (x, 2)))))
4274 return CC_NZmode;
4276 /* A compare with a shifted operand. Because of canonicalization,
4277 the comparison will have to be swapped when we emit the assembly
4278 code. */
4279 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4280 && (REG_P (y) || GET_CODE (y) == SUBREG)
4281 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4282 || GET_CODE (x) == LSHIFTRT
4283 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4284 return CC_SWPmode;
4286 /* Similarly for a negated operand, but we can only do this for
4287 equalities. */
4288 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4289 && (REG_P (y) || GET_CODE (y) == SUBREG)
4290 && (code == EQ || code == NE)
4291 && GET_CODE (x) == NEG)
4292 return CC_Zmode;
4294 /* A test for unsigned overflow. */
4295 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4296 && code == NE
4297 && GET_CODE (x) == PLUS
4298 && GET_CODE (y) == ZERO_EXTEND)
4299 return CC_Cmode;
4301 /* For everything else, return CCmode. */
4302 return CCmode;
4305 static int
4306 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
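/* Return the AArch64 condition code (AARCH64_EQ etc.) that corresponds to
   comparison X, or -1 if no single condition code can represent it.  */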
4309 aarch64_get_condition_code (rtx x)
4311 machine_mode mode = GET_MODE (XEXP (x, 0));
4312 enum rtx_code comp_code = GET_CODE (x);
4314 if (GET_MODE_CLASS (mode) != MODE_CC)
4315 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
4316 return aarch64_get_condition_code_1 (mode, comp_code);
4319 static int
4320 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
4322 switch (mode)
4324 case CCFPmode:
4325 case CCFPEmode:
4326 switch (comp_code)
4328 case GE: return AARCH64_GE;
4329 case GT: return AARCH64_GT;
4330 case LE: return AARCH64_LS;
4331 case LT: return AARCH64_MI;
4332 case NE: return AARCH64_NE;
4333 case EQ: return AARCH64_EQ;
4334 case ORDERED: return AARCH64_VC;
4335 case UNORDERED: return AARCH64_VS;
4336 case UNLT: return AARCH64_LT;
4337 case UNLE: return AARCH64_LE;
4338 case UNGT: return AARCH64_HI;
4339 case UNGE: return AARCH64_PL;
4340 default: return -1;
4342 break;
4344 case CCmode:
4345 switch (comp_code)
4347 case NE: return AARCH64_NE;
4348 case EQ: return AARCH64_EQ;
4349 case GE: return AARCH64_GE;
4350 case GT: return AARCH64_GT;
4351 case LE: return AARCH64_LE;
4352 case LT: return AARCH64_LT;
4353 case GEU: return AARCH64_CS;
4354 case GTU: return AARCH64_HI;
4355 case LEU: return AARCH64_LS;
4356 case LTU: return AARCH64_CC;
4357 default: return -1;
4359 break;
4361 case CC_SWPmode:
4362 switch (comp_code)
4364 case NE: return AARCH64_NE;
4365 case EQ: return AARCH64_EQ;
4366 case GE: return AARCH64_LE;
4367 case GT: return AARCH64_LT;
4368 case LE: return AARCH64_GE;
4369 case LT: return AARCH64_GT;
4370 case GEU: return AARCH64_LS;
4371 case GTU: return AARCH64_CC;
4372 case LEU: return AARCH64_CS;
4373 case LTU: return AARCH64_HI;
4374 default: return -1;
4376 break;
4378 case CC_NZmode:
4379 switch (comp_code)
4381 case NE: return AARCH64_NE;
4382 case EQ: return AARCH64_EQ;
4383 case GE: return AARCH64_PL;
4384 case LT: return AARCH64_MI;
4385 default: return -1;
4387 break;
4389 case CC_Zmode:
4390 switch (comp_code)
4392 case NE: return AARCH64_NE;
4393 case EQ: return AARCH64_EQ;
4394 default: return -1;
4396 break;
4398 case CC_Cmode:
4399 switch (comp_code)
4401 case NE: return AARCH64_CS;
4402 case EQ: return AARCH64_CC;
4403 default: return -1;
4405 break;
4407 default:
4408 return -1;
4409 break;
4412 return -1;
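/* Return true if X is a CONST_VECTOR of integers whose elements are all
   identical and lie within [MINVAL, MAXVAL].  */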
4415 bool
4416 aarch64_const_vec_all_same_in_range_p (rtx x,
4417 HOST_WIDE_INT minval,
4418 HOST_WIDE_INT maxval)
4420 HOST_WIDE_INT firstval;
4421 int count, i;
4423 if (GET_CODE (x) != CONST_VECTOR
4424 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
4425 return false;
4427 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
4428 if (firstval < minval || firstval > maxval)
4429 return false;
4431 count = CONST_VECTOR_NUNITS (x);
4432 for (i = 1; i < count; i++)
4433 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
4434 return false;
4436 return true;
4439 bool
4440 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
4442 return aarch64_const_vec_all_same_in_range_p (x, val, val);
4446 /* N Z C V. */
4447 #define AARCH64_CC_V 1
4448 #define AARCH64_CC_C (1 << 1)
4449 #define AARCH64_CC_Z (1 << 2)
4450 #define AARCH64_CC_N (1 << 3)
4452 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
4453 static const int aarch64_nzcv_codes[] =
4455 0, /* EQ, Z == 1. */
4456 AARCH64_CC_Z, /* NE, Z == 0. */
4457 0, /* CS, C == 1. */
4458 AARCH64_CC_C, /* CC, C == 0. */
4459 0, /* MI, N == 1. */
4460 AARCH64_CC_N, /* PL, N == 0. */
4461 0, /* VS, V == 1. */
4462 AARCH64_CC_V, /* VC, V == 0. */
4463 0, /* HI, C == 1 && Z == 0. */
4464 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
4465 AARCH64_CC_V, /* GE, N == V. */
4466 0, /* LT, N != V. */
4467 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
4468 0, /* LE, !(Z == 0 && N == V). */
4469 0, /* AL, Any. */
4470 0 /* NV, Any. */
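/* Print operand X to file F, applying the aarch64-specific operand
   modifier CODE (see the cases below for the supported modifiers).  */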
4473 static void
4474 aarch64_print_operand (FILE *f, rtx x, int code)
4476 switch (code)
4478 /* An integer or symbol address without a preceding # sign. */
4479 case 'c':
4480 switch (GET_CODE (x))
4482 case CONST_INT:
4483 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4484 break;
4486 case SYMBOL_REF:
4487 output_addr_const (f, x);
4488 break;
4490 case CONST:
4491 if (GET_CODE (XEXP (x, 0)) == PLUS
4492 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4494 output_addr_const (f, x);
4495 break;
4497 /* Fall through. */
4499 default:
4500 output_operand_lossage ("Unsupported operand for code '%c'", code);
4502 break;
4504 case 'e':
4505 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4507 int n;
4509 if (!CONST_INT_P (x)
4510 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4512 output_operand_lossage ("invalid operand for '%%%c'", code);
4513 return;
4516 switch (n)
4518 case 3:
4519 fputc ('b', f);
4520 break;
4521 case 4:
4522 fputc ('h', f);
4523 break;
4524 case 5:
4525 fputc ('w', f);
4526 break;
4527 default:
4528 output_operand_lossage ("invalid operand for '%%%c'", code);
4529 return;
4532 break;
4534 case 'p':
4536 int n;
4538 /* Print N such that 2^N == X. */
4539 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4541 output_operand_lossage ("invalid operand for '%%%c'", code);
4542 return;
4545 asm_fprintf (f, "%d", n);
4547 break;
4549 case 'P':
4550 /* Print the number of non-zero bits in X (a const_int). */
4551 if (!CONST_INT_P (x))
4553 output_operand_lossage ("invalid operand for '%%%c'", code);
4554 return;
4557 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
4558 break;
4560 case 'H':
4561 /* Print the higher numbered register of a pair (TImode) of regs. */
4562 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4564 output_operand_lossage ("invalid operand for '%%%c'", code);
4565 return;
4568 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4569 break;
4571 case 'M':
4572 case 'm':
4574 int cond_code;
4575 /* Print a condition (eq, ne, etc) or its inverse. */
4577 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
4578 if (x == const_true_rtx)
4580 if (code == 'M')
4581 fputs ("nv", f);
4582 return;
4585 if (!COMPARISON_P (x))
4587 output_operand_lossage ("invalid operand for '%%%c'", code);
4588 return;
4591 cond_code = aarch64_get_condition_code (x);
4592 gcc_assert (cond_code >= 0);
4593 if (code == 'M')
4594 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
4595 fputs (aarch64_condition_codes[cond_code], f);
4597 break;
4599 case 'b':
4600 case 'h':
4601 case 's':
4602 case 'd':
4603 case 'q':
4604 /* Print a scalar FP/SIMD register name. */
4605 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4607 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4608 return;
4610 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4611 break;
4613 case 'S':
4614 case 'T':
4615 case 'U':
4616 case 'V':
4617 /* Print the first FP/SIMD register name in a list. */
4618 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4620 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4621 return;
4623 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4624 break;
4626 case 'R':
4627 /* Print a scalar FP/SIMD register name + 1. */
4628 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4630 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4631 return;
4633 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4634 break;
4636 case 'X':
4637 /* Print bottom 16 bits of integer constant in hex. */
4638 if (!CONST_INT_P (x))
4640 output_operand_lossage ("invalid operand for '%%%c'", code);
4641 return;
4643 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4644 break;
4646 case 'w':
4647 case 'x':
4648 /* Print a general register name or the zero register (32-bit or
4649 64-bit). */
4650 if (x == const0_rtx
4651 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4653 asm_fprintf (f, "%czr", code);
4654 break;
4657 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4659 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4660 break;
4663 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4665 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4666 break;
4669 /* Fall through */
4671 case 0:
4672 /* Print a normal operand.  If it's a general register, then we
4673 assume DImode. */
4674 if (x == NULL)
4676 output_operand_lossage ("missing operand");
4677 return;
4680 switch (GET_CODE (x))
4682 case REG:
4683 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4684 break;
4686 case MEM:
4687 output_address (GET_MODE (x), XEXP (x, 0));
4688 break;
4690 case CONST:
4691 case LABEL_REF:
4692 case SYMBOL_REF:
4693 output_addr_const (asm_out_file, x);
4694 break;
4696 case CONST_INT:
4697 asm_fprintf (f, "%wd", INTVAL (x));
4698 break;
4700 case CONST_VECTOR:
4701 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4703 gcc_assert (
4704 aarch64_const_vec_all_same_in_range_p (x,
4705 HOST_WIDE_INT_MIN,
4706 HOST_WIDE_INT_MAX));
4707 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4709 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4711 fputc ('0', f);
4713 else
4714 gcc_unreachable ();
4715 break;
4717 case CONST_DOUBLE:
4718 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
4719 be getting CONST_DOUBLEs holding integers. */
4720 gcc_assert (GET_MODE (x) != VOIDmode);
4721 if (aarch64_float_const_zero_rtx_p (x))
4723 fputc ('0', f);
4724 break;
4726 else if (aarch64_float_const_representable_p (x))
4728 #define buf_size 20
4729 char float_buf[buf_size] = {'\0'};
4730 real_to_decimal_for_mode (float_buf,
4731 CONST_DOUBLE_REAL_VALUE (x),
4732 buf_size, buf_size,
4733 1, GET_MODE (x));
4734 asm_fprintf (asm_out_file, "%s", float_buf);
4735 break;
4736 #undef buf_size
4738 output_operand_lossage ("invalid constant");
4739 return;
4740 default:
4741 output_operand_lossage ("invalid operand");
4742 return;
4744 break;
4746 case 'A':
4747 if (GET_CODE (x) == HIGH)
4748 x = XEXP (x, 0);
4750 switch (aarch64_classify_symbolic_expression (x))
4752 case SYMBOL_SMALL_GOT_4G:
4753 asm_fprintf (asm_out_file, ":got:");
4754 break;
4756 case SYMBOL_SMALL_TLSGD:
4757 asm_fprintf (asm_out_file, ":tlsgd:");
4758 break;
4760 case SYMBOL_SMALL_TLSDESC:
4761 asm_fprintf (asm_out_file, ":tlsdesc:");
4762 break;
4764 case SYMBOL_SMALL_TLSIE:
4765 asm_fprintf (asm_out_file, ":gottprel:");
4766 break;
4768 case SYMBOL_TLSLE24:
4769 asm_fprintf (asm_out_file, ":tprel:");
4770 break;
4772 case SYMBOL_TINY_GOT:
4773 gcc_unreachable ();
4774 break;
4776 default:
4777 break;
4779 output_addr_const (asm_out_file, x);
4780 break;
4782 case 'L':
4783 switch (aarch64_classify_symbolic_expression (x))
4785 case SYMBOL_SMALL_GOT_4G:
4786 asm_fprintf (asm_out_file, ":lo12:");
4787 break;
4789 case SYMBOL_SMALL_TLSGD:
4790 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4791 break;
4793 case SYMBOL_SMALL_TLSDESC:
4794 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4795 break;
4797 case SYMBOL_SMALL_TLSIE:
4798 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4799 break;
4801 case SYMBOL_TLSLE12:
4802 asm_fprintf (asm_out_file, ":tprel_lo12:");
4803 break;
4805 case SYMBOL_TLSLE24:
4806 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4807 break;
4809 case SYMBOL_TINY_GOT:
4810 asm_fprintf (asm_out_file, ":got:");
4811 break;
4813 case SYMBOL_TINY_TLSIE:
4814 asm_fprintf (asm_out_file, ":gottprel:");
4815 break;
4817 default:
4818 break;
4820 output_addr_const (asm_out_file, x);
4821 break;
4823 case 'G':
4825 switch (aarch64_classify_symbolic_expression (x))
4827 case SYMBOL_TLSLE24:
4828 asm_fprintf (asm_out_file, ":tprel_hi12:");
4829 break;
4830 default:
4831 break;
4833 output_addr_const (asm_out_file, x);
4834 break;
4836 case 'k':
4838 HOST_WIDE_INT cond_code;
4839 /* Print nzcv. */
4841 if (!CONST_INT_P (x))
4843 output_operand_lossage ("invalid operand for '%%%c'", code);
4844 return;
4847 cond_code = INTVAL (x);
4848 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
4849 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
4851 break;
4853 default:
4854 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4855 return;
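/* Print to F the address X, which is being used to access an object of
   mode MODE, in the syntax expected by the assembler.  */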
4859 static void
4860 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
4862 struct aarch64_address_info addr;
4864 if (aarch64_classify_address (&addr, x, mode, MEM, true))
4865 switch (addr.type)
4867 case ADDRESS_REG_IMM:
4868 if (addr.offset == const0_rtx)
4869 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4870 else
4871 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4872 INTVAL (addr.offset));
4873 return;
4875 case ADDRESS_REG_REG:
4876 if (addr.shift == 0)
4877 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4878 reg_names [REGNO (addr.offset)]);
4879 else
4880 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4881 reg_names [REGNO (addr.offset)], addr.shift);
4882 return;
4884 case ADDRESS_REG_UXTW:
4885 if (addr.shift == 0)
4886 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4887 REGNO (addr.offset) - R0_REGNUM);
4888 else
4889 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4890 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4891 return;
4893 case ADDRESS_REG_SXTW:
4894 if (addr.shift == 0)
4895 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4896 REGNO (addr.offset) - R0_REGNUM);
4897 else
4898 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4899 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4900 return;
4902 case ADDRESS_REG_WB:
4903 switch (GET_CODE (x))
4905 case PRE_INC:
4906 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4907 GET_MODE_SIZE (mode));
4908 return;
4909 case POST_INC:
4910 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4911 GET_MODE_SIZE (mode));
4912 return;
4913 case PRE_DEC:
4914 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4915 GET_MODE_SIZE (mode));
4916 return;
4917 case POST_DEC:
4918 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4919 GET_MODE_SIZE (mode));
4920 return;
4921 case PRE_MODIFY:
4922 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4923 INTVAL (addr.offset));
4924 return;
4925 case POST_MODIFY:
4926 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4927 INTVAL (addr.offset));
4928 return;
4929 default:
4930 break;
4932 break;
4934 case ADDRESS_LO_SUM:
4935 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4936 output_addr_const (f, addr.offset);
4937 asm_fprintf (f, "]");
4938 return;
4940 case ADDRESS_SYMBOLIC:
4941 break;
4944 output_addr_const (f, x);
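/* Return true if X (or any sub-rtx of X) mentions a label, ignoring the
   LABEL_REFs found inside TLS unspecs.  */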
4947 bool
4948 aarch64_label_mentioned_p (rtx x)
4950 const char *fmt;
4951 int i;
4953 if (GET_CODE (x) == LABEL_REF)
4954 return true;
4956 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4957 referencing instruction, but they are constant offsets, not
4958 symbols. */
4959 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4960 return false;
4962 fmt = GET_RTX_FORMAT (GET_CODE (x));
4963 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4965 if (fmt[i] == 'E')
4967 int j;
4969 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4970 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4971 return 1;
4973 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4974 return 1;
4977 return 0;
4980 /* Implement REGNO_REG_CLASS. */
4982 enum reg_class
4983 aarch64_regno_regclass (unsigned regno)
4985 if (GP_REGNUM_P (regno))
4986 return GENERAL_REGS;
4988 if (regno == SP_REGNUM)
4989 return STACK_REG;
4991 if (regno == FRAME_POINTER_REGNUM
4992 || regno == ARG_POINTER_REGNUM)
4993 return POINTER_REGS;
4995 if (FP_REGNUM_P (regno))
4996 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4998 return NO_REGS;
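/* Attempt to legitimize address X, used to access an object of mode MODE;
   return the (possibly unchanged) replacement address.  */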
5001 static rtx
5002 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
5004 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5005 where mask is selected by alignment and size of the offset.
5006 We try to pick as large a range for the offset as possible to
5007 maximize the chance of a CSE. However, for aligned addresses
5008 we limit the range to 4k so that structures with different sized
5009 elements are likely to use the same base. We need to be careful
5010 not to split a CONST for some forms of address expression, otherwise
5011 it will generate sub-optimal code. */
5013 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5015 rtx base = XEXP (x, 0);
5016 rtx offset_rtx = XEXP (x, 1);
5017 HOST_WIDE_INT offset = INTVAL (offset_rtx);
5019 if (GET_CODE (base) == PLUS)
5021 rtx op0 = XEXP (base, 0);
5022 rtx op1 = XEXP (base, 1);
5024 /* Force any scaling into a temp for CSE. */
5025 op0 = force_reg (Pmode, op0);
5026 op1 = force_reg (Pmode, op1);
5028 /* Let the pointer register be in op0. */
5029 if (REG_POINTER (op1))
5030 std::swap (op0, op1);
5032 /* If the pointer is virtual or frame related, then we know that
5033 virtual register instantiation or register elimination is going
5034 to apply a second constant. We want the two constants folded
5035 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5036 if (virt_or_elim_regno_p (REGNO (op0)))
5038 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5039 NULL_RTX, true, OPTAB_DIRECT);
5040 return gen_rtx_PLUS (Pmode, base, op1);
5043 /* Otherwise, in order to encourage CSE (and thence loop strength
5044 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
5045 base = expand_binop (Pmode, add_optab, op0, op1,
5046 NULL_RTX, true, OPTAB_DIRECT);
5047 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5050 /* Does it look like we'll need a load/store-pair operation? */
5051 HOST_WIDE_INT base_offset;
5052 if (GET_MODE_SIZE (mode) > 16
5053 || mode == TImode)
5054 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
5055 & ~((128 * GET_MODE_SIZE (mode)) - 1));
5056 /* For offsets that aren't a multiple of the access size, the limit is
5057 -256...255. */
5058 else if (offset & (GET_MODE_SIZE (mode) - 1))
5059 base_offset = (offset + 0x100) & ~0x1ff;
5060 else
5061 base_offset = offset & ~0xfff;
5063 if (base_offset != 0)
5065 base = plus_constant (Pmode, base, base_offset);
5066 base = force_operand (base, NULL_RTX);
5067 return plus_constant (Pmode, base, offset - base_offset);
5071 return x;
5074 /* Return the reload icode required for a constant pool in mode. */
5075 static enum insn_code
5076 aarch64_constant_pool_reload_icode (machine_mode mode)
5078 switch (mode)
5080 case SFmode:
5081 return CODE_FOR_aarch64_reload_movcpsfdi;
5083 case DFmode:
5084 return CODE_FOR_aarch64_reload_movcpdfdi;
5086 case TFmode:
5087 return CODE_FOR_aarch64_reload_movcptfdi;
5089 case V8QImode:
5090 return CODE_FOR_aarch64_reload_movcpv8qidi;
5092 case V16QImode:
5093 return CODE_FOR_aarch64_reload_movcpv16qidi;
5095 case V4HImode:
5096 return CODE_FOR_aarch64_reload_movcpv4hidi;
5098 case V8HImode:
5099 return CODE_FOR_aarch64_reload_movcpv8hidi;
5101 case V2SImode:
5102 return CODE_FOR_aarch64_reload_movcpv2sidi;
5104 case V4SImode:
5105 return CODE_FOR_aarch64_reload_movcpv4sidi;
5107 case V2DImode:
5108 return CODE_FOR_aarch64_reload_movcpv2didi;
5110 case V2DFmode:
5111 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5113 default:
5114 gcc_unreachable ();
5117 gcc_unreachable ();
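/* Return the register class required as an intermediate when moving X of
   mode MODE into a register of class RCLASS, recording in SRI any special
   reload insn to use.  Return NO_REGS when no intermediate class is needed.  */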
5119 static reg_class_t
5120 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5121 reg_class_t rclass,
5122 machine_mode mode,
5123 secondary_reload_info *sri)
5126 /* If we have to disable direct literal pool loads and stores because the
5127 function is too big, then we need a scratch register. */
5128 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5129 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5130 || targetm.vector_mode_supported_p (GET_MODE (x)))
5131 && !aarch64_pcrelative_literal_loads)
5133 sri->icode = aarch64_constant_pool_reload_icode (mode);
5134 return NO_REGS;
5137 /* Without the TARGET_SIMD instructions we cannot move a Q register
5138 to a Q register directly. We need a scratch. */
5139 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5140 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5141 && reg_class_subset_p (rclass, FP_REGS))
5143 if (mode == TFmode)
5144 sri->icode = CODE_FOR_aarch64_reload_movtf;
5145 else if (mode == TImode)
5146 sri->icode = CODE_FOR_aarch64_reload_movti;
5147 return NO_REGS;
5150 /* A TFmode or TImode memory access should be handled via FP_REGS
5151 because AArch64 has richer addressing modes for LDR/STR instructions
5152 than for LDP/STP instructions. */
5153 if (TARGET_FLOAT && rclass == GENERAL_REGS
5154 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5155 return FP_REGS;
5157 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5158 return GENERAL_REGS;
5160 return NO_REGS;
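/* Return true if eliminating register FROM in favour of register TO is
   allowed under the current frame-pointer settings.  */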
5163 static bool
5164 aarch64_can_eliminate (const int from, const int to)
5166 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5167 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5169 if (frame_pointer_needed)
5171 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5172 return true;
5173 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5174 return false;
5175 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5176 && !cfun->calls_alloca)
5177 return true;
5178 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5179 return true;
5181 return false;
5183 else
5185 /* If we decided that we didn't need a leaf frame pointer but then used
5186 LR in the function, then we'll want a frame pointer after all, so
5187 prevent this elimination to ensure a frame pointer is used. */
5188 if (to == STACK_POINTER_REGNUM
5189 && flag_omit_leaf_frame_pointer
5190 && df_regs_ever_live_p (LR_REGNUM))
5191 return false;
5194 return true;
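/* Return the offset between registers FROM and TO for the purposes of
   register elimination, based on the current frame layout.  */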
5197 HOST_WIDE_INT
5198 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5200 aarch64_layout_frame ();
5202 if (to == HARD_FRAME_POINTER_REGNUM)
5204 if (from == ARG_POINTER_REGNUM)
5205 return cfun->machine->frame.hard_fp_offset;
5207 if (from == FRAME_POINTER_REGNUM)
5208 return cfun->machine->frame.hard_fp_offset
5209 - cfun->machine->frame.locals_offset;
5212 if (to == STACK_POINTER_REGNUM)
5214 if (from == FRAME_POINTER_REGNUM)
5215 return cfun->machine->frame.frame_size
5216 - cfun->machine->frame.locals_offset;
5219 return cfun->machine->frame.frame_size;
5222 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5223 previous frame. */
5226 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5228 if (count != 0)
5229 return const0_rtx;
5230 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
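/* Output to F the assembler code template used for trampolines.  */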
5234 static void
5235 aarch64_asm_trampoline_template (FILE *f)
5237 if (TARGET_ILP32)
5239 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5240 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5242 else
5244 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5245 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5247 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5248 assemble_aligned_integer (4, const0_rtx);
5249 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5250 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
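/* Initialise trampoline M_TRAMP so that, when executed, it loads the static
   chain value CHAIN_VALUE and jumps to the address of FNDECL.  */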
5253 static void
5254 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5256 rtx fnaddr, mem, a_tramp;
5257 const int tramp_code_sz = 16;
5259 /* Don't need to copy the trailing D-words; we fill those in below. */
5260 emit_block_move (m_tramp, assemble_trampoline_template (),
5261 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5262 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5263 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5264 if (GET_MODE (fnaddr) != ptr_mode)
5265 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5266 emit_move_insn (mem, fnaddr);
5268 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5269 emit_move_insn (mem, chain_value);
5271 /* XXX We should really define a "clear_cache" pattern and use
5272 gen_clear_cache(). */
5273 a_tramp = XEXP (m_tramp, 0);
5274 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5275 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5276 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5277 ptr_mode);
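/* Return the number of hard registers of class REGCLASS needed to hold a
   value of mode MODE.  */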
5280 static unsigned char
5281 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5283 switch (regclass)
5285 case CALLER_SAVE_REGS:
5286 case POINTER_REGS:
5287 case GENERAL_REGS:
5288 case ALL_REGS:
5289 case FP_REGS:
5290 case FP_LO_REGS:
5291 return
5292 aarch64_vector_mode_p (mode)
5293 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5294 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5295 case STACK_REG:
5296 return 1;
5298 case NO_REGS:
5299 return 0;
5301 default:
5302 break;
5304 gcc_unreachable ();
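/* Return the preferred register class for reloading X into a register of
   class REGCLASS, or NO_REGS if such a reload cannot be supported.  */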
5307 static reg_class_t
5308 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5310 if (regclass == POINTER_REGS)
5311 return GENERAL_REGS;
5313 if (regclass == STACK_REG)
5315 if (REG_P(x)
5316 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5317 return regclass;
5319 return NO_REGS;
5322 /* If it's an integer immediate that MOVI can't handle, then
5323 FP_REGS is not an option, so we return NO_REGS instead. */
5324 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5325 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5326 return NO_REGS;
5328 /* Register elimination can result in a request for
5329 SP+constant->FP_REGS. We cannot support such operations, which
5330 use SP as source and an FP_REG as destination, so reject them
5331 outright here. */
5332 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5334 rtx lhs = XEXP (x, 0);
5336 /* Look through a possible SUBREG introduced by ILP32. */
5337 if (GET_CODE (lhs) == SUBREG)
5338 lhs = SUBREG_REG (lhs);
5340 gcc_assert (REG_P (lhs));
5341 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5342 POINTER_REGS));
5343 return NO_REGS;
5346 return regclass;
5349 void
5350 aarch64_asm_output_labelref (FILE* f, const char *name)
5352 asm_fprintf (f, "%U%s", name);
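/* Output the assembly for a constructor SYMBOL with priority PRIORITY,
   using a dedicated .init_array.NNNNN section for non-default priorities.  */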
5355 static void
5356 aarch64_elf_asm_constructor (rtx symbol, int priority)
5358 if (priority == DEFAULT_INIT_PRIORITY)
5359 default_ctor_section_asm_out_constructor (symbol, priority);
5360 else
5362 section *s;
5363 char buf[18];
5364 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5365 s = get_section (buf, SECTION_WRITE, NULL);
5366 switch_to_section (s);
5367 assemble_align (POINTER_SIZE);
5368 assemble_aligned_integer (POINTER_BYTES, symbol);
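/* As above, but for destructors, which go in .fini_array sections.  */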
5372 static void
5373 aarch64_elf_asm_destructor (rtx symbol, int priority)
5375 if (priority == DEFAULT_INIT_PRIORITY)
5376 default_dtor_section_asm_out_destructor (symbol, priority);
5377 else
5379 section *s;
5380 char buf[18];
5381 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5382 s = get_section (buf, SECTION_WRITE, NULL);
5383 switch_to_section (s);
5384 assemble_align (POINTER_SIZE);
5385 assemble_aligned_integer (POINTER_BYTES, symbol);
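/* Output the assembly for a casesi jump-table dispatch: load the table
   entry selected by the index operand, convert it to an address relative to
   the table's label and branch to it.  OPERANDS are the operands of the
   dispatch pattern.  */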
5389 const char*
5390 aarch64_output_casesi (rtx *operands)
5392 char buf[100];
5393 char label[100];
5394 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5395 int index;
5396 static const char *const patterns[4][2] =
5399 "ldrb\t%w3, [%0,%w1,uxtw]",
5400 "add\t%3, %4, %w3, sxtb #2"
5403 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5404 "add\t%3, %4, %w3, sxth #2"
5407 "ldr\t%w3, [%0,%w1,uxtw #2]",
5408 "add\t%3, %4, %w3, sxtw #2"
5410 /* We assume that DImode is only generated when not optimizing and
5411 that we don't really need 64-bit address offsets. That would
5412 imply an object file with 8GB of code in a single function! */
5414 "ldr\t%w3, [%0,%w1,uxtw #2]",
5415 "add\t%3, %4, %w3, sxtw #2"
5419 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5421 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5423 gcc_assert (index >= 0 && index <= 3);
5425 /* Need to implement table size reduction, by changing the code below. */
5426 output_asm_insn (patterns[index][0], operands);
5427 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5428 snprintf (buf, sizeof (buf),
5429 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5430 output_asm_insn (buf, operands);
5431 output_asm_insn (patterns[index][1], operands);
5432 output_asm_insn ("br\t%3", operands);
5433 assemble_label (asm_out_file, label);
5434 return "";
5438 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5439 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5440 operator. */
5443 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5445 if (shift >= 0 && shift <= 3)
5447 int size;
5448 for (size = 8; size <= 32; size *= 2)
5450 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5451 if (mask == bits << shift)
5452 return size;
5455 return 0;
5458 /* Constant pools are per-function only when PC-relative
5459 literal loads are enabled or we are in the large memory
5460 model. */
5462 static inline bool
5463 aarch64_can_use_per_function_literal_pools_p (void)
5465 return (aarch64_pcrelative_literal_loads
5466 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
5469 static bool
5470 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
5472 /* FIXME: In an ideal world this would work similarly
5473 to the logic in aarch64_select_rtx_section, but this
5474 breaks bootstrap in gccgo. For now we work around
5475 this by returning false here. */
5476 return false;
5479 /* Select appropriate section for constants depending
5480 on where we place literal pools. */
5482 static section *
5483 aarch64_select_rtx_section (machine_mode mode,
5484 rtx x,
5485 unsigned HOST_WIDE_INT align)
5487 if (aarch64_can_use_per_function_literal_pools_p ())
5488 return function_section (current_function_decl);
5490 return default_elf_select_rtx_section (mode, x, align);
5493 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
5494 void
5495 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
5496 HOST_WIDE_INT offset)
5498 /* When using per-function literal pools, we must ensure that any code
5499 section is aligned to the minimal instruction length, lest we get
5500 errors from the assembler re "unaligned instructions". */
5501 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
5502 ASM_OUTPUT_ALIGN (f, 2);
5505 /* Costs. */
5507 /* Helper function for rtx cost calculation. Strip a shift expression
5508 from X. Returns the inner operand if successful, or the original
5509 expression on failure. */
5510 static rtx
5511 aarch64_strip_shift (rtx x)
5513 rtx op = x;
5515 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5516 we can convert both to ROR during final output. */
5517 if ((GET_CODE (op) == ASHIFT
5518 || GET_CODE (op) == ASHIFTRT
5519 || GET_CODE (op) == LSHIFTRT
5520 || GET_CODE (op) == ROTATERT
5521 || GET_CODE (op) == ROTATE)
5522 && CONST_INT_P (XEXP (op, 1)))
5523 return XEXP (op, 0);
5525 if (GET_CODE (op) == MULT
5526 && CONST_INT_P (XEXP (op, 1))
5527 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5528 return XEXP (op, 0);
5530 return x;
5533 /* Helper function for rtx cost calculation. Strip an extend
5534 expression from X. Returns the inner operand if successful, or the
5535 original expression on failure. We deal with a number of possible
5536 canonicalization variations here. */
5537 static rtx
5538 aarch64_strip_extend (rtx x)
5540 rtx op = x;
5542 /* Zero and sign extraction of a widened value. */
5543 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5544 && XEXP (op, 2) == const0_rtx
5545 && GET_CODE (XEXP (op, 0)) == MULT
5546 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5547 XEXP (op, 1)))
5548 return XEXP (XEXP (op, 0), 0);
5550 /* It can also be represented (for zero-extend) as an AND with an
5551 immediate. */
5552 if (GET_CODE (op) == AND
5553 && GET_CODE (XEXP (op, 0)) == MULT
5554 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5555 && CONST_INT_P (XEXP (op, 1))
5556 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5557 INTVAL (XEXP (op, 1))) != 0)
5558 return XEXP (XEXP (op, 0), 0);
5560 /* Now handle extended register, as this may also have an optional
5561 left shift by 1..4. */
5562 if (GET_CODE (op) == ASHIFT
5563 && CONST_INT_P (XEXP (op, 1))
5564 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5565 op = XEXP (op, 0);
5567 if (GET_CODE (op) == ZERO_EXTEND
5568 || GET_CODE (op) == SIGN_EXTEND)
5569 op = XEXP (op, 0);
5571 if (op != x)
5572 return op;
5574 return x;
5577 /* Return true iff CODE is a shift supported in combination
5578 with arithmetic instructions. */
5580 static bool
5581 aarch64_shift_p (enum rtx_code code)
5583 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5586 /* Helper function for rtx cost calculation. Calculate the cost of
5587 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5588 Return the calculated cost of the expression, recursing manually into
5589 operands where needed. */
5591 static int
5592 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
5594 rtx op0, op1;
5595 const struct cpu_cost_table *extra_cost
5596 = aarch64_tune_params.insn_extra_cost;
5597 int cost = 0;
5598 bool compound_p = (outer == PLUS || outer == MINUS);
5599 machine_mode mode = GET_MODE (x);
5601 gcc_checking_assert (code == MULT);
5603 op0 = XEXP (x, 0);
5604 op1 = XEXP (x, 1);
5606 if (VECTOR_MODE_P (mode))
5607 mode = GET_MODE_INNER (mode);
5609 /* Integer multiply/fma. */
5610 if (GET_MODE_CLASS (mode) == MODE_INT)
5612 /* The multiply will be canonicalized as a shift, cost it as such. */
5613 if (aarch64_shift_p (GET_CODE (x))
5614 || (CONST_INT_P (op1)
5615 && exact_log2 (INTVAL (op1)) > 0))
5617 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5618 || GET_CODE (op0) == SIGN_EXTEND;
5619 if (speed)
5621 if (compound_p)
5623 if (REG_P (op1))
5624 /* ARITH + shift-by-register. */
5625 cost += extra_cost->alu.arith_shift_reg;
5626 else if (is_extend)
5627 /* ARITH + extended register. We don't have a cost field
5628 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
5629 cost += extra_cost->alu.extend_arith;
5630 else
5631 /* ARITH + shift-by-immediate. */
5632 cost += extra_cost->alu.arith_shift;
5634 else
5635 /* LSL (immediate). */
5636 cost += extra_cost->alu.shift;
5639 /* Strip extends as we will have costed them in the case above. */
5640 if (is_extend)
5641 op0 = aarch64_strip_extend (op0);
5643 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
5645 return cost;
5648 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
5649 compound and let the below cases handle it. After all, MNEG is a
5650 special-case alias of MSUB. */
5651 if (GET_CODE (op0) == NEG)
5653 op0 = XEXP (op0, 0);
5654 compound_p = true;
5657 /* Integer multiplies or FMAs have zero/sign extending variants. */
5658 if ((GET_CODE (op0) == ZERO_EXTEND
5659 && GET_CODE (op1) == ZERO_EXTEND)
5660 || (GET_CODE (op0) == SIGN_EXTEND
5661 && GET_CODE (op1) == SIGN_EXTEND))
5663 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
5664 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
5666 if (speed)
5668 if (compound_p)
5669 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
5670 cost += extra_cost->mult[0].extend_add;
5671 else
5672 /* MUL/SMULL/UMULL. */
5673 cost += extra_cost->mult[0].extend;
5676 return cost;
5679 /* This is either an integer multiply or a MADD. In both cases
5680 we want to recurse and cost the operands. */
5681 cost += rtx_cost (op0, mode, MULT, 0, speed);
5682 cost += rtx_cost (op1, mode, MULT, 1, speed);
5684 if (speed)
5686 if (compound_p)
5687 /* MADD/MSUB. */
5688 cost += extra_cost->mult[mode == DImode].add;
5689 else
5690 /* MUL. */
5691 cost += extra_cost->mult[mode == DImode].simple;
5694 return cost;
5696 else
5698 if (speed)
5700 /* Floating-point FMA/FMUL can also support negations of the
5701 operands, unless the rounding mode is upward or downward in
5702 which case FNMUL is different than FMUL with operand negation. */
5703 bool neg0 = GET_CODE (op0) == NEG;
5704 bool neg1 = GET_CODE (op1) == NEG;
5705 if (compound_p || !flag_rounding_math || (neg0 && neg1))
5707 if (neg0)
5708 op0 = XEXP (op0, 0);
5709 if (neg1)
5710 op1 = XEXP (op1, 0);
5713 if (compound_p)
5714 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5715 cost += extra_cost->fp[mode == DFmode].fma;
5716 else
5717 /* FMUL/FNMUL. */
5718 cost += extra_cost->fp[mode == DFmode].mult;
5721 cost += rtx_cost (op0, mode, MULT, 0, speed);
5722 cost += rtx_cost (op1, mode, MULT, 1, speed);
5723 return cost;
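/* Return the cost of address X when used to access an object of mode MODE,
   expressed in the units of the per-core address-cost tables.  */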
5727 static int
5728 aarch64_address_cost (rtx x,
5729 machine_mode mode,
5730 addr_space_t as ATTRIBUTE_UNUSED,
5731 bool speed)
5733 enum rtx_code c = GET_CODE (x);
5734 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
5735 struct aarch64_address_info info;
5736 int cost = 0;
5737 info.shift = 0;
5739 if (!aarch64_classify_address (&info, x, mode, c, false))
5741 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5743 /* This is a CONST or SYMBOL ref which will be split
5744 in a different way depending on the code model in use.
5745 Cost it through the generic infrastructure. */
5746 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
5747 /* Divide through by the cost of one instruction to
5748 bring it to the same units as the address costs. */
5749 cost_symbol_ref /= COSTS_N_INSNS (1);
5750 /* The cost is then the cost of preparing the address,
5751 followed by an immediate (possibly 0) offset. */
5752 return cost_symbol_ref + addr_cost->imm_offset;
5754 else
5756 /* This is most likely a jump table from a case
5757 statement. */
5758 return addr_cost->register_offset;
5762 switch (info.type)
5764 case ADDRESS_LO_SUM:
5765 case ADDRESS_SYMBOLIC:
5766 case ADDRESS_REG_IMM:
5767 cost += addr_cost->imm_offset;
5768 break;
5770 case ADDRESS_REG_WB:
5771 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5772 cost += addr_cost->pre_modify;
5773 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5774 cost += addr_cost->post_modify;
5775 else
5776 gcc_unreachable ();
5778 break;
5780 case ADDRESS_REG_REG:
5781 cost += addr_cost->register_offset;
5782 break;
5784 case ADDRESS_REG_SXTW:
5785 cost += addr_cost->register_sextend;
5786 break;
5788 case ADDRESS_REG_UXTW:
5789 cost += addr_cost->register_zextend;
5790 break;
5792 default:
5793 gcc_unreachable ();
5797 if (info.shift > 0)
5799 /* For the sake of calculating the cost of the shifted register
5800 component, we can treat same sized modes in the same way. */
5801 switch (GET_MODE_BITSIZE (mode))
5803 case 16:
5804 cost += addr_cost->addr_scale_costs.hi;
5805 break;
5807 case 32:
5808 cost += addr_cost->addr_scale_costs.si;
5809 break;
5811 case 64:
5812 cost += addr_cost->addr_scale_costs.di;
5813 break;
5815 /* We can't tell, or this is a 128-bit vector. */
5816 default:
5817 cost += addr_cost->addr_scale_costs.ti;
5818 break;
5822 return cost;
5825 /* Return the cost of a branch. If SPEED_P is true then the compiler is
5826 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
5827 to be taken. */
5830 aarch64_branch_cost (bool speed_p, bool predictable_p)
5832 /* When optimizing for speed, use the cost of unpredictable branches. */
5833 const struct cpu_branch_cost *branch_costs =
5834 aarch64_tune_params.branch_costs;
5836 if (!speed_p || predictable_p)
5837 return branch_costs->predictable;
5838 else
5839 return branch_costs->unpredictable;
5842 /* Return true if the RTX X in mode MODE is a zero or sign extract
5843 usable in an ADD or SUB (extended register) instruction. */
5844 static bool
5845 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5847 /* Catch add with a sign extract.
5848 This is add_<optab><mode>_multp2. */
5849 if (GET_CODE (x) == SIGN_EXTRACT
5850 || GET_CODE (x) == ZERO_EXTRACT)
5852 rtx op0 = XEXP (x, 0);
5853 rtx op1 = XEXP (x, 1);
5854 rtx op2 = XEXP (x, 2);
5856 if (GET_CODE (op0) == MULT
5857 && CONST_INT_P (op1)
5858 && op2 == const0_rtx
5859 && CONST_INT_P (XEXP (op0, 1))
5860 && aarch64_is_extend_from_extract (mode,
5861 XEXP (op0, 1),
5862 op1))
5864 return true;
5867 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
5868 No shift. */
5869 else if (GET_CODE (x) == SIGN_EXTEND
5870 || GET_CODE (x) == ZERO_EXTEND)
5871 return REG_P (XEXP (x, 0));
5873 return false;
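/* Return true if UNSPEC code U is one of the FRINT rounding operations.  */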
5876 static bool
5877 aarch64_frint_unspec_p (unsigned int u)
5879 switch (u)
5881 case UNSPEC_FRINTZ:
5882 case UNSPEC_FRINTP:
5883 case UNSPEC_FRINTM:
5884 case UNSPEC_FRINTA:
5885 case UNSPEC_FRINTN:
5886 case UNSPEC_FRINTX:
5887 case UNSPEC_FRINTI:
5888 return true;
5890 default:
5891 return false;
5895 /* Return true iff X is an rtx that will match an extr instruction
5896 i.e. as described in the *extr<mode>5_insn family of patterns.
5897 OP0 and OP1 will be set to the operands of the shifts involved
5898 on success and will be NULL_RTX otherwise. */
5900 static bool
5901 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
5903 rtx op0, op1;
5904 machine_mode mode = GET_MODE (x);
5906 *res_op0 = NULL_RTX;
5907 *res_op1 = NULL_RTX;
5909 if (GET_CODE (x) != IOR)
5910 return false;
5912 op0 = XEXP (x, 0);
5913 op1 = XEXP (x, 1);
5915 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
5916 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
5918 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
5919 if (GET_CODE (op1) == ASHIFT)
5920 std::swap (op0, op1);
5922 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
5923 return false;
5925 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
5926 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
5928 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
5929 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
5931 *res_op0 = XEXP (op0, 0);
5932 *res_op1 = XEXP (op1, 0);
5933 return true;
5937 return false;
5940 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5941 storing it in *COST. Result is true if the total cost of the operation
5942 has now been calculated. */
5943 static bool
5944 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5946 rtx inner;
5947 rtx comparator;
5948 enum rtx_code cmpcode;
5950 if (COMPARISON_P (op0))
5952 inner = XEXP (op0, 0);
5953 comparator = XEXP (op0, 1);
5954 cmpcode = GET_CODE (op0);
5956 else
5958 inner = op0;
5959 comparator = const0_rtx;
5960 cmpcode = NE;
5963 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5965 /* Conditional branch. */
5966 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5967 return true;
5968 else
5970 if (cmpcode == NE || cmpcode == EQ)
5972 if (comparator == const0_rtx)
5974 /* TBZ/TBNZ/CBZ/CBNZ. */
5975 if (GET_CODE (inner) == ZERO_EXTRACT)
5976 /* TBZ/TBNZ. */
5977 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
5978 ZERO_EXTRACT, 0, speed);
5979 else
5980 /* CBZ/CBNZ. */
5981 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
5983 return true;
5986 else if (cmpcode == LT || cmpcode == GE)
5988 /* TBZ/TBNZ. */
5989 if (comparator == const0_rtx)
5990 return true;
5994 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5996 /* CCMP. */
5997 if (GET_CODE (op1) == COMPARE)
5999 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6000 if (XEXP (op1, 1) == const0_rtx)
6001 *cost += 1;
6002 if (speed)
6004 machine_mode mode = GET_MODE (XEXP (op1, 0));
6005 const struct cpu_cost_table *extra_cost
6006 = aarch64_tune_params.insn_extra_cost;
6008 if (GET_MODE_CLASS (mode) == MODE_INT)
6009 *cost += extra_cost->alu.arith;
6010 else
6011 *cost += extra_cost->fp[mode == DFmode].compare;
6013 return true;
6016 /* It's a conditional operation based on the status flags,
6017 so it must be some flavor of CSEL. */
6019 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6020 if (GET_CODE (op1) == NEG
6021 || GET_CODE (op1) == NOT
6022 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6023 op1 = XEXP (op1, 0);
6024 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6026 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6027 op1 = XEXP (op1, 0);
6028 op2 = XEXP (op2, 0);
6031 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6032 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6033 return true;
6036 /* We don't know what this is, cost all operands. */
6037 return false;
6040 /* Check whether X is a bitfield operation of the form shift + extend that
6041 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6042 operand to which the bitfield operation is applied. Otherwise return
6043 NULL_RTX. */
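/* A minimal sketch of the shapes accepted (illustrative only):

     (zero_extend:SI (lshiftrt:HI (reg:HI x) (const_int 3)))   UBFX
     (sign_extend:SI (ashiftrt:HI (reg:HI x) (const_int 3)))   SBFX
     (zero_extend:SI (ashift:HI (reg:HI x) (const_int 3)))     UBFIZ

   In each case the operand returned would be (reg:HI x).  */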
6045 static rtx
6046 aarch64_extend_bitfield_pattern_p (rtx x)
6048 rtx_code outer_code = GET_CODE (x);
6049 machine_mode outer_mode = GET_MODE (x);
6051 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6052 && outer_mode != SImode && outer_mode != DImode)
6053 return NULL_RTX;
6055 rtx inner = XEXP (x, 0);
6056 rtx_code inner_code = GET_CODE (inner);
6057 machine_mode inner_mode = GET_MODE (inner);
6058 rtx op = NULL_RTX;
6060 switch (inner_code)
6062 case ASHIFT:
6063 if (CONST_INT_P (XEXP (inner, 1))
6064 && (inner_mode == QImode || inner_mode == HImode))
6065 op = XEXP (inner, 0);
6066 break;
6067 case LSHIFTRT:
6068 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6069 && (inner_mode == QImode || inner_mode == HImode))
6070 op = XEXP (inner, 0);
6071 break;
6072 case ASHIFTRT:
6073 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6074 && (inner_mode == QImode || inner_mode == HImode))
6075 op = XEXP (inner, 0);
6076 break;
6077 default:
6078 break;
6081 return op;
6084 /* Return true if the mask and a shift amount from an RTX of the form
6085 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6086 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
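/* Worked example (a sketch, SImode assumed): with SHFT_AMNT == 3 and
   MASK == 0x1f8 we have (MASK >> 3) + 1 == 0x40, a power of two, and the
   low three bits of MASK clear, so (x << 3) & 0x1f8 can become a single
   "ubfiz w0, w1, #3, #6".  */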
6088 bool
6089 aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
6091 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6092 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6093 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6094 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
6097 /* Calculate the cost of calculating X, storing it in *COST. Result
6098 is true if the total cost of the operation has now been calculated. */
6099 static bool
6100 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6101 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6103 rtx op0, op1, op2;
6104 const struct cpu_cost_table *extra_cost
6105 = aarch64_tune_params.insn_extra_cost;
6106 int code = GET_CODE (x);
6108 /* By default, assume that everything has equivalent cost to the
6109 cheapest instruction. Any additional costs are applied as a delta
6110 above this default. */
6111 *cost = COSTS_N_INSNS (1);
6113 switch (code)
6115 case SET:
6116 /* The cost depends entirely on the operands to SET. */
6117 *cost = 0;
6118 op0 = SET_DEST (x);
6119 op1 = SET_SRC (x);
6121 switch (GET_CODE (op0))
6123 case MEM:
6124 if (speed)
6126 rtx address = XEXP (op0, 0);
6127 if (VECTOR_MODE_P (mode))
6128 *cost += extra_cost->ldst.storev;
6129 else if (GET_MODE_CLASS (mode) == MODE_INT)
6130 *cost += extra_cost->ldst.store;
6131 else if (mode == SFmode)
6132 *cost += extra_cost->ldst.storef;
6133 else if (mode == DFmode)
6134 *cost += extra_cost->ldst.stored;
6136 *cost +=
6137 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6138 0, speed));
6141 *cost += rtx_cost (op1, mode, SET, 1, speed);
6142 return true;
6144 case SUBREG:
6145 if (! REG_P (SUBREG_REG (op0)))
6146 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6148 /* Fall through. */
6149 case REG:
6150 /* The cost is one per vector-register copied. */
6151 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6153 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6154 / GET_MODE_SIZE (V4SImode);
6155 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6157 /* const0_rtx is in general free, but we will use an
6158 instruction to set a register to 0. */
6159 else if (REG_P (op1) || op1 == const0_rtx)
6161 /* The cost is 1 per register copied. */
6162 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6163 / UNITS_PER_WORD;
6164 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6166 else
6167 /* Cost is just the cost of the RHS of the set. */
6168 *cost += rtx_cost (op1, mode, SET, 1, speed);
6169 return true;
6171 case ZERO_EXTRACT:
6172 case SIGN_EXTRACT:
6173 /* Bit-field insertion. Strip any redundant widening of
6174 the RHS to meet the width of the target. */
6175 if (GET_CODE (op1) == SUBREG)
6176 op1 = SUBREG_REG (op1);
6177 if ((GET_CODE (op1) == ZERO_EXTEND
6178 || GET_CODE (op1) == SIGN_EXTEND)
6179 && CONST_INT_P (XEXP (op0, 1))
6180 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6181 >= INTVAL (XEXP (op0, 1))))
6182 op1 = XEXP (op1, 0);
6184 if (CONST_INT_P (op1))
6186 /* MOV immediate is assumed to always be cheap. */
6187 *cost = COSTS_N_INSNS (1);
6189 else
6191 /* BFM. */
6192 if (speed)
6193 *cost += extra_cost->alu.bfi;
6194 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6197 return true;
6199 default:
6200 /* We can't make sense of this, assume default cost. */
6201 *cost = COSTS_N_INSNS (1);
6202 return false;
6204 return false;
6206 case CONST_INT:
6207 /* If an instruction can incorporate a constant within the
6208 instruction, the instruction's expression avoids calling
6209 rtx_cost() on the constant. If rtx_cost() is called on a
6210 constant, then it is usually because the constant must be
6211 moved into a register by one or more instructions.
6213 The exception is constant 0, which can be expressed
6214 as XZR/WZR and is therefore free. The one case where constant 0 must
6215 still be costed is (set (reg) (const0_rtx)), since the move itself takes
6216 an instruction. However, we can catch that when we cost the SET, so
6217 we don't need to consider that here. */
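      /* As a rough illustration (not an exact model): a 16-bit constant
	 such as 0x1234 is a single MOVZ, whereas a wider constant such as
	 0x123456789abc typically needs a MOVZ plus two MOVKs and is
	 therefore costed as roughly three instructions.  */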
6218 if (x == const0_rtx)
6219 *cost = 0;
6220 else
6222 /* To an approximation, building any other constant is
6223 proportional in cost to the number of instructions
6224 required to build that constant. This is true whether we
6225 are compiling for SPEED or otherwise. */
6226 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6227 (NULL_RTX, x, false, mode));
6229 return true;
6231 case CONST_DOUBLE:
6232 if (speed)
6234 /* mov[df,sf]_aarch64. */
6235 if (aarch64_float_const_representable_p (x))
6236 /* FMOV (scalar immediate). */
6237 *cost += extra_cost->fp[mode == DFmode].fpconst;
6238 else if (!aarch64_float_const_zero_rtx_p (x))
6240 /* This will be a load from memory. */
6241 if (mode == DFmode)
6242 *cost += extra_cost->ldst.loadd;
6243 else
6244 *cost += extra_cost->ldst.loadf;
6246 else
6247 /* Otherwise this is +0.0. We get this using MOVI d0, #0
6248 or MOV v0.s[0], wzr - neither of which is modeled by the
6249 cost tables. Just use the default cost. */
6254 return true;
6256 case MEM:
6257 if (speed)
6259 /* For loads we want the base cost of a load, plus an
6260 approximation for the additional cost of the addressing
6261 mode. */
6262 rtx address = XEXP (x, 0);
6263 if (VECTOR_MODE_P (mode))
6264 *cost += extra_cost->ldst.loadv;
6265 else if (GET_MODE_CLASS (mode) == MODE_INT)
6266 *cost += extra_cost->ldst.load;
6267 else if (mode == SFmode)
6268 *cost += extra_cost->ldst.loadf;
6269 else if (mode == DFmode)
6270 *cost += extra_cost->ldst.loadd;
6272 *cost +=
6273 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6274 0, speed));
6277 return true;
6279 case NEG:
6280 op0 = XEXP (x, 0);
6282 if (VECTOR_MODE_P (mode))
6284 if (speed)
6286 /* FNEG. */
6287 *cost += extra_cost->vect.alu;
6289 return false;
6292 if (GET_MODE_CLASS (mode) == MODE_INT)
6294 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6295 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6297 /* CSETM. */
6298 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
6299 return true;
6302 /* Cost this as SUB wzr, X. */
6303 op0 = CONST0_RTX (mode);
6304 op1 = XEXP (x, 0);
6305 goto cost_minus;
6308 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6310 /* Support (neg(fma...)) as a single instruction only if
6311 sign of zeros is unimportant. This matches the decision
6312 making in aarch64.md. */
6313 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
6315 /* FNMADD. */
6316 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6317 return true;
6319 if (GET_CODE (op0) == MULT)
6321 /* FNMUL. */
6322 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6323 return true;
6325 if (speed)
6326 /* FNEG. */
6327 *cost += extra_cost->fp[mode == DFmode].neg;
6328 return false;
6331 return false;
6333 case CLRSB:
6334 case CLZ:
6335 if (speed)
6337 if (VECTOR_MODE_P (mode))
6338 *cost += extra_cost->vect.alu;
6339 else
6340 *cost += extra_cost->alu.clz;
6343 return false;
6345 case COMPARE:
6346 op0 = XEXP (x, 0);
6347 op1 = XEXP (x, 1);
6349 if (op1 == const0_rtx
6350 && GET_CODE (op0) == AND)
6352 x = op0;
6353 mode = GET_MODE (op0);
6354 goto cost_logic;
6357 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
6359 /* TODO: A write to the CC flags possibly costs extra, this
6360 needs encoding in the cost tables. */
6362 mode = GET_MODE (op0);
6363 /* ANDS. */
6364 if (GET_CODE (op0) == AND)
6366 x = op0;
6367 goto cost_logic;
6370 if (GET_CODE (op0) == PLUS)
6372 /* ADDS (and CMN alias). */
6373 x = op0;
6374 goto cost_plus;
6377 if (GET_CODE (op0) == MINUS)
6379 /* SUBS. */
6380 x = op0;
6381 goto cost_minus;
6384 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
6385 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
6386 && CONST_INT_P (XEXP (op0, 2)))
6388 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
6389 Handle it here directly rather than going to cost_logic
6390 since we know the immediate generated for the TST is valid
6391 so we can avoid creating an intermediate rtx for it only
6392 for costing purposes. */
6393 if (speed)
6394 *cost += extra_cost->alu.logical;
6396 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
6397 ZERO_EXTRACT, 0, speed);
6398 return true;
6401 if (GET_CODE (op1) == NEG)
6403 /* CMN. */
6404 if (speed)
6405 *cost += extra_cost->alu.arith;
6407 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
6408 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
6409 return true;
6412 /* CMP.
6414 Compare can freely swap the order of operands, and
6415 canonicalization puts the more complex operation first.
6416 But the integer MINUS logic expects the shift/extend
6417 operation in op1. */
6418 if (! (REG_P (op0)
6419 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
6421 op0 = XEXP (x, 1);
6422 op1 = XEXP (x, 0);
6424 goto cost_minus;
6427 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
6429 /* FCMP. */
6430 if (speed)
6431 *cost += extra_cost->fp[mode == DFmode].compare;
6433 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
6435 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
6436 /* FCMP supports constant 0.0 for no extra cost. */
6437 return true;
6439 return false;
6442 if (VECTOR_MODE_P (mode))
6444 /* Vector compare. */
6445 if (speed)
6446 *cost += extra_cost->vect.alu;
6448 if (aarch64_float_const_zero_rtx_p (op1))
6450 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6451 cost. */
6452 return true;
6454 return false;
6456 return false;
6458 case MINUS:
6460 op0 = XEXP (x, 0);
6461 op1 = XEXP (x, 1);
6463 cost_minus:
6464 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
6466 /* Detect valid immediates. */
6467 if ((GET_MODE_CLASS (mode) == MODE_INT
6468 || (GET_MODE_CLASS (mode) == MODE_CC
6469 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
6470 && CONST_INT_P (op1)
6471 && aarch64_uimm12_shift (INTVAL (op1)))
6473 if (speed)
6474 /* SUB(S) (immediate). */
6475 *cost += extra_cost->alu.arith;
6476 return true;
6479 /* Look for SUB (extended register). */
6480 if (aarch64_rtx_arith_op_extract_p (op1, mode))
6482 if (speed)
6483 *cost += extra_cost->alu.extend_arith;
6485 op1 = aarch64_strip_extend (op1);
6486 *cost += rtx_cost (op1, VOIDmode,
6487 (enum rtx_code) GET_CODE (op1), 0, speed);
6488 return true;
6491 rtx new_op1 = aarch64_strip_extend (op1);
6493 /* Cost this as an FMA-alike operation. */
6494 if ((GET_CODE (new_op1) == MULT
6495 || aarch64_shift_p (GET_CODE (new_op1)))
6496 && code != COMPARE)
6498 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
6499 (enum rtx_code) code,
6500 speed);
6501 return true;
6504 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
6506 if (speed)
6508 if (VECTOR_MODE_P (mode))
6510 /* Vector SUB. */
6511 *cost += extra_cost->vect.alu;
6513 else if (GET_MODE_CLASS (mode) == MODE_INT)
6515 /* SUB(S). */
6516 *cost += extra_cost->alu.arith;
6518 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6520 /* FSUB. */
6521 *cost += extra_cost->fp[mode == DFmode].addsub;
6524 return true;
6527 case PLUS:
6529 rtx new_op0;
6531 op0 = XEXP (x, 0);
6532 op1 = XEXP (x, 1);
6534 cost_plus:
6535 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6536 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6538 /* CSINC. */
6539 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
6540 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6541 return true;
6544 if (GET_MODE_CLASS (mode) == MODE_INT
6545 && CONST_INT_P (op1)
6546 && aarch64_uimm12_shift (INTVAL (op1)))
6548 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
6550 if (speed)
6551 /* ADD (immediate). */
6552 *cost += extra_cost->alu.arith;
6553 return true;
6556 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6558 /* Look for ADD (extended register). */
6559 if (aarch64_rtx_arith_op_extract_p (op0, mode))
6561 if (speed)
6562 *cost += extra_cost->alu.extend_arith;
6564 op0 = aarch64_strip_extend (op0);
6565 *cost += rtx_cost (op0, VOIDmode,
6566 (enum rtx_code) GET_CODE (op0), 0, speed);
6567 return true;
6570 /* Strip any extend, leave shifts behind as we will
6571 cost them through mult_cost. */
6572 new_op0 = aarch64_strip_extend (op0);
6574 if (GET_CODE (new_op0) == MULT
6575 || aarch64_shift_p (GET_CODE (new_op0)))
6577 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
6578 speed);
6579 return true;
6582 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
6584 if (speed)
6586 if (VECTOR_MODE_P (mode))
6588 /* Vector ADD. */
6589 *cost += extra_cost->vect.alu;
6591 else if (GET_MODE_CLASS (mode) == MODE_INT)
6593 /* ADD. */
6594 *cost += extra_cost->alu.arith;
6596 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6598 /* FADD. */
6599 *cost += extra_cost->fp[mode == DFmode].addsub;
6602 return true;
6605 case BSWAP:
6606 *cost = COSTS_N_INSNS (1);
6608 if (speed)
6610 if (VECTOR_MODE_P (mode))
6611 *cost += extra_cost->vect.alu;
6612 else
6613 *cost += extra_cost->alu.rev;
6615 return false;
6617 case IOR:
6618 if (aarch_rev16_p (x))
6620 *cost = COSTS_N_INSNS (1);
6622 if (speed)
6624 if (VECTOR_MODE_P (mode))
6625 *cost += extra_cost->vect.alu;
6626 else
6627 *cost += extra_cost->alu.rev;
6629 return true;
6632 if (aarch64_extr_rtx_p (x, &op0, &op1))
6634 *cost += rtx_cost (op0, mode, IOR, 0, speed);
6635 *cost += rtx_cost (op1, mode, IOR, 1, speed);
6636 if (speed)
6637 *cost += extra_cost->alu.shift;
6639 return true;
6641 /* Fall through. */
6642 case XOR:
6643 case AND:
6644 cost_logic:
6645 op0 = XEXP (x, 0);
6646 op1 = XEXP (x, 1);
6648 if (VECTOR_MODE_P (mode))
6650 if (speed)
6651 *cost += extra_cost->vect.alu;
6652 return true;
6655 if (code == AND
6656 && GET_CODE (op0) == MULT
6657 && CONST_INT_P (XEXP (op0, 1))
6658 && CONST_INT_P (op1)
6659 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
6660 INTVAL (op1)) != 0)
6662 /* This is a UBFM/SBFM. */
6663 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
6664 if (speed)
6665 *cost += extra_cost->alu.bfx;
6666 return true;
6669 if (GET_MODE_CLASS (mode) == MODE_INT)
6671 if (CONST_INT_P (op1))
6673 /* We have a mask + shift version of a UBFIZ
6674 i.e. the *andim_ashift<mode>_bfiz pattern. */
6675 if (GET_CODE (op0) == ASHIFT
6676 && aarch64_mask_and_shift_for_ubfiz_p (mode, op1,
6677 XEXP (op0, 1)))
6679 *cost += rtx_cost (XEXP (op0, 0), mode,
6680 (enum rtx_code) code, 0, speed);
6681 if (speed)
6682 *cost += extra_cost->alu.bfx;
6684 return true;
6686 else if (aarch64_bitmask_imm (INTVAL (op1), mode))
6688 /* We possibly get the immediate for free; this is not
6689 modelled. */
6690 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6691 if (speed)
6692 *cost += extra_cost->alu.logical;
6694 return true;
6697 else
6699 rtx new_op0 = op0;
6701 /* Handle ORN, EON, or BIC. */
6702 if (GET_CODE (op0) == NOT)
6703 op0 = XEXP (op0, 0);
6705 new_op0 = aarch64_strip_shift (op0);
6707 /* If we had a shift on op0 then this is a logical-shift-
6708 by-register/immediate operation. Otherwise, this is just
6709 a logical operation. */
6710 if (speed)
6712 if (new_op0 != op0)
6714 /* Shift by immediate. */
6715 if (CONST_INT_P (XEXP (op0, 1)))
6716 *cost += extra_cost->alu.log_shift;
6717 else
6718 *cost += extra_cost->alu.log_shift_reg;
6720 else
6721 *cost += extra_cost->alu.logical;
6724 /* In both cases we want to cost both operands. */
6725 *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
6726 *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
6728 return true;
6731 return false;
6733 case NOT:
6734 x = XEXP (x, 0);
6735 op0 = aarch64_strip_shift (x);
6737 if (VECTOR_MODE_P (mode))
6739 /* Vector NOT. */
6740 *cost += extra_cost->vect.alu;
6741 return false;
6744 /* MVN-shifted-reg. */
6745 if (op0 != x)
6747 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6749 if (speed)
6750 *cost += extra_cost->alu.log_shift;
6752 return true;
6754 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
6755 Handle the second form here, taking care that 'a' in the above can
6756 be a shift. */
6757 else if (GET_CODE (op0) == XOR)
6759 rtx newop0 = XEXP (op0, 0);
6760 rtx newop1 = XEXP (op0, 1);
6761 rtx op0_stripped = aarch64_strip_shift (newop0);
6763 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
6764 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
6766 if (speed)
6768 if (op0_stripped != newop0)
6769 *cost += extra_cost->alu.log_shift;
6770 else
6771 *cost += extra_cost->alu.logical;
6774 return true;
6776 /* MVN. */
6777 if (speed)
6778 *cost += extra_cost->alu.logical;
6780 return false;
6782 case ZERO_EXTEND:
6784 op0 = XEXP (x, 0);
6785 /* If a value is written in SI mode, then zero extended to DI
6786 mode, the operation will in general be free as a write to
6787 a 'w' register implicitly zeroes the upper bits of an 'x'
6788 register. However, if this is
6790 (set (reg) (zero_extend (reg)))
6792 we must cost the explicit register move. */
6793 if (mode == DImode
6794 && GET_MODE (op0) == SImode
6795 && outer == SET)
6797 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
6799 /* If OP_COST is non-zero, then the cost of the zero extend
6800 is effectively the cost of the inner operation. Otherwise
6801 we have a MOV instruction and we take the cost from the MOV
6802 itself. This is true independently of whether we are
6803 optimizing for space or time. */
6804 if (op_cost)
6805 *cost = op_cost;
6807 return true;
6809 else if (MEM_P (op0))
6811 /* All loads can zero extend to any size for free. */
6812 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
6813 return true;
6816 op0 = aarch64_extend_bitfield_pattern_p (x);
6817 if (op0)
6819 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
6820 if (speed)
6821 *cost += extra_cost->alu.bfx;
6822 return true;
6825 if (speed)
6827 if (VECTOR_MODE_P (mode))
6829 /* UMOV. */
6830 *cost += extra_cost->vect.alu;
6832 else
6834 /* We generate an AND instead of UXTB/UXTH. */
6835 *cost += extra_cost->alu.logical;
6838 return false;
6840 case SIGN_EXTEND:
6841 if (MEM_P (XEXP (x, 0)))
6843 /* LDRSH. */
6844 if (speed)
6846 rtx address = XEXP (XEXP (x, 0), 0);
6847 *cost += extra_cost->ldst.load_sign_extend;
6849 *cost +=
6850 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6851 0, speed));
6853 return true;
6856 op0 = aarch64_extend_bitfield_pattern_p (x);
6857 if (op0)
6859 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
6860 if (speed)
6861 *cost += extra_cost->alu.bfx;
6862 return true;
6865 if (speed)
6867 if (VECTOR_MODE_P (mode))
6868 *cost += extra_cost->vect.alu;
6869 else
6870 *cost += extra_cost->alu.extend;
6872 return false;
6874 case ASHIFT:
6875 op0 = XEXP (x, 0);
6876 op1 = XEXP (x, 1);
6878 if (CONST_INT_P (op1))
6880 if (speed)
6882 if (VECTOR_MODE_P (mode))
6884 /* Vector shift (immediate). */
6885 *cost += extra_cost->vect.alu;
6887 else
6889 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
6890 aliases. */
6891 *cost += extra_cost->alu.shift;
6895 /* We can incorporate zero/sign extend for free. */
6896 if (GET_CODE (op0) == ZERO_EXTEND
6897 || GET_CODE (op0) == SIGN_EXTEND)
6898 op0 = XEXP (op0, 0);
6900 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
6901 return true;
6903 else
6905 if (speed)
6907 if (VECTOR_MODE_P (mode))
6909 /* Vector shift (register). */
6910 *cost += extra_cost->vect.alu;
6912 else
6914 /* LSLV. */
6915 *cost += extra_cost->alu.shift_reg;
6918 return false; /* All arguments need to be in registers. */
6921 case ROTATE:
6922 case ROTATERT:
6923 case LSHIFTRT:
6924 case ASHIFTRT:
6925 op0 = XEXP (x, 0);
6926 op1 = XEXP (x, 1);
6928 if (CONST_INT_P (op1))
6930 /* ASR (immediate) and friends. */
6931 if (speed)
6933 if (VECTOR_MODE_P (mode))
6934 *cost += extra_cost->vect.alu;
6935 else
6936 *cost += extra_cost->alu.shift;
6939 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6940 return true;
6942 else
6945 /* ASR (register) and friends. */
6946 if (speed)
6948 if (VECTOR_MODE_P (mode))
6949 *cost += extra_cost->vect.alu;
6950 else
6951 *cost += extra_cost->alu.shift_reg;
6953 return false; /* All arguments need to be in registers. */
6956 case SYMBOL_REF:
6958 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
6959 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
6961 /* LDR. */
6962 if (speed)
6963 *cost += extra_cost->ldst.load;
6965 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
6966 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
6968 /* ADRP, followed by ADD. */
6969 *cost += COSTS_N_INSNS (1);
6970 if (speed)
6971 *cost += 2 * extra_cost->alu.arith;
6973 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
6974 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
6976 /* ADR. */
6977 if (speed)
6978 *cost += extra_cost->alu.arith;
6981 if (flag_pic)
6983 /* One extra load instruction, after accessing the GOT. */
6984 *cost += COSTS_N_INSNS (1);
6985 if (speed)
6986 *cost += extra_cost->ldst.load;
6988 return true;
6990 case HIGH:
6991 case LO_SUM:
6992 /* ADRP/ADD (immediate). */
6993 if (speed)
6994 *cost += extra_cost->alu.arith;
6995 return true;
6997 case ZERO_EXTRACT:
6998 case SIGN_EXTRACT:
6999 /* UBFX/SBFX. */
7000 if (speed)
7002 if (VECTOR_MODE_P (mode))
7003 *cost += extra_cost->vect.alu;
7004 else
7005 *cost += extra_cost->alu.bfx;
7008 /* We can trust that the immediates used will be correct (there
7009 are no by-register forms), so we need only cost op0. */
7010 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7011 return true;
7013 case MULT:
7014 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7015 /* aarch64_rtx_mult_cost always handles recursion to its
7016 operands. */
7017 return true;
7019 case MOD:
7020 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7021 ANDs and a CSNEG. Assume here that a CSNEG costs the same as
7022 an unconditional negate. This case should only ever be reached through
7023 the set_smod_pow2_cheap check in expmed.c. */
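      /* One plausible sequence matching that count (a sketch for x % 8 in
	 SImode; the exact registers and condition GCC picks may differ):

	   negs  w1, w0
	   and   w0, w0, 7
	   and   w1, w1, 7
	   csneg w0, w0, w1, mi  */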
7024 if (CONST_INT_P (XEXP (x, 1))
7025 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7026 && (mode == SImode || mode == DImode))
7028 /* We expand to 4 instructions. Reset the baseline. */
7029 *cost = COSTS_N_INSNS (4);
7031 if (speed)
7032 *cost += 2 * extra_cost->alu.logical
7033 + 2 * extra_cost->alu.arith;
7035 return true;
7038 /* Fall-through. */
7039 case UMOD:
7040 if (speed)
7042 if (VECTOR_MODE_P (mode))
7043 *cost += extra_cost->vect.alu;
7044 else if (GET_MODE_CLASS (mode) == MODE_INT)
7045 *cost += (extra_cost->mult[mode == DImode].add
7046 + extra_cost->mult[mode == DImode].idiv);
7047 else if (mode == DFmode)
7048 *cost += (extra_cost->fp[1].mult
7049 + extra_cost->fp[1].div);
7050 else if (mode == SFmode)
7051 *cost += (extra_cost->fp[0].mult
7052 + extra_cost->fp[0].div);
7054 return false; /* All arguments need to be in registers. */
7056 case DIV:
7057 case UDIV:
7058 case SQRT:
7059 if (speed)
7061 if (VECTOR_MODE_P (mode))
7062 *cost += extra_cost->vect.alu;
7063 else if (GET_MODE_CLASS (mode) == MODE_INT)
7064 /* There is no integer SQRT, so only DIV and UDIV can get
7065 here. */
7066 *cost += extra_cost->mult[mode == DImode].idiv;
7067 else
7068 *cost += extra_cost->fp[mode == DFmode].div;
7070 return false; /* All arguments need to be in registers. */
7072 case IF_THEN_ELSE:
7073 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7074 XEXP (x, 2), cost, speed);
7076 case EQ:
7077 case NE:
7078 case GT:
7079 case GTU:
7080 case LT:
7081 case LTU:
7082 case GE:
7083 case GEU:
7084 case LE:
7085 case LEU:
7087 return false; /* All arguments must be in registers. */
7089 case FMA:
7090 op0 = XEXP (x, 0);
7091 op1 = XEXP (x, 1);
7092 op2 = XEXP (x, 2);
7094 if (speed)
7096 if (VECTOR_MODE_P (mode))
7097 *cost += extra_cost->vect.alu;
7098 else
7099 *cost += extra_cost->fp[mode == DFmode].fma;
7102 /* FMSUB, FNMADD, and FNMSUB are free. */
7103 if (GET_CODE (op0) == NEG)
7104 op0 = XEXP (op0, 0);
7106 if (GET_CODE (op2) == NEG)
7107 op2 = XEXP (op2, 0);
7109 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7110 and the by-element operand as operand 0. */
7111 if (GET_CODE (op1) == NEG)
7112 op1 = XEXP (op1, 0);
7114 /* Catch vector-by-element operations. The by-element operand can
7115 either be (vec_duplicate (vec_select (x))) or just
7116 (vec_select (x)), depending on whether we are multiplying by
7117 a vector or a scalar.
7119 Canonicalization is not very good in these cases: FMA4 will put the
7120 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
7121 if (GET_CODE (op0) == VEC_DUPLICATE)
7122 op0 = XEXP (op0, 0);
7123 else if (GET_CODE (op1) == VEC_DUPLICATE)
7124 op1 = XEXP (op1, 0);
7126 if (GET_CODE (op0) == VEC_SELECT)
7127 op0 = XEXP (op0, 0);
7128 else if (GET_CODE (op1) == VEC_SELECT)
7129 op1 = XEXP (op1, 0);
7131 /* If the remaining parameters are not registers,
7132 get the cost to put them into registers. */
7133 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7134 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7135 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7136 return true;
7138 case FLOAT:
7139 case UNSIGNED_FLOAT:
7140 if (speed)
7141 *cost += extra_cost->fp[mode == DFmode].fromint;
7142 return false;
7144 case FLOAT_EXTEND:
7145 if (speed)
7147 if (VECTOR_MODE_P (mode))
7149 /* Vector widening conversion. */
7150 *cost += extra_cost->vect.alu;
7152 else
7153 *cost += extra_cost->fp[mode == DFmode].widen;
7155 return false;
7157 case FLOAT_TRUNCATE:
7158 if (speed)
7160 if (VECTOR_MODE_P (mode))
7162 /* Vector narrowing conversion. */
7163 *cost += extra_cost->vect.alu;
7165 else
7166 *cost += extra_cost->fp[mode == DFmode].narrow;
7168 return false;
7170 case FIX:
7171 case UNSIGNED_FIX:
7172 x = XEXP (x, 0);
7173 /* Strip the rounding part. They will all be implemented
7174 by the fcvt* family of instructions anyway. */
7175 if (GET_CODE (x) == UNSPEC)
7177 unsigned int uns_code = XINT (x, 1);
7179 if (uns_code == UNSPEC_FRINTA
7180 || uns_code == UNSPEC_FRINTM
7181 || uns_code == UNSPEC_FRINTN
7182 || uns_code == UNSPEC_FRINTP
7183 || uns_code == UNSPEC_FRINTZ)
7184 x = XVECEXP (x, 0, 0);
7187 if (speed)
7189 if (VECTOR_MODE_P (mode))
7190 *cost += extra_cost->vect.alu;
7191 else
7192 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7195 /* We can combine fmul by a power of 2 followed by a fcvt into a single
7196 fixed-point fcvt. */
7197 if (GET_CODE (x) == MULT
7198 && ((VECTOR_MODE_P (mode)
7199 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7200 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7202 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7203 0, speed);
7204 return true;
7207 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
7208 return true;
7210 case ABS:
7211 if (VECTOR_MODE_P (mode))
7213 /* ABS (vector). */
7214 if (speed)
7215 *cost += extra_cost->vect.alu;
7217 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7219 op0 = XEXP (x, 0);
7221 /* FABD, which is analogous to FADD. */
7222 if (GET_CODE (op0) == MINUS)
7224 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
7225 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
7226 if (speed)
7227 *cost += extra_cost->fp[mode == DFmode].addsub;
7229 return true;
7231 /* Simple FABS is analogous to FNEG. */
7232 if (speed)
7233 *cost += extra_cost->fp[mode == DFmode].neg;
7235 else
7237 /* Integer ABS will either be split into
7238 two arithmetic instructions, or will be an ABS
7239 (scalar), which we don't model. */
7240 *cost = COSTS_N_INSNS (2);
7241 if (speed)
7242 *cost += 2 * extra_cost->alu.arith;
7244 return false;
7246 case SMAX:
7247 case SMIN:
7248 if (speed)
7250 if (VECTOR_MODE_P (mode))
7251 *cost += extra_cost->vect.alu;
7252 else
7254 /* FMAXNM/FMINNM/FMAX/FMIN.
7255 TODO: This may not be accurate for all implementations, but
7256 we do not model this in the cost tables. */
7257 *cost += extra_cost->fp[mode == DFmode].addsub;
7260 return false;
7262 case UNSPEC:
7263 /* The floating point round to integer frint* instructions. */
7264 if (aarch64_frint_unspec_p (XINT (x, 1)))
7266 if (speed)
7267 *cost += extra_cost->fp[mode == DFmode].roundint;
7269 return false;
7272 if (XINT (x, 1) == UNSPEC_RBIT)
7274 if (speed)
7275 *cost += extra_cost->alu.rev;
7277 return false;
7279 break;
7281 case TRUNCATE:
7283 /* Decompose <su>muldi3_highpart. */
7284 if (/* (truncate:DI */
7285 mode == DImode
7286 /* (lshiftrt:TI */
7287 && GET_MODE (XEXP (x, 0)) == TImode
7288 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
7289 /* (mult:TI */
7290 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
7291 /* (ANY_EXTEND:TI (reg:DI))
7292 (ANY_EXTEND:TI (reg:DI))) */
7293 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
7294 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
7295 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
7296 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
7297 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
7298 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
7299 /* (const_int 64) */
7300 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7301 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
7303 /* UMULH/SMULH. */
7304 if (speed)
7305 *cost += extra_cost->mult[mode == DImode].extend;
7306 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
7307 mode, MULT, 0, speed);
7308 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
7309 mode, MULT, 1, speed);
7310 return true;
7313 /* Fall through. */
7314 default:
7315 break;
7318 if (dump_file && (dump_flags & TDF_DETAILS))
7319 fprintf (dump_file,
7320 "\nFailed to cost RTX. Assuming default cost.\n");
7322 return true;
7325 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
7326 calculated for X. This cost is stored in *COST. Returns true
7327 if the total cost of X was calculated. */
7328 static bool
7329 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
7330 int param, int *cost, bool speed)
7332 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
7334 if (dump_file && (dump_flags & TDF_DETAILS))
7336 print_rtl_single (dump_file, x);
7337 fprintf (dump_file, "\n%s cost: %d (%s)\n",
7338 speed ? "Hot" : "Cold",
7339 *cost, result ? "final" : "partial");
7342 return result;
7345 static int
7346 aarch64_register_move_cost (machine_mode mode,
7347 reg_class_t from_i, reg_class_t to_i)
7349 enum reg_class from = (enum reg_class) from_i;
7350 enum reg_class to = (enum reg_class) to_i;
7351 const struct cpu_regmove_cost *regmove_cost
7352 = aarch64_tune_params.regmove_cost;
7354 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
7355 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
7356 to = GENERAL_REGS;
7358 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
7359 from = GENERAL_REGS;
7361 /* The cost of moving between a GPR and the stack register is the same as GP2GP. */
7362 if ((from == GENERAL_REGS && to == STACK_REG)
7363 || (to == GENERAL_REGS && from == STACK_REG))
7364 return regmove_cost->GP2GP;
7366 /* To/From the stack register, we move via the gprs. */
7367 if (to == STACK_REG || from == STACK_REG)
7368 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
7369 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
7371 if (GET_MODE_SIZE (mode) == 16)
7373 /* 128-bit operations on general registers require 2 instructions. */
7374 if (from == GENERAL_REGS && to == GENERAL_REGS)
7375 return regmove_cost->GP2GP * 2;
7376 else if (from == GENERAL_REGS)
7377 return regmove_cost->GP2FP * 2;
7378 else if (to == GENERAL_REGS)
7379 return regmove_cost->FP2GP * 2;
7381 /* When AdvSIMD instructions are disabled it is not possible to move
7382 a 128-bit value directly between Q registers. This is handled in
7383 secondary reload. A general register is used as a scratch to move
7384 the upper DI value and the lower DI value is moved directly,
7385 hence the cost is the sum of three moves. */
7386 if (! TARGET_SIMD)
7387 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
7389 return regmove_cost->FP2FP;
7392 if (from == GENERAL_REGS && to == GENERAL_REGS)
7393 return regmove_cost->GP2GP;
7394 else if (from == GENERAL_REGS)
7395 return regmove_cost->GP2FP;
7396 else if (to == GENERAL_REGS)
7397 return regmove_cost->FP2GP;
7399 return regmove_cost->FP2FP;
7402 static int
7403 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
7404 reg_class_t rclass ATTRIBUTE_UNUSED,
7405 bool in ATTRIBUTE_UNUSED)
7407 return aarch64_tune_params.memmov_cost;
7410 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
7411 to optimize 1.0/sqrt. */
7413 static bool
7414 use_rsqrt_p (machine_mode mode)
7416 return (!flag_trapping_math
7417 && flag_unsafe_math_optimizations
7418 && ((aarch64_tune_params.approx_modes->recip_sqrt
7419 & AARCH64_APPROX_MODE (mode))
7420 || flag_mrecip_low_precision_sqrt));
7423 /* Function to decide when to use the approximate reciprocal square root
7424 builtin. */
7426 static tree
7427 aarch64_builtin_reciprocal (tree fndecl)
7429 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
7431 if (!use_rsqrt_p (mode))
7432 return NULL_TREE;
7433 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
7436 typedef rtx (*rsqrte_type) (rtx, rtx);
7438 /* Select reciprocal square root initial estimate insn depending on machine
7439 mode. */
7441 static rsqrte_type
7442 get_rsqrte_type (machine_mode mode)
7444 switch (mode)
7446 case DFmode: return gen_aarch64_rsqrtedf;
7447 case SFmode: return gen_aarch64_rsqrtesf;
7448 case V2DFmode: return gen_aarch64_rsqrtev2df;
7449 case V2SFmode: return gen_aarch64_rsqrtev2sf;
7450 case V4SFmode: return gen_aarch64_rsqrtev4sf;
7451 default: gcc_unreachable ();
7455 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
7457 /* Select reciprocal square root series step insn depending on machine mode. */
7459 static rsqrts_type
7460 get_rsqrts_type (machine_mode mode)
7462 switch (mode)
7464 case DFmode: return gen_aarch64_rsqrtsdf;
7465 case SFmode: return gen_aarch64_rsqrtssf;
7466 case V2DFmode: return gen_aarch64_rsqrtsv2df;
7467 case V2SFmode: return gen_aarch64_rsqrtsv2sf;
7468 case V4SFmode: return gen_aarch64_rsqrtsv4sf;
7469 default: gcc_unreachable ();
7473 /* Emit instruction sequence to compute either the approximate square root
7474 or its approximate reciprocal, depending on the flag RECP, and return
7475 whether the sequence was emitted or not. */
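/* A sketch of the math emitted below, relying on the architectural
   definition of FRSQRTS (a, b) as (3 - a * b) / 2:

     x0     = FRSQRTE (a)                        estimate of 1/sqrt (a)
     x(n+1) = xn * FRSQRTS (a, xn * xn)
            = xn * (3 - a * xn * xn) / 2         Newton-Raphson step

   For the square root itself the reciprocal estimate is finally multiplied
   by a, with a mask squashing the a == 0.0 case to 0.0.  */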
7477 bool
7478 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
7480 machine_mode mode = GET_MODE (dst);
7482 if (GET_MODE_INNER (mode) == HFmode)
7483 return false;
7485 machine_mode mmsk = mode_for_vector
7486 (int_mode_for_mode (GET_MODE_INNER (mode)),
7487 GET_MODE_NUNITS (mode));
7488 bool use_approx_sqrt_p = (!recp
7489 && (flag_mlow_precision_sqrt
7490 || (aarch64_tune_params.approx_modes->sqrt
7491 & AARCH64_APPROX_MODE (mode))));
7492 bool use_approx_rsqrt_p = (recp
7493 && (flag_mrecip_low_precision_sqrt
7494 || (aarch64_tune_params.approx_modes->recip_sqrt
7495 & AARCH64_APPROX_MODE (mode))));
7497 if (!flag_finite_math_only
7498 || flag_trapping_math
7499 || !flag_unsafe_math_optimizations
7500 || !(use_approx_sqrt_p || use_approx_rsqrt_p)
7501 || optimize_function_for_size_p (cfun))
7502 return false;
7504 rtx xmsk = gen_reg_rtx (mmsk);
7505 if (!recp)
7506 /* When calculating the approximate square root, compare the argument with
7507 0.0 and create a mask. */
7508 emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src,
7509 CONST0_RTX (mode)))));
7511 /* Estimate the approximate reciprocal square root. */
7512 rtx xdst = gen_reg_rtx (mode);
7513 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
7515 /* Iterate over the series twice for SF and thrice for DF. */
7516 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
7518 /* Optionally iterate over the series once less for faster performance
7519 while sacrificing some accuracy. */
7520 if ((recp && flag_mrecip_low_precision_sqrt)
7521 || (!recp && flag_mlow_precision_sqrt))
7522 iterations--;
7524 /* Iterate over the series to calculate the approximate reciprocal square
7525 root. */
7526 rtx x1 = gen_reg_rtx (mode);
7527 while (iterations--)
7529 rtx x2 = gen_reg_rtx (mode);
7530 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
7532 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
7534 if (iterations > 0)
7535 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
7538 if (!recp)
7540 /* Qualify the approximate reciprocal square root when the argument is
7541 0.0 by squashing the intermediate result to 0.0. */
7542 rtx xtmp = gen_reg_rtx (mmsk);
7543 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
7544 gen_rtx_SUBREG (mmsk, xdst, 0)));
7545 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
7547 /* Calculate the approximate square root. */
7548 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
7551 /* Finalize the approximation. */
7552 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
7554 return true;
7557 typedef rtx (*recpe_type) (rtx, rtx);
7559 /* Select reciprocal initial estimate insn depending on machine mode. */
7561 static recpe_type
7562 get_recpe_type (machine_mode mode)
7564 switch (mode)
7566 case SFmode: return (gen_aarch64_frecpesf);
7567 case V2SFmode: return (gen_aarch64_frecpev2sf);
7568 case V4SFmode: return (gen_aarch64_frecpev4sf);
7569 case DFmode: return (gen_aarch64_frecpedf);
7570 case V2DFmode: return (gen_aarch64_frecpev2df);
7571 default: gcc_unreachable ();
7575 typedef rtx (*recps_type) (rtx, rtx, rtx);
7577 /* Select reciprocal series step insn depending on machine mode. */
7579 static recps_type
7580 get_recps_type (machine_mode mode)
7582 switch (mode)
7584 case SFmode: return (gen_aarch64_frecpssf);
7585 case V2SFmode: return (gen_aarch64_frecpsv2sf);
7586 case V4SFmode: return (gen_aarch64_frecpsv4sf);
7587 case DFmode: return (gen_aarch64_frecpsdf);
7588 case V2DFmode: return (gen_aarch64_frecpsv2df);
7589 default: gcc_unreachable ();
7593 /* Emit the instruction sequence to compute the approximation for the division
7594 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
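/* A sketch of the math emitted below, relying on the architectural
   definition of FRECPS (a, b) as 2 - a * b:

     x0     = FRECPE (den)                       estimate of 1/den
     x(n+1) = xn * FRECPS (den, xn)
            = xn * (2 - den * xn)                Newton-Raphson step

   The refined reciprocal is then multiplied by NUM unless NUM is 1.0.  */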
7596 bool
7597 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
7599 machine_mode mode = GET_MODE (quo);
7601 if (GET_MODE_INNER (mode) == HFmode)
7602 return false;
7604 bool use_approx_division_p = (flag_mlow_precision_div
7605 || (aarch64_tune_params.approx_modes->division
7606 & AARCH64_APPROX_MODE (mode)));
7608 if (!flag_finite_math_only
7609 || flag_trapping_math
7610 || !flag_unsafe_math_optimizations
7611 || optimize_function_for_size_p (cfun)
7612 || !use_approx_division_p)
7613 return false;
7615 /* Estimate the approximate reciprocal. */
7616 rtx xrcp = gen_reg_rtx (mode);
7617 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
7619 /* Iterate over the series twice for SF and thrice for DF. */
7620 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
7622 /* Optionally iterate over the series once less for faster performance,
7623 while sacrificing some accuracy. */
7624 if (flag_mlow_precision_div)
7625 iterations--;
7627 /* Iterate over the series to calculate the approximate reciprocal. */
7628 rtx xtmp = gen_reg_rtx (mode);
7629 while (iterations--)
7631 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
7633 if (iterations > 0)
7634 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
7637 if (num != CONST1_RTX (mode))
7639 /* As the approximate reciprocal of DEN is already calculated, only
7640 calculate the approximate division when NUM is not 1.0. */
7641 rtx xnum = force_reg (mode, num);
7642 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
7645 /* Finalize the approximation. */
7646 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
7647 return true;
7650 /* Return the number of instructions that can be issued per cycle. */
7651 static int
7652 aarch64_sched_issue_rate (void)
7654 return aarch64_tune_params.issue_rate;
7657 static int
7658 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
7660 int issue_rate = aarch64_sched_issue_rate ();
7662 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
7666 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
7667 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
7668 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
7670 static int
7671 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
7672 int ready_index)
7674 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
7678 /* Vectorizer cost model target hooks. */
7680 /* Implement targetm.vectorize.builtin_vectorization_cost. */
7681 static int
7682 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
7683 tree vectype,
7684 int misalign ATTRIBUTE_UNUSED)
7686 unsigned elements;
7688 switch (type_of_cost)
7690 case scalar_stmt:
7691 return aarch64_tune_params.vec_costs->scalar_stmt_cost;
7693 case scalar_load:
7694 return aarch64_tune_params.vec_costs->scalar_load_cost;
7696 case scalar_store:
7697 return aarch64_tune_params.vec_costs->scalar_store_cost;
7699 case vector_stmt:
7700 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7702 case vector_load:
7703 return aarch64_tune_params.vec_costs->vec_align_load_cost;
7705 case vector_store:
7706 return aarch64_tune_params.vec_costs->vec_store_cost;
7708 case vec_to_scalar:
7709 return aarch64_tune_params.vec_costs->vec_to_scalar_cost;
7711 case scalar_to_vec:
7712 return aarch64_tune_params.vec_costs->scalar_to_vec_cost;
7714 case unaligned_load:
7715 return aarch64_tune_params.vec_costs->vec_unalign_load_cost;
7717 case unaligned_store:
7718 return aarch64_tune_params.vec_costs->vec_unalign_store_cost;
7720 case cond_branch_taken:
7721 return aarch64_tune_params.vec_costs->cond_taken_branch_cost;
7723 case cond_branch_not_taken:
7724 return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;
7726 case vec_perm:
7727 return aarch64_tune_params.vec_costs->vec_permute_cost;
7729 case vec_promote_demote:
7730 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7732 case vec_construct:
7733 elements = TYPE_VECTOR_SUBPARTS (vectype);
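      /* E.g. a V4SI construction is assumed to cost 4 / 2 + 1 == 3
	 statements' worth of element inserts (a rough heuristic).  */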
7734 return elements / 2 + 1;
7736 default:
7737 gcc_unreachable ();
7741 /* Implement targetm.vectorize.add_stmt_cost. */
7742 static unsigned
7743 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
7744 struct _stmt_vec_info *stmt_info, int misalign,
7745 enum vect_cost_model_location where)
7747 unsigned *cost = (unsigned *) data;
7748 unsigned retval = 0;
7750 if (flag_vect_cost_model)
7752 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
7753 int stmt_cost =
7754 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
7756 /* Statements in an inner loop relative to the loop being
7757 vectorized are weighted more heavily. The value here is
7758 arbitrary and could potentially be improved with analysis. */
7759 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
7760 count *= 50; /* FIXME */
7762 retval = (unsigned) (count * stmt_cost);
7763 cost[where] += retval;
7766 return retval;
7769 static void initialize_aarch64_code_model (struct gcc_options *);
7771 /* Parse the TO_PARSE string and put the architecture struct that it
7772 selects into RES and the architectural features into ISA_FLAGS.
7773 Return an aarch64_parse_opt_result describing the parse result.
7774 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
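/* For example (a sketch; "armv8-a" and "crc" are assumed to be valid
   entries in all_architectures and the extension table): parsing
   "armv8-a+crc" splits at the first '+', matches "armv8-a" against
   all_architectures, and hands "+crc" to aarch64_parse_extension.  */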
7776 static enum aarch64_parse_opt_result
7777 aarch64_parse_arch (const char *to_parse, const struct processor **res,
7778 unsigned long *isa_flags)
7780 char *ext;
7781 const struct processor *arch;
7782 char *str = (char *) alloca (strlen (to_parse) + 1);
7783 size_t len;
7785 strcpy (str, to_parse);
7787 ext = strchr (str, '+');
7789 if (ext != NULL)
7790 len = ext - str;
7791 else
7792 len = strlen (str);
7794 if (len == 0)
7795 return AARCH64_PARSE_MISSING_ARG;
7798 /* Loop through the list of supported ARCHes to find a match. */
7799 for (arch = all_architectures; arch->name != NULL; arch++)
7801 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
7803 unsigned long isa_temp = arch->flags;
7805 if (ext != NULL)
7807 /* TO_PARSE string contains at least one extension. */
7808 enum aarch64_parse_opt_result ext_res
7809 = aarch64_parse_extension (ext, &isa_temp);
7811 if (ext_res != AARCH64_PARSE_OK)
7812 return ext_res;
7814 /* Extension parsing was successful. Confirm the result
7815 arch and ISA flags. */
7816 *res = arch;
7817 *isa_flags = isa_temp;
7818 return AARCH64_PARSE_OK;
7822 /* ARCH name not found in list. */
7823 return AARCH64_PARSE_INVALID_ARG;
7826 /* Parse the TO_PARSE string and put the result tuning in RES and the
7827 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
7828 describing the parse result. If there is an error parsing, RES and
7829 ISA_FLAGS are left unchanged. */
7831 static enum aarch64_parse_opt_result
7832 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
7833 unsigned long *isa_flags)
7835 char *ext;
7836 const struct processor *cpu;
7837 char *str = (char *) alloca (strlen (to_parse) + 1);
7838 size_t len;
7840 strcpy (str, to_parse);
7842 ext = strchr (str, '+');
7844 if (ext != NULL)
7845 len = ext - str;
7846 else
7847 len = strlen (str);
7849 if (len == 0)
7850 return AARCH64_PARSE_MISSING_ARG;
7853 /* Loop through the list of supported CPUs to find a match. */
7854 for (cpu = all_cores; cpu->name != NULL; cpu++)
7856 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
7858 unsigned long isa_temp = cpu->flags;
7861 if (ext != NULL)
7863 /* TO_PARSE string contains at least one extension. */
7864 enum aarch64_parse_opt_result ext_res
7865 = aarch64_parse_extension (ext, &isa_temp);
7867 if (ext_res != AARCH64_PARSE_OK)
7868 return ext_res;
7870 /* Extension parsing was successful. Confirm the result
7871 cpu and ISA flags. */
7872 *res = cpu;
7873 *isa_flags = isa_temp;
7874 return AARCH64_PARSE_OK;
7878 /* CPU name not found in list. */
7879 return AARCH64_PARSE_INVALID_ARG;
7882 /* Parse the TO_PARSE string and put the cpu it selects into RES.
7883 Return an aarch64_parse_opt_result describing the parse result.
7884 If the parsing fails, RES is left unchanged. */
7886 static enum aarch64_parse_opt_result
7887 aarch64_parse_tune (const char *to_parse, const struct processor **res)
7889 const struct processor *cpu;
7890 char *str = (char *) alloca (strlen (to_parse) + 1);
7892 strcpy (str, to_parse);
7894 /* Loop through the list of supported CPUs to find a match. */
7895 for (cpu = all_cores; cpu->name != NULL; cpu++)
7897 if (strcmp (cpu->name, str) == 0)
7899 *res = cpu;
7900 return AARCH64_PARSE_OK;
7904 /* CPU name not found in list. */
7905 return AARCH64_PARSE_INVALID_ARG;
7908 /* Parse TOKEN, which has length LENGTH, to see if it is an option
7909 described in FLAG. If it is, return the index bit for that fusion type.
7910 If not, error (printing OPTION_NAME) and return zero. */
7912 static unsigned int
7913 aarch64_parse_one_option_token (const char *token,
7914 size_t length,
7915 const struct aarch64_flag_desc *flag,
7916 const char *option_name)
7918 for (; flag->name != NULL; flag++)
7920 if (length == strlen (flag->name)
7921 && !strncmp (flag->name, token, length))
7922 return flag->flag;
7925 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
7926 return 0;
7929 /* Parse OPTION which is a comma-separated list of flags to enable.
7930 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
7931 default state we inherit from the CPU tuning structures. OPTION_NAME
7932 gives the top-level option we are parsing in the -moverride string,
7933 for use in error messages. */
7935 static unsigned int
7936 aarch64_parse_boolean_options (const char *option,
7937 const struct aarch64_flag_desc *flags,
7938 unsigned int initial_state,
7939 const char *option_name)
7941 const char separator = '.';
7942 const char* specs = option;
7943 const char* ntoken = option;
7944 unsigned int found_flags = initial_state;
7946 while ((ntoken = strchr (specs, separator)))
7948 size_t token_length = ntoken - specs;
7949 unsigned token_ops = aarch64_parse_one_option_token (specs,
7950 token_length,
7951 flags,
7952 option_name);
7953 /* If we find "none" (or, for simplicity's sake, an error) anywhere
7954 in the token stream, reset the supported operations. So:
7956 adrp+add.cmp+branch.none.adrp+add
7958 would have the result of turning on only adrp+add fusion. */
7959 if (!token_ops)
7960 found_flags = 0;
7962 found_flags |= token_ops;
7963 specs = ++ntoken;
7966 /* The option string ended with a trailing separator; report an error. */
7967 if (!(*specs))
7969 error ("%s string ill-formed\n", option_name);
7970 return 0;
7973 /* We still have one more token to parse. */
7974 size_t token_length = strlen (specs);
7975 unsigned token_ops = aarch64_parse_one_option_token (specs,
7976 token_length,
7977 flags,
7978 option_name);
7979 if (!token_ops)
7980 found_flags = 0;
7982 found_flags |= token_ops;
7983 return found_flags;
7986 /* Support for overriding instruction fusion. */
7988 static void
7989 aarch64_parse_fuse_string (const char *fuse_string,
7990 struct tune_params *tune)
7992 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
7993 aarch64_fusible_pairs,
7994 tune->fusible_ops,
7995 "fuse=");
7998 /* Support for overriding other tuning flags. */
8000 static void
8001 aarch64_parse_tune_string (const char *tune_string,
8002 struct tune_params *tune)
8004 tune->extra_tuning_flags
8005 = aarch64_parse_boolean_options (tune_string,
8006 aarch64_tuning_flags,
8007 tune->extra_tuning_flags,
8008 "tune=");
8011 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
8012 we understand. If it is, extract the option string and hand it off to
8013 the appropriate function. */
8015 void
8016 aarch64_parse_one_override_token (const char* token,
8017 size_t length,
8018 struct tune_params *tune)
8020 const struct aarch64_tuning_override_function *fn
8021 = aarch64_tuning_override_functions;
8023 const char *option_part = strchr (token, '=');
8024 if (!option_part)
8026 error ("tuning string missing in option (%s)", token);
8027 return;
8030 /* Get the length of the option name. */
8031 length = option_part - token;
8032 /* Skip the '=' to get to the option string. */
8033 option_part++;
8035 for (; fn->name != NULL; fn++)
8037 if (!strncmp (fn->name, token, length))
8039 fn->parse_override (option_part, tune);
8040 return;
8044 error ("unknown tuning option (%s)", token);
8045 return;
8048 /* Validate and clamp the TLS size according to the code model in OPTS. */
8050 static void
8051 initialize_aarch64_tls_size (struct gcc_options *opts)
8053 if (aarch64_tls_size == 0)
8054 aarch64_tls_size = 24;
8056 switch (opts->x_aarch64_cmodel_var)
8058 case AARCH64_CMODEL_TINY:
8059 /* Both the default and the maximum TLS size allowed under tiny are 1M, which
8060 needs two instructions to address, so we clamp the size to 24. */
8061 if (aarch64_tls_size > 24)
8062 aarch64_tls_size = 24;
8063 break;
8064 case AARCH64_CMODEL_SMALL:
8065 /* The maximum TLS size allowed under small is 4G. */
8066 if (aarch64_tls_size > 32)
8067 aarch64_tls_size = 32;
8068 break;
8069 case AARCH64_CMODEL_LARGE:
8070 /* The maximum TLS size allowed under large is 16E.
8071 FIXME: 16E requires a 64-bit offset, but we only support a 48-bit offset for now. */
8072 if (aarch64_tls_size > 48)
8073 aarch64_tls_size = 48;
8074 break;
8075 default:
8076 gcc_unreachable ();
8079 return;
8082 /* Parse STRING looking for options in the format:
8083 string :: option:string
8084 option :: name=substring
8085 name :: {a-z}
8086 substring :: defined by option. */
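/* For instance, an -moverride value such as

     fuse=adrp+add.cmp+branch:tune=...

   (reusing the fusion names shown earlier in this file) is split at ':'
   into a "fuse=..." token and a "tune=..." token, each of which is passed
   to aarch64_parse_one_override_token below.  */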
8088 static void
8089 aarch64_parse_override_string (const char* input_string,
8090 struct tune_params* tune)
8092 const char separator = ':';
8093 size_t string_length = strlen (input_string) + 1;
8094 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8095 char *string = string_root;
8096 strncpy (string, input_string, string_length);
8097 string[string_length - 1] = '\0';
8099 char* ntoken = string;
8101 while ((ntoken = strchr (string, separator)))
8103 size_t token_length = ntoken - string;
8104 /* NUL-terminate this substring so it can be handled as a string in its own right. */
8105 *ntoken = '\0';
8106 aarch64_parse_one_override_token (string, token_length, tune);
8107 string = ++ntoken;
8110 /* One last option to parse. */
8111 aarch64_parse_one_override_token (string, strlen (string), tune);
8112 free (string_root);
8116 static void
8117 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8119 /* The logic here is that if we are disabling all frame pointer generation
8120 then we do not need to disable leaf frame pointer generation as a
8121 separate operation. But if we are *only* disabling leaf frame pointer
8122 generation then we set flag_omit_frame_pointer to true, but in
8123 aarch64_frame_pointer_required we return false only for leaf functions.
8125 PR 70044: We have to be careful about being called multiple times for the
8126 same function. Once we have decided to set flag_omit_frame_pointer just
8127 so that we can omit leaf frame pointers, we must then not interpret a
8128 second call as meaning that all frame pointer generation should be
8129 omitted. We do this by setting flag_omit_frame_pointer to a special,
8130 non-zero value. */
8131 if (opts->x_flag_omit_frame_pointer == 2)
8132 opts->x_flag_omit_frame_pointer = 0;
8134 if (opts->x_flag_omit_frame_pointer)
8135 opts->x_flag_omit_leaf_frame_pointer = false;
8136 else if (opts->x_flag_omit_leaf_frame_pointer)
8137 opts->x_flag_omit_frame_pointer = 2;
8139 /* If not optimizing for size, set the default
8140 alignment to what the target wants. */
8141 if (!opts->x_optimize_size)
8143 if (opts->x_align_loops <= 0)
8144 opts->x_align_loops = aarch64_tune_params.loop_align;
8145 if (opts->x_align_jumps <= 0)
8146 opts->x_align_jumps = aarch64_tune_params.jump_align;
8147 if (opts->x_align_functions <= 0)
8148 opts->x_align_functions = aarch64_tune_params.function_align;
8151 /* We default to no pc-relative literal loads. */
8153 aarch64_pcrelative_literal_loads = false;
8155 /* If -mpc-relative-literal-loads is set on the command line, this
8156 implies that the user asked for PC relative literal loads. */
8157 if (opts->x_pcrelative_literal_loads == 1)
8158 aarch64_pcrelative_literal_loads = true;
8160 /* This is PR70113. When building the Linux kernel with
8161 CONFIG_ARM64_ERRATUM_843419, support for relocations
8162 R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8163 removed from the kernel to avoid loading objects with possibly
8164 offending sequences. Without -mpc-relative-literal-loads we would
8165 generate such relocations, preventing the kernel build from
8166 succeeding. */
8167 if (opts->x_pcrelative_literal_loads == 2
8168 && TARGET_FIX_ERR_A53_843419)
8169 aarch64_pcrelative_literal_loads = true;
8171 /* In the tiny memory model it makes no sense to disallow PC relative
8172 literal pool loads. */
8173 if (aarch64_cmodel == AARCH64_CMODEL_TINY
8174 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
8175 aarch64_pcrelative_literal_loads = true;
8177 /* When enabling the lower precision Newton series for the square root, also
8178 enable it for the reciprocal square root, since the latter is an
8179 intermediary step for the former. */
8180 if (flag_mlow_precision_sqrt)
8181 flag_mrecip_low_precision_sqrt = true;
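/* A compact, standalone illustration of the PR 70044 sentinel trick
   described at the top of the function above: the value 2 marks
   "frame pointer omission enabled only so that leaf frame pointers
   can be omitted", which keeps repeated calls idempotent.  The
   variable names below are illustrative, not the real option flags.  */

#include <stdbool.h>
#include <stdio.h>

static int omit_fp;               /* 0, 1, or the private sentinel 2.  */
static bool omit_leaf_fp = true;  /* e.g. -momit-leaf-frame-pointer.  */

static void
after_change (void)
{
  if (omit_fp == 2)     /* Undo our own earlier decision first.  */
    omit_fp = 0;

  if (omit_fp)          /* All frame pointers omitted: leaf flag moot.  */
    omit_leaf_fp = false;
  else if (omit_leaf_fp)
    omit_fp = 2;        /* Remember that only the leaf case asked.  */
}

int
main (void)
{
  after_change ();
  after_change ();      /* A second call must not flip the result.  */
  printf ("omit_fp=%d omit_leaf_fp=%d\n", omit_fp, (int) omit_leaf_fp);
  return 0;
}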
8184 /* 'Unpack' the internal tuning structs and update the options
8185 in OPTS. The caller must have set up selected_tune and selected_arch
8186 as all the other target-specific codegen decisions are
8187 derived from them. */
8189 void
8190 aarch64_override_options_internal (struct gcc_options *opts)
8192 aarch64_tune_flags = selected_tune->flags;
8193 aarch64_tune = selected_tune->sched_core;
8194 /* Make a copy of the tuning parameters attached to the core, which
8195 we may later overwrite. */
8196 aarch64_tune_params = *(selected_tune->tune);
8197 aarch64_architecture_version = selected_arch->architecture_version;
8199 if (opts->x_aarch64_override_tune_string)
8200 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
8201 &aarch64_tune_params);
8203 /* This target defaults to strict volatile bitfields. */
8204 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
8205 opts->x_flag_strict_volatile_bitfields = 1;
8207 initialize_aarch64_code_model (opts);
8208 initialize_aarch64_tls_size (opts);
8210 int queue_depth = 0;
8211 switch (aarch64_tune_params.autoprefetcher_model)
8213 case tune_params::AUTOPREFETCHER_OFF:
8214 queue_depth = -1;
8215 break;
8216 case tune_params::AUTOPREFETCHER_WEAK:
8217 queue_depth = 0;
8218 break;
8219 case tune_params::AUTOPREFETCHER_STRONG:
8220 queue_depth = max_insn_queue_index + 1;
8221 break;
8222 default:
8223 gcc_unreachable ();
8226 /* We don't mind passing in global_options_set here as we don't use
8227 the *options_set structs anyway. */
8228 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
8229 queue_depth,
8230 opts->x_param_values,
8231 global_options_set.x_param_values);
8233 /* Set the L1 cache line size. */
8234 if (selected_cpu->tune->cache_line_size != 0)
8235 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
8236 selected_cpu->tune->cache_line_size,
8237 opts->x_param_values,
8238 global_options_set.x_param_values);
8240 aarch64_override_options_after_change_1 (opts);
8243 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
8244 specified in STR and throw errors if appropriate. Put the results,
8245 if they are valid, in RES and ISA_FLAGS. Return whether the option is
8246 valid. */
8248 static bool
8249 aarch64_validate_mcpu (const char *str, const struct processor **res,
8250 unsigned long *isa_flags)
8252 enum aarch64_parse_opt_result parse_res
8253 = aarch64_parse_cpu (str, res, isa_flags);
8255 if (parse_res == AARCH64_PARSE_OK)
8256 return true;
8258 switch (parse_res)
8260 case AARCH64_PARSE_MISSING_ARG:
8261 error ("missing cpu name in -mcpu=%qs", str);
8262 break;
8263 case AARCH64_PARSE_INVALID_ARG:
8264 error ("unknown value %qs for -mcpu", str);
8265 break;
8266 case AARCH64_PARSE_INVALID_FEATURE:
8267 error ("invalid feature modifier in -mcpu=%qs", str);
8268 break;
8269 default:
8270 gcc_unreachable ();
8273 return false;
8276 /* Validate a command-line -march option. Parse the arch and extensions
8277 (if any) specified in STR and throw errors if appropriate. Put the
8278 results, if they are valid, in RES and ISA_FLAGS. Return whether the
8279 option is valid. */
8281 static bool
8282 aarch64_validate_march (const char *str, const struct processor **res,
8283 unsigned long *isa_flags)
8285 enum aarch64_parse_opt_result parse_res
8286 = aarch64_parse_arch (str, res, isa_flags);
8288 if (parse_res == AARCH64_PARSE_OK)
8289 return true;
8291 switch (parse_res)
8293 case AARCH64_PARSE_MISSING_ARG:
8294 error ("missing arch name in -march=%qs", str);
8295 break;
8296 case AARCH64_PARSE_INVALID_ARG:
8297 error ("unknown value %qs for -march", str);
8298 break;
8299 case AARCH64_PARSE_INVALID_FEATURE:
8300 error ("invalid feature modifier in -march=%qs", str);
8301 break;
8302 default:
8303 gcc_unreachable ();
8306 return false;
8309 /* Validate a command-line -mtune option. Parse the cpu
8310 specified in STR and throw errors if appropriate. Put the
8311 result, if it is valid, in RES. Return whether the option is
8312 valid. */
8314 static bool
8315 aarch64_validate_mtune (const char *str, const struct processor **res)
8317 enum aarch64_parse_opt_result parse_res
8318 = aarch64_parse_tune (str, res);
8320 if (parse_res == AARCH64_PARSE_OK)
8321 return true;
8323 switch (parse_res)
8325 case AARCH64_PARSE_MISSING_ARG:
8326 error ("missing cpu name in -mtune=%qs", str);
8327 break;
8328 case AARCH64_PARSE_INVALID_ARG:
8329 error ("unknown value %qs for -mtune", str);
8330 break;
8331 default:
8332 gcc_unreachable ();
8334 return false;
8337 /* Return the CPU corresponding to the enum CPU.
8338 If it doesn't specify a cpu, return the default. */
8340 static const struct processor *
8341 aarch64_get_tune_cpu (enum aarch64_processor cpu)
8343 if (cpu != aarch64_none)
8344 return &all_cores[cpu];
8346 /* The & 0x3f is to extract the bottom 6 bits that encode the
8347 default cpu as selected by the --with-cpu GCC configure option
8348 in config.gcc.
8349 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
8350 flags mechanism should be reworked to make it more sane. */
8351 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
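/* A sketch of the packing scheme that the "& 0x3f" above (and the
   ">> 6" in aarch64_override_options) assumes: the configure-time
   default keeps a CPU identifier in the low 6 bits and the default
   ISA flag bits above them.  The concrete values here are invented
   for illustration only.  */

#include <stdio.h>

#define CPU_IDENT_BITS 6
#define CPU_IDENT_MASK ((1ul << CPU_IDENT_BITS) - 1)   /* 0x3f */

static unsigned long
pack_cpu_default (unsigned long ident, unsigned long isa_flags)
{
  return (isa_flags << CPU_IDENT_BITS) | (ident & CPU_IDENT_MASK);
}

int
main (void)
{
  unsigned long packed = pack_cpu_default (5, 0x9);
  printf ("ident=%lu isa=0x%lx\n",
          packed & CPU_IDENT_MASK, packed >> CPU_IDENT_BITS);
  return 0;
}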
8354 /* Return the architecture corresponding to the enum ARCH.
8355 If it doesn't specify a valid architecture, return the default. */
8357 static const struct processor *
8358 aarch64_get_arch (enum aarch64_arch arch)
8360 if (arch != aarch64_no_arch)
8361 return &all_architectures[arch];
8363 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8365 return &all_architectures[cpu->arch];
8368 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
8369 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
8370 tuning structs. In particular it must set selected_tune and
8371 aarch64_isa_flags that define the available ISA features and tuning
8372 decisions. It must also set selected_arch as this will be used to
8373 output the .arch asm tags for each function. */
8375 static void
8376 aarch64_override_options (void)
8378 unsigned long cpu_isa = 0;
8379 unsigned long arch_isa = 0;
8380 aarch64_isa_flags = 0;
8382 bool valid_cpu = true;
8383 bool valid_tune = true;
8384 bool valid_arch = true;
8386 selected_cpu = NULL;
8387 selected_arch = NULL;
8388 selected_tune = NULL;
8390 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
8391 If either of -march or -mtune is given, they override their
8392 respective component of -mcpu. */
8393 if (aarch64_cpu_string)
8394 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
8395 &cpu_isa);
8397 if (aarch64_arch_string)
8398 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
8399 &arch_isa);
8401 if (aarch64_tune_string)
8402 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
8404 /* If the user did not specify a processor, choose the default
8405 one for them. This will be the CPU set during configuration using
8406 --with-cpu, otherwise it is "generic". */
8407 if (!selected_cpu)
8409 if (selected_arch)
8411 selected_cpu = &all_cores[selected_arch->ident];
8412 aarch64_isa_flags = arch_isa;
8413 explicit_arch = selected_arch->arch;
8415 else
8417 /* Get default configure-time CPU. */
8418 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
8419 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
8422 if (selected_tune)
8423 explicit_tune_core = selected_tune->ident;
8425 /* If both -mcpu and -march are specified, check that they are architecturally
8426 compatible; warn if they're not, and prefer the -march ISA flags. */
8427 else if (selected_arch)
8429 if (selected_arch->arch != selected_cpu->arch)
8431 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
8432 all_architectures[selected_cpu->arch].name,
8433 selected_arch->name);
8435 aarch64_isa_flags = arch_isa;
8436 explicit_arch = selected_arch->arch;
8437 explicit_tune_core = selected_tune ? selected_tune->ident
8438 : selected_cpu->ident;
8440 else
8442 /* -mcpu but no -march. */
8443 aarch64_isa_flags = cpu_isa;
8444 explicit_tune_core = selected_tune ? selected_tune->ident
8445 : selected_cpu->ident;
8446 gcc_assert (selected_cpu);
8447 selected_arch = &all_architectures[selected_cpu->arch];
8448 explicit_arch = selected_arch->arch;
8451 /* Set the arch as well, as we will need it when outputting
8452 the .arch directive in assembly. */
8453 if (!selected_arch)
8455 gcc_assert (selected_cpu);
8456 selected_arch = &all_architectures[selected_cpu->arch];
8459 if (!selected_tune)
8460 selected_tune = selected_cpu;
8462 #ifndef HAVE_AS_MABI_OPTION
8463 /* The compiler may have been configured with 2.23.* binutils, which does
8464 not have support for ILP32. */
8465 if (TARGET_ILP32)
8466 error ("Assembler does not support -mabi=ilp32");
8467 #endif
8469 /* Make sure we properly set up the explicit options. */
8470 if ((aarch64_cpu_string && valid_cpu)
8471 || (aarch64_tune_string && valid_tune))
8472 gcc_assert (explicit_tune_core != aarch64_none);
8474 if ((aarch64_cpu_string && valid_cpu)
8475 || (aarch64_arch_string && valid_arch))
8476 gcc_assert (explicit_arch != aarch64_no_arch);
8478 aarch64_override_options_internal (&global_options);
8480 /* Save these options as the default ones in case we push and pop them later
8481 while processing functions with potential target attributes. */
8482 target_option_default_node = target_option_current_node
8483 = build_target_option_node (&global_options);
8485 aarch64_register_fma_steering ();
8489 /* Implement targetm.override_options_after_change. */
8491 static void
8492 aarch64_override_options_after_change (void)
8494 aarch64_override_options_after_change_1 (&global_options);
8497 static struct machine_function *
8498 aarch64_init_machine_status (void)
8500 struct machine_function *machine;
8501 machine = ggc_cleared_alloc<machine_function> ();
8502 return machine;
8505 void
8506 aarch64_init_expanders (void)
8508 init_machine_status = aarch64_init_machine_status;
8511 /* Select the code model, taking the PIC options into account. */
8512 static void
8513 initialize_aarch64_code_model (struct gcc_options *opts)
8515 if (opts->x_flag_pic)
8517 switch (opts->x_aarch64_cmodel_var)
8519 case AARCH64_CMODEL_TINY:
8520 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
8521 break;
8522 case AARCH64_CMODEL_SMALL:
8523 #ifdef HAVE_AS_SMALL_PIC_RELOCS
8524 aarch64_cmodel = (flag_pic == 2
8525 ? AARCH64_CMODEL_SMALL_PIC
8526 : AARCH64_CMODEL_SMALL_SPIC);
8527 #else
8528 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
8529 #endif
8530 break;
8531 case AARCH64_CMODEL_LARGE:
8532 sorry ("code model %qs with -f%s", "large",
8533 opts->x_flag_pic > 1 ? "PIC" : "pic");
8534 break;
8535 default:
8536 gcc_unreachable ();
8539 else
8540 aarch64_cmodel = opts->x_aarch64_cmodel_var;
8543 /* Implement TARGET_OPTION_SAVE. */
8545 static void
8546 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
8548 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
8551 /* Implement TARGET_OPTION_RESTORE. Restore the backend codegen decisions
8552 using the information saved in PTR. */
8554 static void
8555 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
8557 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
8558 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8559 opts->x_explicit_arch = ptr->x_explicit_arch;
8560 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
8561 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
8563 aarch64_override_options_internal (opts);
8566 /* Implement TARGET_OPTION_PRINT. */
8568 static void
8569 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
8571 const struct processor *cpu
8572 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8573 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
8574 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
8575 std::string extension
8576 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
8578 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
8579 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
8580 arch->name, extension.c_str ());
8583 static GTY(()) tree aarch64_previous_fndecl;
8585 void
8586 aarch64_reset_previous_fndecl (void)
8588 aarch64_previous_fndecl = NULL;
8591 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
8592 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
8593 make sure optab availability predicates are recomputed when necessary. */
8595 void
8596 aarch64_save_restore_target_globals (tree new_tree)
8598 if (TREE_TARGET_GLOBALS (new_tree))
8599 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
8600 else if (new_tree == target_option_default_node)
8601 restore_target_globals (&default_target_globals);
8602 else
8603 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
8606 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
8607 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
8608 of the function, if such exists. This function may be called multiple
8609 times on a single function so use aarch64_previous_fndecl to avoid
8610 setting up identical state. */
8612 static void
8613 aarch64_set_current_function (tree fndecl)
8615 if (!fndecl || fndecl == aarch64_previous_fndecl)
8616 return;
8618 tree old_tree = (aarch64_previous_fndecl
8619 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
8620 : NULL_TREE);
8622 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
8624 /* If current function has no attributes but the previous one did,
8625 use the default node. */
8626 if (!new_tree && old_tree)
8627 new_tree = target_option_default_node;
8629 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
8630 the default have been handled by aarch64_save_restore_target_globals from
8631 aarch64_pragma_target_parse. */
8632 if (old_tree == new_tree)
8633 return;
8635 aarch64_previous_fndecl = fndecl;
8637 /* First set the target options. */
8638 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
8640 aarch64_save_restore_target_globals (new_tree);
8643 /* Enum describing the various ways we can handle attributes.
8644 In many cases we can reuse the generic option handling machinery. */
8646 enum aarch64_attr_opt_type
8648 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
8649 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
8650 aarch64_attr_enum, /* Attribute sets an enum variable. */
8651 aarch64_attr_custom /* Attribute requires a custom handling function. */
8654 /* All the information needed to handle a target attribute.
8655 NAME is the name of the attribute.
8656 ATTR_TYPE specifies the type of behavior of the attribute as described
8657 in the definition of enum aarch64_attr_opt_type.
8658 ALLOW_NEG is true if the attribute supports a "no-" form.
8659 HANDLER is the function that takes the attribute string and whether
8660 it is a pragma or attribute and handles the option. It is needed only
8661 when the ATTR_TYPE is aarch64_attr_custom.
8662 OPT_NUM is the enum specifying the option that the attribute modifies.
8663 This is needed for attributes that mirror the behavior of a command-line
8664 option, that is, its ATTR_TYPE is aarch64_attr_mask, aarch64_attr_bool or
8665 aarch64_attr_enum. */
8667 struct aarch64_attribute_info
8669 const char *name;
8670 enum aarch64_attr_opt_type attr_type;
8671 bool allow_neg;
8672 bool (*handler) (const char *, const char *);
8673 enum opt_code opt_num;
8676 /* Handle the ARCH_STR argument to the arch= target attribute.
8677 PRAGMA_OR_ATTR is used in potential error messages. */
8679 static bool
8680 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
8682 const struct processor *tmp_arch = NULL;
8683 enum aarch64_parse_opt_result parse_res
8684 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
8686 if (parse_res == AARCH64_PARSE_OK)
8688 gcc_assert (tmp_arch);
8689 selected_arch = tmp_arch;
8690 explicit_arch = selected_arch->arch;
8691 return true;
8694 switch (parse_res)
8696 case AARCH64_PARSE_MISSING_ARG:
8697 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
8698 break;
8699 case AARCH64_PARSE_INVALID_ARG:
8700 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
8701 break;
8702 case AARCH64_PARSE_INVALID_FEATURE:
8703 error ("invalid feature modifier %qs for 'arch' target %s",
8704 str, pragma_or_attr);
8705 break;
8706 default:
8707 gcc_unreachable ();
8710 return false;
8713 /* Handle the argument CPU_STR to the cpu= target attribute.
8714 PRAGMA_OR_ATTR is used in potential error messages. */
8716 static bool
8717 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
8719 const struct processor *tmp_cpu = NULL;
8720 enum aarch64_parse_opt_result parse_res
8721 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
8723 if (parse_res == AARCH64_PARSE_OK)
8725 gcc_assert (tmp_cpu);
8726 selected_tune = tmp_cpu;
8727 explicit_tune_core = selected_tune->ident;
8729 selected_arch = &all_architectures[tmp_cpu->arch];
8730 explicit_arch = selected_arch->arch;
8731 return true;
8734 switch (parse_res)
8736 case AARCH64_PARSE_MISSING_ARG:
8737 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
8738 break;
8739 case AARCH64_PARSE_INVALID_ARG:
8740 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
8741 break;
8742 case AARCH64_PARSE_INVALID_FEATURE:
8743 error ("invalid feature modifier %qs for 'cpu' target %s",
8744 str, pragma_or_attr);
8745 break;
8746 default:
8747 gcc_unreachable ();
8750 return false;
8753 /* Handle the argument STR to the tune= target attribute.
8754 PRAGMA_OR_ATTR is used in potential error messages. */
8756 static bool
8757 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
8759 const struct processor *tmp_tune = NULL;
8760 enum aarch64_parse_opt_result parse_res
8761 = aarch64_parse_tune (str, &tmp_tune);
8763 if (parse_res == AARCH64_PARSE_OK)
8765 gcc_assert (tmp_tune);
8766 selected_tune = tmp_tune;
8767 explicit_tune_core = selected_tune->ident;
8768 return true;
8771 switch (parse_res)
8773 case AARCH64_PARSE_INVALID_ARG:
8774 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
8775 break;
8776 default:
8777 gcc_unreachable ();
8780 return false;
8783 /* Parse an architecture extensions target attribute string specified in STR.
8784 For example "+fp+nosimd". Show any errors if needed. Return TRUE
8785 if successful. Update aarch64_isa_flags to reflect the ISA features
8786 modified.
8787 PRAGMA_OR_ATTR is used in potential error messages. */
8789 static bool
8790 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
8792 enum aarch64_parse_opt_result parse_res;
8793 unsigned long isa_flags = aarch64_isa_flags;
8795 /* We allow "+nothing" at the beginning to clear out all architectural
8796 features if the user wants to handpick specific features. */
8797 if (strncmp ("+nothing", str, 8) == 0)
8799 isa_flags = 0;
8800 str += 8;
8803 parse_res = aarch64_parse_extension (str, &isa_flags);
8805 if (parse_res == AARCH64_PARSE_OK)
8807 aarch64_isa_flags = isa_flags;
8808 return true;
8811 switch (parse_res)
8813 case AARCH64_PARSE_MISSING_ARG:
8814 error ("missing feature modifier in target %s %qs",
8815 pragma_or_attr, str);
8816 break;
8818 case AARCH64_PARSE_INVALID_FEATURE:
8819 error ("invalid feature modifier in target %s %qs",
8820 pragma_or_attr, str);
8821 break;
8823 default:
8824 gcc_unreachable ();
8827 return false;
8830 /* The target attributes that we support. On top of these we also support just
8831 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
8832 handled explicitly in aarch64_process_one_target_attr. */
8834 static const struct aarch64_attribute_info aarch64_attributes[] =
8836 { "general-regs-only", aarch64_attr_mask, false, NULL,
8837 OPT_mgeneral_regs_only },
8838 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
8839 OPT_mfix_cortex_a53_835769 },
8840 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
8841 OPT_mfix_cortex_a53_843419 },
8842 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
8843 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
8844 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
8845 OPT_momit_leaf_frame_pointer },
8846 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
8847 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
8848 OPT_march_ },
8849 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
8850 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
8851 OPT_mtune_ },
8852 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
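/* Illustrative user-level uses of the attributes in the table above;
   the particular arch, cpu and extension names are examples only.  */

/* aarch64_attr_custom: the string is handed to aarch64_handle_attr_arch.  */
void use_arch (void) __attribute__ ((target ("arch=armv8-a+crc")));

/* aarch64_attr_bool with ALLOW_NEG: the "no-" form is accepted.  */
void use_neg (void) __attribute__ ((target ("no-omit-leaf-frame-pointer")));

/* aarch64_attr_mask: sets a bit in target_flags.  */
void use_mask (void) __attribute__ ((target ("strict-align")));

/* aarch64_attr_enum: the argument is mapped through the option enum.  */
void use_enum (void) __attribute__ ((target ("cmodel=small")));

/* A bare ISA string, handled before the table is consulted.  */
void use_isa (void) __attribute__ ((target ("+crc")));

/* Several attributes separated by commas, as parsed further below.  */
void use_many (void) __attribute__ ((target ("cpu=cortex-a57,strict-align")));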
8855 /* Parse ARG_STR which contains the definition of one target attribute.
8856 Show appropriate errors if any or return true if the attribute is valid.
8857 PRAGMA_OR_ATTR holds the string to use in error messages about whether
8858 we're processing a target attribute or pragma. */
8860 static bool
8861 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
8863 bool invert = false;
8865 size_t len = strlen (arg_str);
8867 if (len == 0)
8869 error ("malformed target %s", pragma_or_attr);
8870 return false;
8873 char *str_to_check = (char *) alloca (len + 1);
8874 strcpy (str_to_check, arg_str);
8876 /* Skip leading whitespace. */
8877 while (*str_to_check == ' ' || *str_to_check == '\t')
8878 str_to_check++;
8880 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
8881 It is easier to detect and handle it explicitly here rather than going
8882 through the machinery for the rest of the target attributes in this
8883 function. */
8884 if (*str_to_check == '+')
8885 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
8887 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
8889 invert = true;
8890 str_to_check += 3;
8892 char *arg = strchr (str_to_check, '=');
8894 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
8895 and point ARG to "foo". */
8896 if (arg)
8898 *arg = '\0';
8899 arg++;
8901 const struct aarch64_attribute_info *p_attr;
8902 bool found = false;
8903 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
8905 /* If the names don't match up, or the user has given an argument
8906 to an attribute that doesn't accept one, or didn't give an argument
8907 to an attribute that expects one, fail to match. */
8908 if (strcmp (str_to_check, p_attr->name) != 0)
8909 continue;
8911 found = true;
8912 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
8913 || p_attr->attr_type == aarch64_attr_enum;
8915 if (attr_need_arg_p ^ (arg != NULL))
8917 error ("target %s %qs does not accept an argument",
8918 pragma_or_attr, str_to_check);
8919 return false;
8922 /* If the name matches but the attribute does not allow "no-" versions
8923 then we can't match. */
8924 if (invert && !p_attr->allow_neg)
8926 error ("target %s %qs does not allow a negated form",
8927 pragma_or_attr, str_to_check);
8928 return false;
8931 switch (p_attr->attr_type)
8933 /* Has a custom handler registered.
8934 For example, cpu=, arch=, tune=. */
8935 case aarch64_attr_custom:
8936 gcc_assert (p_attr->handler);
8937 if (!p_attr->handler (arg, pragma_or_attr))
8938 return false;
8939 break;
8941 /* Either set or unset a boolean option. */
8942 case aarch64_attr_bool:
8944 struct cl_decoded_option decoded;
8946 generate_option (p_attr->opt_num, NULL, !invert,
8947 CL_TARGET, &decoded);
8948 aarch64_handle_option (&global_options, &global_options_set,
8949 &decoded, input_location);
8950 break;
8952 /* Set or unset a bit in the target_flags. aarch64_handle_option
8953 should know what mask to apply given the option number. */
8954 case aarch64_attr_mask:
8956 struct cl_decoded_option decoded;
8957 /* We only need to specify the option number.
8958 aarch64_handle_option will know which mask to apply. */
8959 decoded.opt_index = p_attr->opt_num;
8960 decoded.value = !invert;
8961 aarch64_handle_option (&global_options, &global_options_set,
8962 &decoded, input_location);
8963 break;
8965 /* Use the option setting machinery to set an option to an enum. */
8966 case aarch64_attr_enum:
8968 gcc_assert (arg);
8969 bool valid;
8970 int value;
8971 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
8972 &value, CL_TARGET);
8973 if (valid)
8975 set_option (&global_options, NULL, p_attr->opt_num, value,
8976 NULL, DK_UNSPECIFIED, input_location,
8977 global_dc);
8979 else
8981 error ("target %s %s=%s is not valid",
8982 pragma_or_attr, str_to_check, arg);
8984 break;
8986 default:
8987 gcc_unreachable ();
8991 /* If we reached here we either have found an attribute and validated
8992 it or didn't match any. If we matched an attribute but its arguments
8993 were malformed we will have returned false already. */
8994 return found;
8997 /* Count how many times the character C appears in
8998 NULL-terminated string STR. */
9000 static unsigned int
9001 num_occurences_in_str (char c, char *str)
9003 unsigned int res = 0;
9004 while (*str != '\0')
9006 if (*str == c)
9007 res++;
9009 str++;
9012 return res;
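/* Why counting the separator matters: strtok silently collapses
   consecutive delimiters, so "attr1,,attr2" yields only two tokens
   and the empty entry would go unnoticed.  Comparing the number of
   tokens against the number of commas + 1, as
   aarch64_process_target_attr does below, exposes it.  A standalone
   sketch of that check:  */

#include <stdio.h>
#include <string.h>

int
main (void)
{
  char str[] = "attr1,,attr2";
  unsigned int commas = 0, tokens = 0;

  /* Count the commas before strtok mutates the string.  */
  for (const char *p = str; *p != '\0'; p++)
    if (*p == ',')
      commas++;

  for (char *tok = strtok (str, ","); tok; tok = strtok (NULL, ","))
    tokens++;

  if (tokens != commas + 1)
    printf ("malformed list: empty entry detected\n");
  return 0;
}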
9015 /* Parse the tree in ARGS that contains the target attribute information
9016 and update the global target options space. PRAGMA_OR_ATTR is a string
9017 to be used in error messages, specifying whether this is processing
9018 a target attribute or a target pragma. */
9020 bool
9021 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9023 if (TREE_CODE (args) == TREE_LIST)
9027 tree head = TREE_VALUE (args);
9028 if (head)
9030 if (!aarch64_process_target_attr (head, pragma_or_attr))
9031 return false;
9033 args = TREE_CHAIN (args);
9034 } while (args);
9036 return true;
9038 /* We expect to find a string to parse. */
9039 gcc_assert (TREE_CODE (args) == STRING_CST);
9041 size_t len = strlen (TREE_STRING_POINTER (args));
9042 char *str_to_check = (char *) alloca (len + 1);
9043 strcpy (str_to_check, TREE_STRING_POINTER (args));
9045 if (len == 0)
9047 error ("malformed target %s value", pragma_or_attr);
9048 return false;
9051 /* Used to catch empty entries between commas, i.e.
9052 attribute ((target ("attr1,,attr2"))). */
9053 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9055 /* Handle multiple target attributes separated by ','. */
9056 char *token = strtok (str_to_check, ",");
9058 unsigned int num_attrs = 0;
9059 while (token)
9061 num_attrs++;
9062 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9064 error ("target %s %qs is invalid", pragma_or_attr, token);
9065 return false;
9068 token = strtok (NULL, ",");
9071 if (num_attrs != num_commas + 1)
9073 error ("malformed target %s list %qs",
9074 pragma_or_attr, TREE_STRING_POINTER (args));
9075 return false;
9078 return true;
9081 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9082 process attribute ((target ("..."))). */
9084 static bool
9085 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9087 struct cl_target_option cur_target;
9088 bool ret;
9089 tree old_optimize;
9090 tree new_target, new_optimize;
9091 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9093 /* If what we're processing is the current pragma string then the
9094 target option node is already stored in target_option_current_node
9095 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9096 having to re-parse the string. This is especially useful to keep
9097 arm_neon.h compile times down since that header contains a lot
9098 of intrinsics enclosed in pragmas. */
9099 if (!existing_target && args == current_target_pragma)
9101 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9102 return true;
9104 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9106 old_optimize = build_optimization_node (&global_options);
9107 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9109 /* If the function changed the optimization levels as well as setting
9110 target options, start with the optimizations specified. */
9111 if (func_optimize && func_optimize != old_optimize)
9112 cl_optimization_restore (&global_options,
9113 TREE_OPTIMIZATION (func_optimize));
9115 /* Save the current target options to restore at the end. */
9116 cl_target_option_save (&cur_target, &global_options);
9118 /* If fndecl already has some target attributes applied to it, unpack
9119 them so that we add this attribute on top of them, rather than
9120 overwriting them. */
9121 if (existing_target)
9123 struct cl_target_option *existing_options
9124 = TREE_TARGET_OPTION (existing_target);
9126 if (existing_options)
9127 cl_target_option_restore (&global_options, existing_options);
9129 else
9130 cl_target_option_restore (&global_options,
9131 TREE_TARGET_OPTION (target_option_current_node));
9134 ret = aarch64_process_target_attr (args, "attribute");
9136 /* Set up any additional state. */
9137 if (ret)
9139 aarch64_override_options_internal (&global_options);
9140 /* Initialize SIMD builtins if we haven't already.
9141 Set current_target_pragma to NULL for the duration so that
9142 the builtin initialization code doesn't try to tag the functions
9143 being built with the attributes specified by any current pragma, thus
9144 going into an infinite recursion. */
9145 if (TARGET_SIMD)
9147 tree saved_current_target_pragma = current_target_pragma;
9148 current_target_pragma = NULL;
9149 aarch64_init_simd_builtins ();
9150 current_target_pragma = saved_current_target_pragma;
9152 new_target = build_target_option_node (&global_options);
9154 else
9155 new_target = NULL;
9157 new_optimize = build_optimization_node (&global_options);
9159 if (fndecl && ret)
9161 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
9163 if (old_optimize != new_optimize)
9164 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
9167 cl_target_option_restore (&global_options, &cur_target);
9169 if (old_optimize != new_optimize)
9170 cl_optimization_restore (&global_options,
9171 TREE_OPTIMIZATION (old_optimize));
9172 return ret;
9175 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
9176 tri-bool options (yes, no, don't care) and the default value is
9177 DEF, determine whether to reject inlining. */
9179 static bool
9180 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
9181 int dont_care, int def)
9183 /* If the callee doesn't care, always allow inlining. */
9184 if (callee == dont_care)
9185 return true;
9187 /* If the caller doesn't care, always allow inlining. */
9188 if (caller == dont_care)
9189 return true;
9191 /* Otherwise, allow inlining if either the callee and caller values
9192 agree, or if the callee is using the default value. */
9193 return (callee == caller || callee == def);
9196 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
9197 to inline CALLEE into CALLER based on target-specific info.
9198 Make sure that the caller and callee have compatible architectural
9199 features. Then go through the other possible target attributes
9200 and see if they can block inlining. Try not to reject always_inline
9201 callees unless they are incompatible architecturally. */
9203 static bool
9204 aarch64_can_inline_p (tree caller, tree callee)
9206 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
9207 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
9209 /* If callee has no option attributes, then it is ok to inline. */
9210 if (!callee_tree)
9211 return true;
9213 struct cl_target_option *caller_opts
9214 = TREE_TARGET_OPTION (caller_tree ? caller_tree
9215 : target_option_default_node);
9217 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
9220 /* Callee's ISA flags should be a subset of the caller's. */
9221 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
9222 != callee_opts->x_aarch64_isa_flags)
9223 return false;
9225 /* Allow non-strict-aligned functions to be inlined into
9226 strict-aligned ones. */
9227 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
9228 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
9229 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
9230 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
9231 return false;
9233 bool always_inline = lookup_attribute ("always_inline",
9234 DECL_ATTRIBUTES (callee));
9236 /* If the architectural features match up and the callee is always_inline
9237 then the other attributes don't matter. */
9238 if (always_inline)
9239 return true;
9241 if (caller_opts->x_aarch64_cmodel_var
9242 != callee_opts->x_aarch64_cmodel_var)
9243 return false;
9245 if (caller_opts->x_aarch64_tls_dialect
9246 != callee_opts->x_aarch64_tls_dialect)
9247 return false;
9249 /* Honour explicit requests to work around errata. */
9250 if (!aarch64_tribools_ok_for_inlining_p (
9251 caller_opts->x_aarch64_fix_a53_err835769,
9252 callee_opts->x_aarch64_fix_a53_err835769,
9253 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
9254 return false;
9256 if (!aarch64_tribools_ok_for_inlining_p (
9257 caller_opts->x_aarch64_fix_a53_err843419,
9258 callee_opts->x_aarch64_fix_a53_err843419,
9259 2, TARGET_FIX_ERR_A53_843419))
9260 return false;
9262 /* If the user explicitly specified -momit-leaf-frame-pointer for the
9263 caller and callee and they don't match up, reject inlining. */
9264 if (!aarch64_tribools_ok_for_inlining_p (
9265 caller_opts->x_flag_omit_leaf_frame_pointer,
9266 callee_opts->x_flag_omit_leaf_frame_pointer,
9267 2, 1))
9268 return false;
9270 /* If the callee has specific tuning overrides, respect them. */
9271 if (callee_opts->x_aarch64_override_tune_string != NULL
9272 && caller_opts->x_aarch64_override_tune_string == NULL)
9273 return false;
9275 /* If the user specified tuning override strings for the
9276 caller and callee and they don't match up, reject inlining.
9277 We just do a string compare here, we don't analyze the meaning
9278 of the string, as it would be too costly for little gain. */
9279 if (callee_opts->x_aarch64_override_tune_string
9280 && caller_opts->x_aarch64_override_tune_string
9281 && (strcmp (callee_opts->x_aarch64_override_tune_string,
9282 caller_opts->x_aarch64_override_tune_string) != 0))
9283 return false;
9285 return true;
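/* The ISA-subset rule above, seen from user code (the "+crc" extension
   is just an example): a callee that enables ISA bits the caller lacks
   is never inlined, not even when it is marked always_inline, because
   the architectural check runs before the always_inline shortcut.  */

__attribute__ ((target ("+crc")))
static inline int
crc_helper (int x)
{
  return x + 1;   /* Stand-in for code that needs the CRC extension.  */
}

int
plain_caller (int x)
{
  /* Here (caller_isa & callee_isa) != callee_isa, so
     aarch64_can_inline_p returns false and the call stays out of line
     (unless the caller is itself compiled with CRC enabled).  */
  return crc_helper (x);
}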
9288 /* Return true if SYMBOL_REF X binds locally. */
9290 static bool
9291 aarch64_symbol_binds_local_p (const_rtx x)
9293 return (SYMBOL_REF_DECL (x)
9294 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
9295 : SYMBOL_REF_LOCAL_P (x));
9298 /* Return true if SYMBOL_REF X is thread local. */
9299 static bool
9300 aarch64_tls_symbol_p (rtx x)
9302 if (! TARGET_HAVE_TLS)
9303 return false;
9305 if (GET_CODE (x) != SYMBOL_REF)
9306 return false;
9308 return SYMBOL_REF_TLS_MODEL (x) != 0;
9311 /* Classify a TLS symbol into one of the TLS kinds. */
9312 enum aarch64_symbol_type
9313 aarch64_classify_tls_symbol (rtx x)
9315 enum tls_model tls_kind = tls_symbolic_operand_type (x);
9317 switch (tls_kind)
9319 case TLS_MODEL_GLOBAL_DYNAMIC:
9320 case TLS_MODEL_LOCAL_DYNAMIC:
9321 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
9323 case TLS_MODEL_INITIAL_EXEC:
9324 switch (aarch64_cmodel)
9326 case AARCH64_CMODEL_TINY:
9327 case AARCH64_CMODEL_TINY_PIC:
9328 return SYMBOL_TINY_TLSIE;
9329 default:
9330 return SYMBOL_SMALL_TLSIE;
9333 case TLS_MODEL_LOCAL_EXEC:
9334 if (aarch64_tls_size == 12)
9335 return SYMBOL_TLSLE12;
9336 else if (aarch64_tls_size == 24)
9337 return SYMBOL_TLSLE24;
9338 else if (aarch64_tls_size == 32)
9339 return SYMBOL_TLSLE32;
9340 else if (aarch64_tls_size == 48)
9341 return SYMBOL_TLSLE48;
9342 else
9343 gcc_unreachable ();
9345 case TLS_MODEL_EMULATED:
9346 case TLS_MODEL_NONE:
9347 return SYMBOL_FORCE_TO_MEM;
9349 default:
9350 gcc_unreachable ();
9354 /* Return the method that should be used to access SYMBOL_REF or
9355 LABEL_REF X. */
9357 enum aarch64_symbol_type
9358 aarch64_classify_symbol (rtx x, rtx offset)
9360 if (GET_CODE (x) == LABEL_REF)
9362 switch (aarch64_cmodel)
9364 case AARCH64_CMODEL_LARGE:
9365 return SYMBOL_FORCE_TO_MEM;
9367 case AARCH64_CMODEL_TINY_PIC:
9368 case AARCH64_CMODEL_TINY:
9369 return SYMBOL_TINY_ABSOLUTE;
9371 case AARCH64_CMODEL_SMALL_SPIC:
9372 case AARCH64_CMODEL_SMALL_PIC:
9373 case AARCH64_CMODEL_SMALL:
9374 return SYMBOL_SMALL_ABSOLUTE;
9376 default:
9377 gcc_unreachable ();
9381 if (GET_CODE (x) == SYMBOL_REF)
9383 if (aarch64_tls_symbol_p (x))
9384 return aarch64_classify_tls_symbol (x);
9386 switch (aarch64_cmodel)
9388 case AARCH64_CMODEL_TINY:
9389 /* When we retrieve a symbol + offset address, we have to make sure
9390 the offset does not cause overflow of the final address. But
9391 we have no way of knowing the address of the symbol at compile
9392 time, so we can't accurately say if the distance between the PC
9393 and symbol + offset is outside the addressable range of +/-1M in
9394 the TINY code model. So we rely on images not being greater than
9395 1M, cap the offset at 1M, and require anything beyond that to be
9396 loaded using an alternative mechanism. Furthermore, if the symbol
9397 is a weak reference to something that isn't known to resolve to a
9398 symbol in this module, then force it to memory. */
9399 if ((SYMBOL_REF_WEAK (x)
9400 && !aarch64_symbol_binds_local_p (x))
9401 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
9402 return SYMBOL_FORCE_TO_MEM;
9403 return SYMBOL_TINY_ABSOLUTE;
9405 case AARCH64_CMODEL_SMALL:
9406 /* Same reasoning as the tiny code model, but the offset cap here is
9407 4G. */
9408 if ((SYMBOL_REF_WEAK (x)
9409 && !aarch64_symbol_binds_local_p (x))
9410 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
9411 HOST_WIDE_INT_C (4294967264)))
9412 return SYMBOL_FORCE_TO_MEM;
9413 return SYMBOL_SMALL_ABSOLUTE;
9415 case AARCH64_CMODEL_TINY_PIC:
9416 if (!aarch64_symbol_binds_local_p (x))
9417 return SYMBOL_TINY_GOT;
9418 return SYMBOL_TINY_ABSOLUTE;
9420 case AARCH64_CMODEL_SMALL_SPIC:
9421 case AARCH64_CMODEL_SMALL_PIC:
9422 if (!aarch64_symbol_binds_local_p (x))
9423 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
9424 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
9425 return SYMBOL_SMALL_ABSOLUTE;
9427 case AARCH64_CMODEL_LARGE:
9428 /* This is alright even in PIC code as the constant
9429 pool reference is always PC relative and within
9430 the same translation unit. */
9431 if (CONSTANT_POOL_ADDRESS_P (x))
9432 return SYMBOL_SMALL_ABSOLUTE;
9433 else
9434 return SYMBOL_FORCE_TO_MEM;
9436 default:
9437 gcc_unreachable ();
9441 /* By default push everything into the constant pool. */
9442 return SYMBOL_FORCE_TO_MEM;
9445 bool
9446 aarch64_constant_address_p (rtx x)
9448 return (CONSTANT_P (x) && memory_address_p (DImode, x));
9451 bool
9452 aarch64_legitimate_pic_operand_p (rtx x)
9454 if (GET_CODE (x) == SYMBOL_REF
9455 || (GET_CODE (x) == CONST
9456 && GET_CODE (XEXP (x, 0)) == PLUS
9457 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
9458 return false;
9460 return true;
9463 /* Return true if X holds a valid quarter-precision floating-point
9464 immediate or the floating-point constant +0.0. */
9465 static bool
9466 aarch64_valid_floating_const (machine_mode mode, rtx x)
9468 if (!CONST_DOUBLE_P (x))
9469 return false;
9471 if (aarch64_float_const_zero_rtx_p (x))
9472 return true;
9474 /* Beyond +0.0, which was accepted above, we only handle SFmode and DFmode. */
9475 if (!(mode == SFmode || mode == DFmode))
9476 return false;
9478 return aarch64_float_const_representable_p (x);
9481 static bool
9482 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
9484 /* Do not allow vector struct mode constants. We could support
9485 0 and -1 easily, but they need support in aarch64-simd.md. */
9486 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
9487 return false;
9489 /* This could probably go away because
9490 we now decompose CONST_INTs according to expand_mov_immediate. */
9491 if ((GET_CODE (x) == CONST_VECTOR
9492 && aarch64_simd_valid_immediate (x, mode, false, NULL))
9493 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
9494 return !targetm.cannot_force_const_mem (mode, x);
9496 if (GET_CODE (x) == HIGH
9497 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
9498 return true;
9500 return aarch64_constant_address_p (x);
9503 rtx
9504 aarch64_load_tp (rtx target)
9506 if (!target
9507 || GET_MODE (target) != Pmode
9508 || !register_operand (target, Pmode))
9509 target = gen_reg_rtx (Pmode);
9511 /* Can return in any reg. */
9512 emit_insn (gen_aarch64_load_tp_hard (target));
9513 return target;
9516 /* On AAPCS systems, this is the "struct __va_list". */
9517 static GTY(()) tree va_list_type;
9519 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
9520 Return the type to use as __builtin_va_list.
9522 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
9524 struct __va_list
9526 void *__stack;
9527 void *__gr_top;
9528 void *__vr_top;
9529 int __gr_offs;
9530 int __vr_offs;
9531 }; */
9533 static tree
9534 aarch64_build_builtin_va_list (void)
9536 tree va_list_name;
9537 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9539 /* Create the type. */
9540 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
9541 /* Give it the required name. */
9542 va_list_name = build_decl (BUILTINS_LOCATION,
9543 TYPE_DECL,
9544 get_identifier ("__va_list"),
9545 va_list_type);
9546 DECL_ARTIFICIAL (va_list_name) = 1;
9547 TYPE_NAME (va_list_type) = va_list_name;
9548 TYPE_STUB_DECL (va_list_type) = va_list_name;
9550 /* Create the fields. */
9551 f_stack = build_decl (BUILTINS_LOCATION,
9552 FIELD_DECL, get_identifier ("__stack"),
9553 ptr_type_node);
9554 f_grtop = build_decl (BUILTINS_LOCATION,
9555 FIELD_DECL, get_identifier ("__gr_top"),
9556 ptr_type_node);
9557 f_vrtop = build_decl (BUILTINS_LOCATION,
9558 FIELD_DECL, get_identifier ("__vr_top"),
9559 ptr_type_node);
9560 f_groff = build_decl (BUILTINS_LOCATION,
9561 FIELD_DECL, get_identifier ("__gr_offs"),
9562 integer_type_node);
9563 f_vroff = build_decl (BUILTINS_LOCATION,
9564 FIELD_DECL, get_identifier ("__vr_offs"),
9565 integer_type_node);
9567 /* Tell the tree-stdarg pass about our internal offset fields.
9568 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
9569 purposes, to identify whether the code is updating the va_list
9570 internal offset fields in an irregular way. */
9571 va_list_gpr_counter_field = f_groff;
9572 va_list_fpr_counter_field = f_vroff;
9574 DECL_ARTIFICIAL (f_stack) = 1;
9575 DECL_ARTIFICIAL (f_grtop) = 1;
9576 DECL_ARTIFICIAL (f_vrtop) = 1;
9577 DECL_ARTIFICIAL (f_groff) = 1;
9578 DECL_ARTIFICIAL (f_vroff) = 1;
9580 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
9581 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
9582 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
9583 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
9584 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
9586 TYPE_FIELDS (va_list_type) = f_stack;
9587 DECL_CHAIN (f_stack) = f_grtop;
9588 DECL_CHAIN (f_grtop) = f_vrtop;
9589 DECL_CHAIN (f_vrtop) = f_groff;
9590 DECL_CHAIN (f_groff) = f_vroff;
9592 /* Compute its layout. */
9593 layout_type (va_list_type);
9595 return va_list_type;
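/* How the structure built above is consumed, from the point of view of
   ordinary user code: va_start fills in the five fields, and va_arg
   walks general-register arguments through __gr_top/__gr_offs,
   floating-point and SIMD arguments through __vr_top/__vr_offs, and
   falls back to __stack once a save area is exhausted (see
   aarch64_gimplify_va_arg_expr below).  The portable source is just
   standard <stdarg.h>:  */

#include <stdarg.h>
#include <stdio.h>

static double
sum_doubles (int count, ...)
{
  va_list ap;              /* A struct __va_list on AArch64.  */
  double total = 0.0;

  va_start (ap, count);    /* Initializes all five fields.  */
  for (int i = 0; i < count; i++)
    total += va_arg (ap, double);   /* Read via __vr_top/__vr_offs.  */
  va_end (ap);
  return total;
}

int
main (void)
{
  printf ("%f\n", sum_doubles (3, 1.0, 2.0, 3.0));
  return 0;
}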
9598 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
9599 static void
9600 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
9602 const CUMULATIVE_ARGS *cum;
9603 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9604 tree stack, grtop, vrtop, groff, vroff;
9605 tree t;
9606 int gr_save_area_size = cfun->va_list_gpr_size;
9607 int vr_save_area_size = cfun->va_list_fpr_size;
9608 int vr_offset;
9610 cum = &crtl->args.info;
9611 if (cfun->va_list_gpr_size)
9612 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
9613 cfun->va_list_gpr_size);
9614 if (cfun->va_list_fpr_size)
9615 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
9616 * UNITS_PER_VREG, cfun->va_list_fpr_size);
9618 if (!TARGET_FLOAT)
9620 gcc_assert (cum->aapcs_nvrn == 0);
9621 vr_save_area_size = 0;
9624 f_stack = TYPE_FIELDS (va_list_type_node);
9625 f_grtop = DECL_CHAIN (f_stack);
9626 f_vrtop = DECL_CHAIN (f_grtop);
9627 f_groff = DECL_CHAIN (f_vrtop);
9628 f_vroff = DECL_CHAIN (f_groff);
9630 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
9631 NULL_TREE);
9632 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
9633 NULL_TREE);
9634 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
9635 NULL_TREE);
9636 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
9637 NULL_TREE);
9638 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
9639 NULL_TREE);
9641 /* Emit code to initialize STACK, which points to the next varargs stack
9642 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
9643 by named arguments. STACK is 8-byte aligned. */
9644 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
9645 if (cum->aapcs_stack_size > 0)
9646 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
9647 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
9648 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9650 /* Emit code to initialize GRTOP, the top of the GR save area.
9651 virtual_incoming_args_rtx should have been 16-byte aligned. */
9652 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
9653 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
9654 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9656 /* Emit code to initialize VRTOP, the top of the VR save area.
9657 This address is gr_save_area_bytes below GRTOP, rounded
9658 down to the next 16-byte boundary. */
9659 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
9660 vr_offset = ROUND_UP (gr_save_area_size,
9661 STACK_BOUNDARY / BITS_PER_UNIT);
9663 if (vr_offset)
9664 t = fold_build_pointer_plus_hwi (t, -vr_offset);
9665 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
9666 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9668 /* Emit code to initialize GROFF, the offset from GRTOP of the
9669 next GPR argument. */
9670 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
9671 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
9672 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9674 /* Likewise emit code to initialize VROFF, the offset from VRTOP
9675 of the next VR argument. */
9676 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
9677 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
9678 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9681 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
9683 static tree
9684 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
9685 gimple_seq *post_p ATTRIBUTE_UNUSED)
9687 tree addr;
9688 bool indirect_p;
9689 bool is_ha; /* is HFA or HVA. */
9690 bool dw_align; /* double-word align. */
9691 machine_mode ag_mode = VOIDmode;
9692 int nregs;
9693 machine_mode mode;
9695 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9696 tree stack, f_top, f_off, off, arg, roundup, on_stack;
9697 HOST_WIDE_INT size, rsize, adjust, align;
9698 tree t, u, cond1, cond2;
9700 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
9701 if (indirect_p)
9702 type = build_pointer_type (type);
9704 mode = TYPE_MODE (type);
9706 f_stack = TYPE_FIELDS (va_list_type_node);
9707 f_grtop = DECL_CHAIN (f_stack);
9708 f_vrtop = DECL_CHAIN (f_grtop);
9709 f_groff = DECL_CHAIN (f_vrtop);
9710 f_vroff = DECL_CHAIN (f_groff);
9712 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
9713 f_stack, NULL_TREE);
9714 size = int_size_in_bytes (type);
9715 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
9717 dw_align = false;
9718 adjust = 0;
9719 if (aarch64_vfp_is_call_or_return_candidate (mode,
9720 type,
9721 &ag_mode,
9722 &nregs,
9723 &is_ha))
9725 /* TYPE passed in fp/simd registers. */
9726 if (!TARGET_FLOAT)
9727 aarch64_err_no_fpadvsimd (mode, "varargs");
9729 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
9730 unshare_expr (valist), f_vrtop, NULL_TREE);
9731 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
9732 unshare_expr (valist), f_vroff, NULL_TREE);
9734 rsize = nregs * UNITS_PER_VREG;
9736 if (is_ha)
9738 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
9739 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
9741 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
9742 && size < UNITS_PER_VREG)
9744 adjust = UNITS_PER_VREG - size;
9747 else
9749 /* TYPE passed in general registers. */
9750 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
9751 unshare_expr (valist), f_grtop, NULL_TREE);
9752 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
9753 unshare_expr (valist), f_groff, NULL_TREE);
9754 rsize = ROUND_UP (size, UNITS_PER_WORD);
9755 nregs = rsize / UNITS_PER_WORD;
9757 if (align > 8)
9758 dw_align = true;
9760 if (BLOCK_REG_PADDING (mode, type, 1) == downward
9761 && size < UNITS_PER_WORD)
9763 adjust = UNITS_PER_WORD - size;
9767 /* Get a local temporary for the field value. */
9768 off = get_initialized_tmp_var (f_off, pre_p, NULL);
9770 /* Emit code to branch if off >= 0. */
9771 t = build2 (GE_EXPR, boolean_type_node, off,
9772 build_int_cst (TREE_TYPE (off), 0));
9773 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
9775 if (dw_align)
9777 /* Emit: offs = (offs + 15) & -16. */
9778 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9779 build_int_cst (TREE_TYPE (off), 15));
9780 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
9781 build_int_cst (TREE_TYPE (off), -16));
9782 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
9784 else
9785 roundup = NULL;
9787 /* Update ap.__[g|v]r_offs */
9788 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9789 build_int_cst (TREE_TYPE (off), rsize));
9790 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
9792 /* String up. */
9793 if (roundup)
9794 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9796 /* [cond2] if (ap.__[g|v]r_offs > 0) */
9797 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
9798 build_int_cst (TREE_TYPE (f_off), 0));
9799 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
9801 /* String up: make sure the assignment happens before the use. */
9802 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
9803 COND_EXPR_ELSE (cond1) = t;
9805 /* Prepare the trees handling the argument that is passed on the stack;
9806 the top-level node will be stored in ON_STACK. */
9807 arg = get_initialized_tmp_var (stack, pre_p, NULL);
9808 if (align > 8)
9810 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
9811 t = fold_convert (intDI_type_node, arg);
9812 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9813 build_int_cst (TREE_TYPE (t), 15));
9814 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9815 build_int_cst (TREE_TYPE (t), -16));
9816 t = fold_convert (TREE_TYPE (arg), t);
9817 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
9819 else
9820 roundup = NULL;
9821 /* Advance ap.__stack */
9822 t = fold_convert (intDI_type_node, arg);
9823 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9824 build_int_cst (TREE_TYPE (t), size + 7));
9825 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9826 build_int_cst (TREE_TYPE (t), -8));
9827 t = fold_convert (TREE_TYPE (arg), t);
9828 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
9829 /* String up roundup and advance. */
9830 if (roundup)
9831 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9832 /* String up with arg */
9833 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
9834 /* Big-endianness related address adjustment. */
9835 if (BLOCK_REG_PADDING (mode, type, 1) == downward
9836 && size < UNITS_PER_WORD)
9838 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
9839 size_int (UNITS_PER_WORD - size));
9840 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
9843 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
9844 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
9846 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
9847 t = off;
9848 if (adjust)
9849 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
9850 build_int_cst (TREE_TYPE (off), adjust));
9852 t = fold_convert (sizetype, t);
9853 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
9855 if (is_ha)
9857 /* type ha; // treat as "struct {ftype field[n];}"
9858 ... [computing offs]
9859 for (i = 0; i < nregs; ++i, offs += 16)
9860 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
9861 return ha; */
9862 int i;
9863 tree tmp_ha, field_t, field_ptr_t;
9865 /* Declare a local variable. */
9866 tmp_ha = create_tmp_var_raw (type, "ha");
9867 gimple_add_tmp_var (tmp_ha);
9869 /* Establish the base type. */
9870 switch (ag_mode)
9872 case SFmode:
9873 field_t = float_type_node;
9874 field_ptr_t = float_ptr_type_node;
9875 break;
9876 case DFmode:
9877 field_t = double_type_node;
9878 field_ptr_t = double_ptr_type_node;
9879 break;
9880 case TFmode:
9881 field_t = long_double_type_node;
9882 field_ptr_t = long_double_ptr_type_node;
9883 break;
9884 /* Half precision and quad precision are not fully supported yet.
9885 Enable the following code once that support is complete. We still
9886 need to find the correct type node for __fp16 *. */
9887 #if 0
9888 case HFmode:
9889 field_t = float_type_node;
9890 field_ptr_t = float_ptr_type_node;
9891 break;
9892 #endif
9893 case V2SImode:
9894 case V4SImode:
9896 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
9897 field_t = build_vector_type_for_mode (innertype, ag_mode);
9898 field_ptr_t = build_pointer_type (field_t);
9900 break;
9901 default:
9902 gcc_assert (0);
9905 /* *(field_ptr_t)&ha = *(field_ptr_t)vr_saved_area */
9906 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
9907 addr = t;
9908 t = fold_convert (field_ptr_t, addr);
9909 t = build2 (MODIFY_EXPR, field_t,
9910 build1 (INDIRECT_REF, field_t, tmp_ha),
9911 build1 (INDIRECT_REF, field_t, t));
9913 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
9914 for (i = 1; i < nregs; ++i)
9916 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
9917 u = fold_convert (field_ptr_t, addr);
9918 u = build2 (MODIFY_EXPR, field_t,
9919 build2 (MEM_REF, field_t, tmp_ha,
9920 build_int_cst (field_ptr_t,
9921 (i *
9922 int_size_in_bytes (field_t)))),
9923 build1 (INDIRECT_REF, field_t, u));
9924 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
9927 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
9928 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
9931 COND_EXPR_ELSE (cond2) = t;
9932 addr = fold_convert (build_pointer_type (type), cond1);
9933 addr = build_va_arg_indirect_ref (addr);
9935 if (indirect_p)
9936 addr = build_va_arg_indirect_ref (addr);
9938 return addr;
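/* The rounding idiom used twice above, (x + (align - 1)) & -align,
   rounds X up to the next multiple of ALIGN when ALIGN is a power of
   two: +15 & -16 for the 16-byte case and +7 & -8 for the stack-slot
   case.  A standalone check of the idiom:  */

#include <assert.h>
#include <stdint.h>

static intptr_t
round_up_pow2 (intptr_t x, intptr_t align)  /* ALIGN: power of two.  */
{
  return (x + (align - 1)) & -align;
}

int
main (void)
{
  assert (round_up_pow2 (0, 16) == 0);
  assert (round_up_pow2 (1, 16) == 16);    /* (1 + 15) & -16  */
  assert (round_up_pow2 (16, 16) == 16);
  assert (round_up_pow2 (17, 8) == 24);    /* (17 + 7) & -8   */
  return 0;
}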
9941 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
9943 static void
9944 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
9945 tree type, int *pretend_size ATTRIBUTE_UNUSED,
9946 int no_rtl)
9948 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9949 CUMULATIVE_ARGS local_cum;
9950 int gr_saved = cfun->va_list_gpr_size;
9951 int vr_saved = cfun->va_list_fpr_size;
9953 /* The caller has advanced CUM up to, but not beyond, the last named
9954 argument. Advance a local copy of CUM past the last "real" named
9955 argument, to find out how many registers are left over. */
9956 local_cum = *cum;
9957 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
9959 /* Find out how many registers we need to save.
9960 Honor the tree-stdarg analysis results. */
9961 if (cfun->va_list_gpr_size)
9962 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
9963 cfun->va_list_gpr_size / UNITS_PER_WORD);
9964 if (cfun->va_list_fpr_size)
9965 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
9966 cfun->va_list_fpr_size / UNITS_PER_VREG);
9968 if (!TARGET_FLOAT)
9970 gcc_assert (local_cum.aapcs_nvrn == 0);
9971 vr_saved = 0;
9974 if (!no_rtl)
9976 if (gr_saved > 0)
9978 rtx ptr, mem;
9980 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
9981 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
9982 - gr_saved * UNITS_PER_WORD);
9983 mem = gen_frame_mem (BLKmode, ptr);
9984 set_mem_alias_set (mem, get_varargs_alias_set ());
9986 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
9987 mem, gr_saved);
9989 if (vr_saved > 0)
9991 /* We can't use move_block_from_reg, because it will use
9992 the wrong mode, storing D regs only. */
9993 machine_mode mode = TImode;
9994 int off, i, vr_start;
9996 /* Set OFF to the offset from virtual_incoming_args_rtx of
9997 the first vector register. The VR save area lies below
9998 the GR one, and is aligned to 16 bytes. */
9999 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10000 STACK_BOUNDARY / BITS_PER_UNIT);
10001 off -= vr_saved * UNITS_PER_VREG;
10003 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10004 for (i = 0; i < vr_saved; ++i)
10006 rtx ptr, mem;
10008 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10009 mem = gen_frame_mem (mode, ptr);
10010 set_mem_alias_set (mem, get_varargs_alias_set ());
10011 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10012 off += UNITS_PER_VREG;
10017 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10018 any complication of having crtl->args.pretend_args_size changed. */
10019 cfun->machine->frame.saved_varargs_size
10020 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10021 STACK_BOUNDARY / BITS_PER_UNIT)
10022 + vr_saved * UNITS_PER_VREG);
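/* For example, with gr_saved == 3 and vr_saved == 2 this comes to
   ROUND_UP (3 * 8, 16) + 2 * 16 = 32 + 32 = 64 bytes of save area.  */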
10025 static void
10026 aarch64_conditional_register_usage (void)
10028 int i;
10029 if (!TARGET_FLOAT)
10031 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10033 fixed_regs[i] = 1;
10034 call_used_regs[i] = 1;
10039 /* Walk down the type tree of TYPE counting consecutive base elements.
10040 If *MODEP is VOIDmode, then set it to the first valid floating point
10041 type. If a non-floating point type is found, or if a floating point
10042 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10043 otherwise return the count in the sub-tree. */
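/* For example, a homogeneous aggregate such as

     struct hfa { double x; double y; double z; };

   yields a count of 3 with *MODEP set to DFmode, whereas a struct mixing
   float and double fields makes the walk return -1.  */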
10044 static int
10045 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10047 machine_mode mode;
10048 HOST_WIDE_INT size;
10050 switch (TREE_CODE (type))
10052 case REAL_TYPE:
10053 mode = TYPE_MODE (type);
10054 if (mode != DFmode && mode != SFmode && mode != TFmode)
10055 return -1;
10057 if (*modep == VOIDmode)
10058 *modep = mode;
10060 if (*modep == mode)
10061 return 1;
10063 break;
10065 case COMPLEX_TYPE:
10066 mode = TYPE_MODE (TREE_TYPE (type));
10067 if (mode != DFmode && mode != SFmode && mode != TFmode)
10068 return -1;
10070 if (*modep == VOIDmode)
10071 *modep = mode;
10073 if (*modep == mode)
10074 return 2;
10076 break;
10078 case VECTOR_TYPE:
10079 /* Use V2SImode and V4SImode as representatives of all 64-bit
10080 and 128-bit vector types. */
10081 size = int_size_in_bytes (type);
10082 switch (size)
10084 case 8:
10085 mode = V2SImode;
10086 break;
10087 case 16:
10088 mode = V4SImode;
10089 break;
10090 default:
10091 return -1;
10094 if (*modep == VOIDmode)
10095 *modep = mode;
10097 /* Vector modes are considered to be opaque: two vectors are
10098 equivalent for the purposes of being homogeneous aggregates
10099 if they are the same size. */
10100 if (*modep == mode)
10101 return 1;
10103 break;
10105 case ARRAY_TYPE:
10107 int count;
10108 tree index = TYPE_DOMAIN (type);
10110 /* Can't handle incomplete types nor sizes that are not
10111 fixed. */
10112 if (!COMPLETE_TYPE_P (type)
10113 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10114 return -1;
10116 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
10117 if (count == -1
10118 || !index
10119 || !TYPE_MAX_VALUE (index)
10120 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
10121 || !TYPE_MIN_VALUE (index)
10122 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
10123 || count < 0)
10124 return -1;
10126 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
10127 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
10129 /* There must be no padding. */
10130 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10131 return -1;
10133 return count;
10136 case RECORD_TYPE:
10138 int count = 0;
10139 int sub_count;
10140 tree field;
10142 /* Can't handle incomplete types nor sizes that are not
10143 fixed. */
10144 if (!COMPLETE_TYPE_P (type)
10145 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10146 return -1;
10148 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10150 if (TREE_CODE (field) != FIELD_DECL)
10151 continue;
10153 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10154 if (sub_count < 0)
10155 return -1;
10156 count += sub_count;
10159 /* There must be no padding. */
10160 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10161 return -1;
10163 return count;
10166 case UNION_TYPE:
10167 case QUAL_UNION_TYPE:
10169 /* These aren't very interesting except in a degenerate case. */
10170 int count = 0;
10171 int sub_count;
10172 tree field;
10174 /* Can't handle incomplete types nor sizes that are not
10175 fixed. */
10176 if (!COMPLETE_TYPE_P (type)
10177 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10178 return -1;
10180 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10182 if (TREE_CODE (field) != FIELD_DECL)
10183 continue;
10185 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10186 if (sub_count < 0)
10187 return -1;
10188 count = count > sub_count ? count : sub_count;
10191 /* There must be no padding. */
10192 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10193 return -1;
10195 return count;
10198 default:
10199 break;
10202 return -1;
10205 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
10206 type as described in AAPCS64 \S 4.1.2.
10208 See the comment above aarch64_composite_type_p for the notes on MODE. */
10210 static bool
10211 aarch64_short_vector_p (const_tree type,
10212 machine_mode mode)
10214 HOST_WIDE_INT size = -1;
10216 if (type && TREE_CODE (type) == VECTOR_TYPE)
10217 size = int_size_in_bytes (type);
10218 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
10219 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10220 size = GET_MODE_SIZE (mode);
10222 return (size == 8 || size == 16);
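/* For instance, the arm_neon.h types int32x2_t (8 bytes) and float32x4_t
   (16 bytes) are short vectors in this sense, while a 32-byte GNU vector
   type is not.  */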
10225 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
10226 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
10227 array types. The C99 floating-point complex types are also considered
10228 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
10229 types, which are GCC extensions and out of the scope of AAPCS64, are
10230 treated as composite types here as well.
10232 Note that MODE itself is not sufficient in determining whether a type
10233 is such a composite type or not. This is because
10234 stor-layout.c:compute_record_mode may have already changed the MODE
10235 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
10236 structure with only one field may have its MODE set to the mode of the
10237 field. Also an integer mode whose size matches the size of the
10238 RECORD_TYPE type may be used to substitute the original mode
10239 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
10240 solely relied on. */
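/* Concretely, for a declaration such as struct wrap { float f; },
   stor-layout may give the struct SFmode, yet it must still be treated
   as a composite type rather than as an SFmode scalar.  */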
10242 static bool
10243 aarch64_composite_type_p (const_tree type,
10244 machine_mode mode)
10246 if (aarch64_short_vector_p (type, mode))
10247 return false;
10249 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
10250 return true;
10252 if (mode == BLKmode
10253 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
10254 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
10255 return true;
10257 return false;
10260 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
10261 shall be passed or returned in simd/fp register(s) (providing these
10262 parameter passing registers are available).
10264 Upon successful return, *COUNT returns the number of needed registers,
10265 *BASE_MODE returns the mode of the individual register and when IS_HA
10266 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
10267 floating-point aggregate or a homogeneous short-vector aggregate. */
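/* For example, an HFA of three floats gives *COUNT == 3, *BASE_MODE ==
   SFmode and *IS_HA == true; a _Complex double argument gives
   *COUNT == 2, *BASE_MODE == DFmode and *IS_HA == true.  */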
10269 static bool
10270 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
10271 const_tree type,
10272 machine_mode *base_mode,
10273 int *count,
10274 bool *is_ha)
10276 machine_mode new_mode = VOIDmode;
10277 bool composite_p = aarch64_composite_type_p (type, mode);
10279 if (is_ha != NULL) *is_ha = false;
10281 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
10282 || aarch64_short_vector_p (type, mode))
10284 *count = 1;
10285 new_mode = mode;
10287 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
10289 if (is_ha != NULL) *is_ha = true;
10290 *count = 2;
10291 new_mode = GET_MODE_INNER (mode);
10293 else if (type && composite_p)
10295 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
10297 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
10299 if (is_ha != NULL) *is_ha = true;
10300 *count = ag_count;
10302 else
10303 return false;
10305 else
10306 return false;
10308 *base_mode = new_mode;
10309 return true;
10312 /* Implement TARGET_STRUCT_VALUE_RTX. */
10314 static rtx
10315 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
10316 int incoming ATTRIBUTE_UNUSED)
10318 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
10321 /* Implements target hook vector_mode_supported_p. */
10322 static bool
10323 aarch64_vector_mode_supported_p (machine_mode mode)
10325 if (TARGET_SIMD
10326 && (mode == V4SImode || mode == V8HImode
10327 || mode == V16QImode || mode == V2DImode
10328 || mode == V2SImode || mode == V4HImode
10329 || mode == V8QImode || mode == V2SFmode
10330 || mode == V4SFmode || mode == V2DFmode
10331 || mode == V4HFmode || mode == V8HFmode
10332 || mode == V1DFmode))
10333 return true;
10335 return false;
10338 /* Return the appropriate SIMD container mode
10339 for MODE within a vector of WIDTH bits. */
10340 static machine_mode
10341 aarch64_simd_container_mode (machine_mode mode, unsigned width)
10343 gcc_assert (width == 64 || width == 128);
10344 if (TARGET_SIMD)
10346 if (width == 128)
10347 switch (mode)
10349 case DFmode:
10350 return V2DFmode;
10351 case SFmode:
10352 return V4SFmode;
10353 case SImode:
10354 return V4SImode;
10355 case HImode:
10356 return V8HImode;
10357 case QImode:
10358 return V16QImode;
10359 case DImode:
10360 return V2DImode;
10361 default:
10362 break;
10364 else
10365 switch (mode)
10367 case SFmode:
10368 return V2SFmode;
10369 case SImode:
10370 return V2SImode;
10371 case HImode:
10372 return V4HImode;
10373 case QImode:
10374 return V8QImode;
10375 default:
10376 break;
10379 return word_mode;
10382 /* Return 128-bit container as the preferred SIMD mode for MODE. */
10383 static machine_mode
10384 aarch64_preferred_simd_mode (machine_mode mode)
10386 return aarch64_simd_container_mode (mode, 128);
10389 /* Return the bitmask of possible vector sizes for the vectorizer
10390 to iterate over. */
10391 static unsigned int
10392 aarch64_autovectorize_vector_sizes (void)
10394 return (16 | 8);
10397 /* Implement TARGET_MANGLE_TYPE. */
10399 static const char *
10400 aarch64_mangle_type (const_tree type)
10402 /* The AArch64 ABI documents say that "__va_list" has to be
10403 mangled as if it is in the "std" namespace. */
10404 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
10405 return "St9__va_list";
10407 /* Half-precision float. */
10408 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
10409 return "Dh";
10411 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
10412 builtin types. */
10413 if (TYPE_NAME (type) != NULL)
10414 return aarch64_mangle_builtin_type (type);
10416 /* Use the default mangling. */
10417 return NULL;
10421 /* Return true if the rtx_insn contains a MEM RTX somewhere
10422 in it. */
10424 static bool
10425 has_memory_op (rtx_insn *mem_insn)
10427 subrtx_iterator::array_type array;
10428 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
10429 if (MEM_P (*iter))
10430 return true;
10432 return false;
10435 /* Find the first rtx_insn before insn that will generate an assembly
10436 instruction. */
10438 static rtx_insn *
10439 aarch64_prev_real_insn (rtx_insn *insn)
10441 if (!insn)
10442 return NULL;
10446 insn = prev_real_insn (insn);
10448 while (insn && recog_memoized (insn) < 0);
10450 return insn;
10453 static bool
10454 is_madd_op (enum attr_type t1)
10456 unsigned int i;
10457 /* A number of these may be AArch32 only. */
10458 enum attr_type mlatypes[] = {
10459 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
10460 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
10461 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
10464 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
10466 if (t1 == mlatypes[i])
10467 return true;
10470 return false;
10473 /* Check if there is a register dependency between a load and the insn
10474 for which we hold recog_data. */
10476 static bool
10477 dep_between_memop_and_curr (rtx memop)
10479 rtx load_reg;
10480 int opno;
10482 gcc_assert (GET_CODE (memop) == SET);
10484 if (!REG_P (SET_DEST (memop)))
10485 return false;
10487 load_reg = SET_DEST (memop);
10488 for (opno = 1; opno < recog_data.n_operands; opno++)
10490 rtx operand = recog_data.operand[opno];
10491 if (REG_P (operand)
10492 && reg_overlap_mentioned_p (load_reg, operand))
10493 return true;
10496 return false;
10500 /* When working around the Cortex-A53 erratum 835769,
10501 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
10502 instruction and has a preceding memory instruction such that a NOP
10503 should be inserted between them. */
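/* A typical resulting sequence, with registers chosen purely for
   illustration, is:

       ldr  x1, [x2]
       nop              // between mem op and mult-accumulate
       madd x0, x3, x4, x5  */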
10505 bool
10506 aarch64_madd_needs_nop (rtx_insn* insn)
10508 enum attr_type attr_type;
10509 rtx_insn *prev;
10510 rtx body;
10512 if (!TARGET_FIX_ERR_A53_835769)
10513 return false;
10515 if (!INSN_P (insn) || recog_memoized (insn) < 0)
10516 return false;
10518 attr_type = get_attr_type (insn);
10519 if (!is_madd_op (attr_type))
10520 return false;
10522 prev = aarch64_prev_real_insn (insn);
10523 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
10524 Restore recog state to INSN to avoid state corruption. */
10525 extract_constrain_insn_cached (insn);
10527 if (!prev || !has_memory_op (prev))
10528 return false;
10530 body = single_set (prev);
10532 /* If the previous insn is a memory op and there is no dependency between
10533 it and the DImode madd, emit a NOP between them. If body is NULL then we
10534 have a complex memory operation, probably a load/store pair.
10535 Be conservative for now and emit a NOP. */
10536 if (GET_MODE (recog_data.operand[0]) == DImode
10537 && (!body || !dep_between_memop_and_curr (body)))
10538 return true;
10540 return false;
10545 /* Implement FINAL_PRESCAN_INSN. */
10547 void
10548 aarch64_final_prescan_insn (rtx_insn *insn)
10550 if (aarch64_madd_needs_nop (insn))
10551 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
10555 /* Return the element size letter ('b', 'h', 's' or 'd') for SIZE bits. */
10556 static char
10557 sizetochar (int size)
10559 switch (size)
10561 case 64: return 'd';
10562 case 32: return 's';
10563 case 16: return 'h';
10564 case 8 : return 'b';
10565 default: gcc_unreachable ();
10569 /* Return true iff x is a uniform vector of floating-point
10570 constants, and the constant can be represented in
10571 quarter-precision form. Note that, as aarch64_float_const_representable_p
10572 rejects both +0.0 and -0.0, we will also reject them here. */
10573 static bool
10574 aarch64_vect_float_const_representable_p (rtx x)
10576 rtx elt;
10577 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10578 && const_vec_duplicate_p (x, &elt)
10579 && aarch64_float_const_representable_p (elt));
10582 /* Return true if OP is a valid AdvSIMD immediate for MODE; fill in *INFO if it is nonnull. */
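/* For illustration: the V4SImode constant with every element equal to
   0x00ff0000 is accepted, and INFO then records a 32-bit element width,
   value 0xff, shift 16 and mvn == false, corresponding to
   "movi v0.4s, 0xff, lsl 16" (v0 used here only as an example register).  */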
10583 bool
10584 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
10585 struct simd_immediate_info *info)
10587 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
10588 matches = 1; \
10589 for (i = 0; i < idx; i += (STRIDE)) \
10590 if (!(TEST)) \
10591 matches = 0; \
10592 if (matches) \
10594 immtype = (CLASS); \
10595 elsize = (ELSIZE); \
10596 eshift = (SHIFT); \
10597 emvn = (NEG); \
10598 break; \
10601 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
10602 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
10603 unsigned char bytes[16];
10604 int immtype = -1, matches;
10605 unsigned int invmask = inverse ? 0xff : 0;
10606 int eshift, emvn;
10608 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10610 if (! (aarch64_simd_imm_zero_p (op, mode)
10611 || aarch64_vect_float_const_representable_p (op)))
10612 return false;
10614 if (info)
10616 info->value = CONST_VECTOR_ELT (op, 0);
10617 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
10618 info->mvn = false;
10619 info->shift = 0;
10622 return true;
10625 /* Splat vector constant out into a byte vector. */
10626 for (i = 0; i < n_elts; i++)
10628 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
10629 it must be laid out in the vector register in reverse order. */
10630 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
10631 unsigned HOST_WIDE_INT elpart;
10633 gcc_assert (CONST_INT_P (el));
10634 elpart = INTVAL (el);
10636 for (unsigned int byte = 0; byte < innersize; byte++)
10638 bytes[idx++] = (elpart & 0xff) ^ invmask;
10639 elpart >>= BITS_PER_UNIT;
10644 /* Sanity check. */
10645 gcc_assert (idx == GET_MODE_SIZE (mode));
10649 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
10650 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
10652 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
10653 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
10655 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
10656 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
10658 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
10659 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
10661 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
10663 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
10665 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
10666 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
10668 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
10669 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
10671 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
10672 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
10674 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
10675 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
10677 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
10679 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
10681 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
10682 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
10684 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
10685 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
10687 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
10688 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
10690 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
10691 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
10693 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
10695 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
10696 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
10698 while (0);
10700 if (immtype == -1)
10701 return false;
10703 if (info)
10705 info->element_width = elsize;
10706 info->mvn = emvn != 0;
10707 info->shift = eshift;
10709 unsigned HOST_WIDE_INT imm = 0;
10711 if (immtype >= 12 && immtype <= 15)
10712 info->msl = true;
10714 /* Un-invert bytes of recognized vector, if necessary. */
10715 if (invmask != 0)
10716 for (i = 0; i < idx; i++)
10717 bytes[i] ^= invmask;
10719 if (immtype == 17)
10721 /* FIXME: Broken on 32-bit H_W_I hosts. */
10722 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
10724 for (i = 0; i < 8; i++)
10725 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
10726 << (i * BITS_PER_UNIT);
10729 info->value = GEN_INT (imm);
10731 else
10733 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
10734 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
10736 /* Construct 'abcdefgh' because the assembler cannot handle
10737 generic constants. */
10738 if (info->mvn)
10739 imm = ~imm;
10740 imm = (imm >> info->shift) & 0xff;
10741 info->value = GEN_INT (imm);
10745 return true;
10746 #undef CHECK
10749 /* Check if immediate shift constants are within range. */
10750 bool
10751 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
10753 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
10754 if (left)
10755 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
10756 else
10757 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
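/* For example, for V4SImode the valid immediate left-shift counts are
   0..31 and the valid right-shift counts are 1..32.  */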
10760 /* Return true if X is a uniform vector where all elements
10761 are either the floating-point constant 0.0 or the
10762 integer constant 0. */
10763 bool
10764 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
10766 return x == CONST0_RTX (mode);
10770 /* Return the bitmask CONST_INT to select the bits required by a zero extract
10771 operation of width WIDTH at bit position POS. */
10774 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
10776 gcc_assert (CONST_INT_P (width));
10777 gcc_assert (CONST_INT_P (pos));
10779 unsigned HOST_WIDE_INT mask
10780 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
10781 return GEN_INT (mask << UINTVAL (pos));
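/* E.g. WIDTH == 8 and POS == 16 give ((1 << 8) - 1) << 16 == 0xff0000.  */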
10784 bool
10785 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
10787 HOST_WIDE_INT imm = INTVAL (x);
10788 int i;
10790 for (i = 0; i < 8; i++)
10792 unsigned int byte = imm & 0xff;
10793 if (byte != 0xff && byte != 0)
10794 return false;
10795 imm >>= 8;
10798 return true;
10801 bool
10802 aarch64_mov_operand_p (rtx x, machine_mode mode)
10804 if (GET_CODE (x) == HIGH
10805 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10806 return true;
10808 if (CONST_INT_P (x))
10809 return true;
10811 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
10812 return true;
10814 return aarch64_classify_symbolic_expression (x)
10815 == SYMBOL_TINY_ABSOLUTE;
10818 /* Return a const_int vector of VAL. */
10820 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
10822 int nunits = GET_MODE_NUNITS (mode);
10823 rtvec v = rtvec_alloc (nunits);
10824 int i;
10826 for (i=0; i < nunits; i++)
10827 RTVEC_ELT (v, i) = GEN_INT (val);
10829 return gen_rtx_CONST_VECTOR (mode, v);
10832 /* Check OP is a legal scalar immediate for the MOVI instruction. */
10834 bool
10835 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
10837 machine_mode vmode;
10839 gcc_assert (!VECTOR_MODE_P (mode));
10840 vmode = aarch64_preferred_simd_mode (mode);
10841 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
10842 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
10845 /* Construct and return a PARALLEL RTX vector with elements numbering the
10846 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
10847 the vector - from the perspective of the architecture. This does not
10848 line up with GCC's perspective on lane numbers, so we end up with
10849 different masks depending on our target endian-ness. The diagram
10850 below may help. We must draw the distinction when building masks
10851 which select one half of the vector. An instruction selecting
10852 architectural low-lanes for a big-endian target, must be described using
10853 a mask selecting GCC high-lanes.
10855 Big-Endian Little-Endian
10857 GCC 0 1 2 3 3 2 1 0
10858 | x | x | x | x | | x | x | x | x |
10859 Architecture 3 2 1 0 3 2 1 0
10861 Low Mask: { 2, 3 } { 0, 1 }
10862 High Mask: { 0, 1 } { 2, 3 }
10866 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
10868 int nunits = GET_MODE_NUNITS (mode);
10869 rtvec v = rtvec_alloc (nunits / 2);
10870 int high_base = nunits / 2;
10871 int low_base = 0;
10872 int base;
10873 rtx t1;
10874 int i;
10876 if (BYTES_BIG_ENDIAN)
10877 base = high ? low_base : high_base;
10878 else
10879 base = high ? high_base : low_base;
10881 for (i = 0; i < nunits / 2; i++)
10882 RTVEC_ELT (v, i) = GEN_INT (base + i);
10884 t1 = gen_rtx_PARALLEL (mode, v);
10885 return t1;
10888 /* Check OP for validity as a PARALLEL RTX vector with elements
10889 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
10890 from the perspective of the architecture. See the diagram above
10891 aarch64_simd_vect_par_cnst_half for more details. */
10893 bool
10894 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
10895 bool high)
10897 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
10898 HOST_WIDE_INT count_op = XVECLEN (op, 0);
10899 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
10900 int i = 0;
10902 if (!VECTOR_MODE_P (mode))
10903 return false;
10905 if (count_op != count_ideal)
10906 return false;
10908 for (i = 0; i < count_ideal; i++)
10910 rtx elt_op = XVECEXP (op, 0, i);
10911 rtx elt_ideal = XVECEXP (ideal, 0, i);
10913 if (!CONST_INT_P (elt_op)
10914 || INTVAL (elt_ideal) != INTVAL (elt_op))
10915 return false;
10917 return true;
10920 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
10921 HIGH (exclusive). */
10922 void
10923 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
10924 const_tree exp)
10926 HOST_WIDE_INT lane;
10927 gcc_assert (CONST_INT_P (operand));
10928 lane = INTVAL (operand);
10930 if (lane < low || lane >= high)
10932 if (exp)
10933 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
10934 else
10935 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
10939 /* Return TRUE if OP is a valid vector addressing mode. */
10940 bool
10941 aarch64_simd_mem_operand_p (rtx op)
10943 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
10944 || REG_P (XEXP (op, 0)));
10947 /* Emit a register copy from operand to operand, taking care not to
10948 early-clobber source registers in the process.
10950 COUNT is the number of components into which the copy needs to be
10951 decomposed. */
10952 void
10953 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
10954 unsigned int count)
10956 unsigned int i;
10957 int rdest = REGNO (operands[0]);
10958 int rsrc = REGNO (operands[1]);
10960 if (!reg_overlap_mentioned_p (operands[0], operands[1])
10961 || rdest < rsrc)
10962 for (i = 0; i < count; i++)
10963 emit_move_insn (gen_rtx_REG (mode, rdest + i),
10964 gen_rtx_REG (mode, rsrc + i));
10965 else
10966 for (i = 0; i < count; i++)
10967 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
10968 gen_rtx_REG (mode, rsrc + count - i - 1));
10971 /* Compute and return the length (in bytes) of aarch64_simd_reglist<mode>,
10972 where <mode> is one of the VSTRUCT modes: OI, CI, or XI. */
10974 aarch64_simd_attr_length_rglist (enum machine_mode mode)
10976 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
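/* E.g. OImode spans two vector registers, giving 2 * 4 == 8; CImode gives
   12 and XImode 16, i.e. one 4-byte instruction per register.  */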
10979 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
10980 alignment of a vector to 128 bits. */
10981 static HOST_WIDE_INT
10982 aarch64_simd_vector_alignment (const_tree type)
10984 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
10985 return MIN (align, 128);
10988 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
10989 static bool
10990 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
10992 if (is_packed)
10993 return false;
10995 /* We guarantee alignment for vectors up to 128-bits. */
10996 if (tree_int_cst_compare (TYPE_SIZE (type),
10997 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
10998 return false;
11000 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11001 return true;
11004 /* If VALS is a vector constant that can be loaded into a register
11005 using DUP, generate instructions to do so and return an RTX to
11006 assign to the register. Otherwise return NULL_RTX. */
11007 static rtx
11008 aarch64_simd_dup_constant (rtx vals)
11010 machine_mode mode = GET_MODE (vals);
11011 machine_mode inner_mode = GET_MODE_INNER (mode);
11012 rtx x;
11014 if (!const_vec_duplicate_p (vals, &x))
11015 return NULL_RTX;
11017 /* We can load this constant by using DUP and a constant in a
11018 single ARM register. This will be cheaper than a vector
11019 load. */
11020 x = copy_to_mode_reg (inner_mode, x);
11021 return gen_rtx_VEC_DUPLICATE (mode, x);
11025 /* Generate code to load VALS, which is a PARALLEL containing only
11026 constants (for vec_init) or CONST_VECTOR, efficiently into a
11027 register. Returns an RTX to copy into the register, or NULL_RTX
11028 for a PARALLEL that can not be converted into a CONST_VECTOR. */
11029 static rtx
11030 aarch64_simd_make_constant (rtx vals)
11032 machine_mode mode = GET_MODE (vals);
11033 rtx const_dup;
11034 rtx const_vec = NULL_RTX;
11035 int n_elts = GET_MODE_NUNITS (mode);
11036 int n_const = 0;
11037 int i;
11039 if (GET_CODE (vals) == CONST_VECTOR)
11040 const_vec = vals;
11041 else if (GET_CODE (vals) == PARALLEL)
11043 /* A CONST_VECTOR must contain only CONST_INTs and
11044 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11045 Only store valid constants in a CONST_VECTOR. */
11046 for (i = 0; i < n_elts; ++i)
11048 rtx x = XVECEXP (vals, 0, i);
11049 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11050 n_const++;
11052 if (n_const == n_elts)
11053 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11055 else
11056 gcc_unreachable ();
11058 if (const_vec != NULL_RTX
11059 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11060 /* Load using MOVI/MVNI. */
11061 return const_vec;
11062 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11063 /* Loaded using DUP. */
11064 return const_dup;
11065 else if (const_vec != NULL_RTX)
11066 /* Load from constant pool. We can not take advantage of single-cycle
11067 LD1 because we need a PC-relative addressing mode. */
11068 return const_vec;
11069 else
11070 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11071 We can not construct an initializer. */
11072 return NULL_RTX;
11075 /* Expand a vector initialisation sequence, such that TARGET is
11076 initialised to contain VALS. */
11078 void
11079 aarch64_expand_vector_init (rtx target, rtx vals)
11081 machine_mode mode = GET_MODE (target);
11082 machine_mode inner_mode = GET_MODE_INNER (mode);
11083 /* The number of vector elements. */
11084 int n_elts = GET_MODE_NUNITS (mode);
11085 /* The number of vector elements which are not constant. */
11086 int n_var = 0;
11087 rtx any_const = NULL_RTX;
11088 /* The first element of vals. */
11089 rtx v0 = XVECEXP (vals, 0, 0);
11090 bool all_same = true;
11092 /* Count the number of variable elements to initialise. */
11093 for (int i = 0; i < n_elts; ++i)
11095 rtx x = XVECEXP (vals, 0, i);
11096 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
11097 ++n_var;
11098 else
11099 any_const = x;
11101 all_same &= rtx_equal_p (x, v0);
11104 /* No variable elements, hand off to aarch64_simd_make_constant which knows
11105 how best to handle this. */
11106 if (n_var == 0)
11108 rtx constant = aarch64_simd_make_constant (vals);
11109 if (constant != NULL_RTX)
11111 emit_move_insn (target, constant);
11112 return;
11116 /* Splat a single non-constant element if we can. */
11117 if (all_same)
11119 rtx x = copy_to_mode_reg (inner_mode, v0);
11120 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
11121 return;
11124 /* Initialise a vector which is part-variable. We want to first try
11125 to build those lanes which are constant in the most efficient way we
11126 can. */
11127 if (n_var != n_elts)
11129 rtx copy = copy_rtx (vals);
11131 /* Load constant part of vector. We really don't care what goes into the
11132 parts we will overwrite, but we're more likely to be able to load the
11133 constant efficiently if it has fewer, larger, repeating parts
11134 (see aarch64_simd_valid_immediate). */
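/* For instance, for VALS == { X, 1, 2, 3 } with X variable, lane 0 is
   temporarily filled from lane 0 ^ 2, i.e. with the constant 2, so the
   vector { 2, 1, 2, 3 } is materialised first and lane 0 is then
   overwritten with X by the vec_set loop below.  */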
11135 for (int i = 0; i < n_elts; i++)
11137 rtx x = XVECEXP (vals, 0, i);
11138 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11139 continue;
11140 rtx subst = any_const;
11141 for (int bit = n_elts / 2; bit > 0; bit /= 2)
11143 /* Look in the copied vector, as more elements are const. */
11144 rtx test = XVECEXP (copy, 0, i ^ bit);
11145 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
11147 subst = test;
11148 break;
11151 XVECEXP (copy, 0, i) = subst;
11153 aarch64_expand_vector_init (target, copy);
11156 /* Insert the variable lanes directly. */
11158 enum insn_code icode = optab_handler (vec_set_optab, mode);
11159 gcc_assert (icode != CODE_FOR_nothing);
11161 for (int i = 0; i < n_elts; i++)
11163 rtx x = XVECEXP (vals, 0, i);
11164 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11165 continue;
11166 x = copy_to_mode_reg (inner_mode, x);
11167 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
11171 static unsigned HOST_WIDE_INT
11172 aarch64_shift_truncation_mask (machine_mode mode)
11174 return
11175 (!SHIFT_COUNT_TRUNCATED
11176 || aarch64_vector_mode_supported_p (mode)
11177 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
11180 /* Select a format to encode pointers in exception handling data. */
11182 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
11184 int type;
11185 switch (aarch64_cmodel)
11187 case AARCH64_CMODEL_TINY:
11188 case AARCH64_CMODEL_TINY_PIC:
11189 case AARCH64_CMODEL_SMALL:
11190 case AARCH64_CMODEL_SMALL_PIC:
11191 case AARCH64_CMODEL_SMALL_SPIC:
11192 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
11193 for everything. */
11194 type = DW_EH_PE_sdata4;
11195 break;
11196 default:
11197 /* No assumptions here. 8-byte relocs required. */
11198 type = DW_EH_PE_sdata8;
11199 break;
11201 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
11204 /* The last .arch and .tune assembly strings that we printed. */
11205 static std::string aarch64_last_printed_arch_string;
11206 static std::string aarch64_last_printed_tune_string;
11208 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
11209 by the function fndecl. */
11211 void
11212 aarch64_declare_function_name (FILE *stream, const char* name,
11213 tree fndecl)
11215 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11217 struct cl_target_option *targ_options;
11218 if (target_parts)
11219 targ_options = TREE_TARGET_OPTION (target_parts);
11220 else
11221 targ_options = TREE_TARGET_OPTION (target_option_current_node);
11222 gcc_assert (targ_options);
11224 const struct processor *this_arch
11225 = aarch64_get_arch (targ_options->x_explicit_arch);
11227 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
11228 std::string extension
11229 = aarch64_get_extension_string_for_isa_flags (isa_flags,
11230 this_arch->flags);
11231 /* Only update the assembler .arch string if it is distinct from the last
11232 such string we printed. */
11233 std::string to_print = this_arch->name + extension;
11234 if (to_print != aarch64_last_printed_arch_string)
11236 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
11237 aarch64_last_printed_arch_string = to_print;
11240 /* Print the cpu name we're tuning for in the comments; it might be
11241 useful to readers of the generated asm. Do it only when it changes
11242 from function to function and verbose assembly is requested. */
11243 const struct processor *this_tune
11244 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
11246 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
11248 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
11249 this_tune->name);
11250 aarch64_last_printed_tune_string = this_tune->name;
11253 /* Don't forget the type directive for ELF. */
11254 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
11255 ASM_OUTPUT_LABEL (stream, name);
11258 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
11260 static void
11261 aarch64_start_file (void)
11263 struct cl_target_option *default_options
11264 = TREE_TARGET_OPTION (target_option_default_node);
11266 const struct processor *default_arch
11267 = aarch64_get_arch (default_options->x_explicit_arch);
11268 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
11269 std::string extension
11270 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
11271 default_arch->flags);
11273 aarch64_last_printed_arch_string = default_arch->name + extension;
11274 aarch64_last_printed_tune_string = "";
11275 asm_fprintf (asm_out_file, "\t.arch %s\n",
11276 aarch64_last_printed_arch_string.c_str ());
11278 default_file_start ();
11281 /* Emit load exclusive. */
11283 static void
11284 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
11285 rtx mem, rtx model_rtx)
11287 rtx (*gen) (rtx, rtx, rtx);
11289 switch (mode)
11291 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
11292 case HImode: gen = gen_aarch64_load_exclusivehi; break;
11293 case SImode: gen = gen_aarch64_load_exclusivesi; break;
11294 case DImode: gen = gen_aarch64_load_exclusivedi; break;
11295 default:
11296 gcc_unreachable ();
11299 emit_insn (gen (rval, mem, model_rtx));
11302 /* Emit store exclusive. */
11304 static void
11305 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
11306 rtx rval, rtx mem, rtx model_rtx)
11308 rtx (*gen) (rtx, rtx, rtx, rtx);
11310 switch (mode)
11312 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
11313 case HImode: gen = gen_aarch64_store_exclusivehi; break;
11314 case SImode: gen = gen_aarch64_store_exclusivesi; break;
11315 case DImode: gen = gen_aarch64_store_exclusivedi; break;
11316 default:
11317 gcc_unreachable ();
11320 emit_insn (gen (bval, rval, mem, model_rtx));
11323 /* Mark the previous jump instruction as unlikely. */
11325 static void
11326 aarch64_emit_unlikely_jump (rtx insn)
11328 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
11330 insn = emit_jump_insn (insn);
11331 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
11334 /* Expand a compare and swap pattern. */
11336 void
11337 aarch64_expand_compare_and_swap (rtx operands[])
11339 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
11340 machine_mode mode, cmp_mode;
11341 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
11342 int idx;
11343 gen_cas_fn gen;
11344 const gen_cas_fn split_cas[] =
11346 gen_aarch64_compare_and_swapqi,
11347 gen_aarch64_compare_and_swaphi,
11348 gen_aarch64_compare_and_swapsi,
11349 gen_aarch64_compare_and_swapdi
11351 const gen_cas_fn atomic_cas[] =
11353 gen_aarch64_compare_and_swapqi_lse,
11354 gen_aarch64_compare_and_swaphi_lse,
11355 gen_aarch64_compare_and_swapsi_lse,
11356 gen_aarch64_compare_and_swapdi_lse
11359 bval = operands[0];
11360 rval = operands[1];
11361 mem = operands[2];
11362 oldval = operands[3];
11363 newval = operands[4];
11364 is_weak = operands[5];
11365 mod_s = operands[6];
11366 mod_f = operands[7];
11367 mode = GET_MODE (mem);
11368 cmp_mode = mode;
11370 /* Normally the succ memory model must be stronger than fail, but in the
11371 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
11372 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
11374 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
11375 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
11376 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
11378 switch (mode)
11380 case QImode:
11381 case HImode:
11382 /* For short modes, we're going to perform the comparison in SImode,
11383 so do the zero-extension now. */
11384 cmp_mode = SImode;
11385 rval = gen_reg_rtx (SImode);
11386 oldval = convert_modes (SImode, mode, oldval, true);
11387 /* Fall through. */
11389 case SImode:
11390 case DImode:
11391 /* Force the value into a register if needed. */
11392 if (!aarch64_plus_operand (oldval, mode))
11393 oldval = force_reg (cmp_mode, oldval);
11394 break;
11396 default:
11397 gcc_unreachable ();
11400 switch (mode)
11402 case QImode: idx = 0; break;
11403 case HImode: idx = 1; break;
11404 case SImode: idx = 2; break;
11405 case DImode: idx = 3; break;
11406 default:
11407 gcc_unreachable ();
11409 if (TARGET_LSE)
11410 gen = atomic_cas[idx];
11411 else
11412 gen = split_cas[idx];
11414 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
11416 if (mode == QImode || mode == HImode)
11417 emit_move_insn (operands[1], gen_lowpart (mode, rval));
11419 x = gen_rtx_REG (CCmode, CC_REGNUM);
11420 x = gen_rtx_EQ (SImode, x, const0_rtx);
11421 emit_insn (gen_rtx_SET (bval, x));
11424 /* Test whether the target supports using an atomic load-operate instruction.
11425 CODE is the operation and AFTER is TRUE if the data in memory after the
11426 operation should be returned and FALSE if the data before the operation
11427 should be returned. Returns FALSE if the operation isn't supported by the
11428 architecture. */
11430 bool
11431 aarch64_atomic_ldop_supported_p (enum rtx_code code)
11433 if (!TARGET_LSE)
11434 return false;
11436 switch (code)
11438 case SET:
11439 case AND:
11440 case IOR:
11441 case XOR:
11442 case MINUS:
11443 case PLUS:
11444 return true;
11445 default:
11446 return false;
11450 /* Emit a barrier that is appropriate for memory model MODEL at the end of a
11451 sequence implementing an atomic operation. */
11453 static void
11454 aarch64_emit_post_barrier (enum memmodel model)
11456 const enum memmodel base_model = memmodel_base (model);
11458 if (is_mm_sync (model)
11459 && (base_model == MEMMODEL_ACQUIRE
11460 || base_model == MEMMODEL_ACQ_REL
11461 || base_model == MEMMODEL_SEQ_CST))
11463 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
11467 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
11468 for the data in memory. EXPECTED is the value expected to be in memory.
11469 DESIRED is the value to store to memory. MEM is the memory location. MODEL
11470 is the memory ordering to use. */
11472 void
11473 aarch64_gen_atomic_cas (rtx rval, rtx mem,
11474 rtx expected, rtx desired,
11475 rtx model)
11477 rtx (*gen) (rtx, rtx, rtx, rtx);
11478 machine_mode mode;
11480 mode = GET_MODE (mem);
11482 switch (mode)
11484 case QImode: gen = gen_aarch64_atomic_casqi; break;
11485 case HImode: gen = gen_aarch64_atomic_cashi; break;
11486 case SImode: gen = gen_aarch64_atomic_cassi; break;
11487 case DImode: gen = gen_aarch64_atomic_casdi; break;
11488 default:
11489 gcc_unreachable ();
11492 /* Move the expected value into the CAS destination register. */
11493 emit_insn (gen_rtx_SET (rval, expected));
11495 /* Emit the CAS. */
11496 emit_insn (gen (rval, mem, desired, model));
11498 /* Compare the expected value with the value loaded by the CAS, to establish
11499 whether the swap was made. */
11500 aarch64_gen_compare_reg (EQ, rval, expected);
11503 /* Split a compare and swap pattern. */
11505 void
11506 aarch64_split_compare_and_swap (rtx operands[])
11508 rtx rval, mem, oldval, newval, scratch;
11509 machine_mode mode;
11510 bool is_weak;
11511 rtx_code_label *label1, *label2;
11512 rtx x, cond;
11513 enum memmodel model;
11514 rtx model_rtx;
11516 rval = operands[0];
11517 mem = operands[1];
11518 oldval = operands[2];
11519 newval = operands[3];
11520 is_weak = (operands[4] != const0_rtx);
11521 model_rtx = operands[5];
11522 scratch = operands[7];
11523 mode = GET_MODE (mem);
11524 model = memmodel_from_int (INTVAL (model_rtx));
11526 label1 = NULL;
11527 if (!is_weak)
11529 label1 = gen_label_rtx ();
11530 emit_label (label1);
11532 label2 = gen_label_rtx ();
11534 /* The initial load can be relaxed for a __sync operation since a final
11535 barrier will be emitted to stop code hoisting. */
11536 if (is_mm_sync (model))
11537 aarch64_emit_load_exclusive (mode, rval, mem,
11538 GEN_INT (MEMMODEL_RELAXED));
11539 else
11540 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
11542 cond = aarch64_gen_compare_reg (NE, rval, oldval);
11543 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
11544 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11545 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
11546 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11548 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
11550 if (!is_weak)
11552 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
11553 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11554 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
11555 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11557 else
11559 cond = gen_rtx_REG (CCmode, CC_REGNUM);
11560 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
11561 emit_insn (gen_rtx_SET (cond, x));
11564 emit_label (label2);
11566 /* Emit any final barrier needed for a __sync operation. */
11567 if (is_mm_sync (model))
11568 aarch64_emit_post_barrier (model);
11571 /* Emit a BIC instruction. */
11573 static void
11574 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
11576 rtx shift_rtx = GEN_INT (shift);
11577 rtx (*gen) (rtx, rtx, rtx, rtx);
11579 switch (mode)
11581 case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
11582 case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
11583 default:
11584 gcc_unreachable ();
11587 emit_insn (gen (dst, s2, shift_rtx, s1));
11590 /* Emit an atomic swap. */
11592 static void
11593 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
11594 rtx mem, rtx model)
11596 rtx (*gen) (rtx, rtx, rtx, rtx);
11598 switch (mode)
11600 case QImode: gen = gen_aarch64_atomic_swpqi; break;
11601 case HImode: gen = gen_aarch64_atomic_swphi; break;
11602 case SImode: gen = gen_aarch64_atomic_swpsi; break;
11603 case DImode: gen = gen_aarch64_atomic_swpdi; break;
11604 default:
11605 gcc_unreachable ();
11608 emit_insn (gen (dst, mem, value, model));
11611 /* Operations supported by aarch64_emit_atomic_load_op. */
11613 enum aarch64_atomic_load_op_code
11615 AARCH64_LDOP_PLUS, /* A + B */
11616 AARCH64_LDOP_XOR, /* A ^ B */
11617 AARCH64_LDOP_OR, /* A | B */
11618 AARCH64_LDOP_BIC /* A & ~B */
11621 /* Emit an atomic load-operate. */
11623 static void
11624 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
11625 machine_mode mode, rtx dst, rtx src,
11626 rtx mem, rtx model)
11628 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
11629 const aarch64_atomic_load_op_fn plus[] =
11631 gen_aarch64_atomic_loadaddqi,
11632 gen_aarch64_atomic_loadaddhi,
11633 gen_aarch64_atomic_loadaddsi,
11634 gen_aarch64_atomic_loadadddi
11636 const aarch64_atomic_load_op_fn eor[] =
11638 gen_aarch64_atomic_loadeorqi,
11639 gen_aarch64_atomic_loadeorhi,
11640 gen_aarch64_atomic_loadeorsi,
11641 gen_aarch64_atomic_loadeordi
11643 const aarch64_atomic_load_op_fn ior[] =
11645 gen_aarch64_atomic_loadsetqi,
11646 gen_aarch64_atomic_loadsethi,
11647 gen_aarch64_atomic_loadsetsi,
11648 gen_aarch64_atomic_loadsetdi
11650 const aarch64_atomic_load_op_fn bic[] =
11652 gen_aarch64_atomic_loadclrqi,
11653 gen_aarch64_atomic_loadclrhi,
11654 gen_aarch64_atomic_loadclrsi,
11655 gen_aarch64_atomic_loadclrdi
11657 aarch64_atomic_load_op_fn gen;
11658 int idx = 0;
11660 switch (mode)
11662 case QImode: idx = 0; break;
11663 case HImode: idx = 1; break;
11664 case SImode: idx = 2; break;
11665 case DImode: idx = 3; break;
11666 default:
11667 gcc_unreachable ();
11670 switch (code)
11672 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
11673 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
11674 case AARCH64_LDOP_OR: gen = ior[idx]; break;
11675 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
11676 default:
11677 gcc_unreachable ();
11680 emit_insn (gen (dst, mem, src, model));
11683 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
11684 location to store the data read from memory. OUT_RESULT is the location to
11685 store the result of the operation. MEM is the memory location to read and
11686 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
11687 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
11688 be NULL. */
11690 void
11691 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
11692 rtx mem, rtx value, rtx model_rtx)
11694 machine_mode mode = GET_MODE (mem);
11695 machine_mode wmode = (mode == DImode ? DImode : SImode);
11696 const bool short_mode = (mode < SImode);
11697 aarch64_atomic_load_op_code ldop_code;
11698 rtx src;
11699 rtx x;
11701 if (out_data)
11702 out_data = gen_lowpart (mode, out_data);
11704 if (out_result)
11705 out_result = gen_lowpart (mode, out_result);
11707 /* Make sure the value is in a register, putting it into a destination
11708 register if it needs to be manipulated. */
11709 if (!register_operand (value, mode)
11710 || code == AND || code == MINUS)
11712 src = out_result ? out_result : out_data;
11713 emit_move_insn (src, gen_lowpart (mode, value));
11715 else
11716 src = value;
11717 gcc_assert (register_operand (src, mode));
11719 /* Preprocess the data for the operation as necessary. If the operation is
11720 a SET then emit a swap instruction and finish. */
11721 switch (code)
11723 case SET:
11724 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
11725 return;
11727 case MINUS:
11728 /* Negate the value and treat it as a PLUS. */
11730 rtx neg_src;
11732 /* Resize the value if necessary. */
11733 if (short_mode)
11734 src = gen_lowpart (wmode, src);
11736 neg_src = gen_rtx_NEG (wmode, src);
11737 emit_insn (gen_rtx_SET (src, neg_src));
11739 if (short_mode)
11740 src = gen_lowpart (mode, src);
11742 /* Fall-through. */
11743 case PLUS:
11744 ldop_code = AARCH64_LDOP_PLUS;
11745 break;
11747 case IOR:
11748 ldop_code = AARCH64_LDOP_OR;
11749 break;
11751 case XOR:
11752 ldop_code = AARCH64_LDOP_XOR;
11753 break;
11755 case AND:
11757 rtx not_src;
11759 /* Resize the value if necessary. */
11760 if (short_mode)
11761 src = gen_lowpart (wmode, src);
11763 not_src = gen_rtx_NOT (wmode, src);
11764 emit_insn (gen_rtx_SET (src, not_src));
11766 if (short_mode)
11767 src = gen_lowpart (mode, src);
11769 ldop_code = AARCH64_LDOP_BIC;
11770 break;
11772 default:
11773 /* The operation can't be done with atomic instructions. */
11774 gcc_unreachable ();
11777 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
11779 /* If necessary, calculate the data in memory after the update by redoing the
11780 operation from values in registers. */
11781 if (!out_result)
11782 return;
11784 if (short_mode)
11786 src = gen_lowpart (wmode, src);
11787 out_data = gen_lowpart (wmode, out_data);
11788 out_result = gen_lowpart (wmode, out_result);
11791 x = NULL_RTX;
11793 switch (code)
11795 case MINUS:
11796 case PLUS:
11797 x = gen_rtx_PLUS (wmode, out_data, src);
11798 break;
11799 case IOR:
11800 x = gen_rtx_IOR (wmode, out_data, src);
11801 break;
11802 case XOR:
11803 x = gen_rtx_XOR (wmode, out_data, src);
11804 break;
11805 case AND:
11806 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
11807 return;
11808 default:
11809 gcc_unreachable ();
11812 emit_set_insn (out_result, x);
11814 return;
11817 /* Split an atomic operation. */
11819 void
11820 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
11821 rtx value, rtx model_rtx, rtx cond)
11823 machine_mode mode = GET_MODE (mem);
11824 machine_mode wmode = (mode == DImode ? DImode : SImode);
11825 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
11826 const bool is_sync = is_mm_sync (model);
11827 rtx_code_label *label;
11828 rtx x;
11830 /* Split the atomic operation into a sequence. */
11831 label = gen_label_rtx ();
11832 emit_label (label);
11834 if (new_out)
11835 new_out = gen_lowpart (wmode, new_out);
11836 if (old_out)
11837 old_out = gen_lowpart (wmode, old_out);
11838 else
11839 old_out = new_out;
11840 value = simplify_gen_subreg (wmode, value, mode, 0);
11842 /* The initial load can be relaxed for a __sync operation since a final
11843 barrier will be emitted to stop code hoisting. */
11844 if (is_sync)
11845 aarch64_emit_load_exclusive (mode, old_out, mem,
11846 GEN_INT (MEMMODEL_RELAXED));
11847 else
11848 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
11850 switch (code)
11852 case SET:
11853 new_out = value;
11854 break;
11856 case NOT:
11857 x = gen_rtx_AND (wmode, old_out, value);
11858 emit_insn (gen_rtx_SET (new_out, x));
11859 x = gen_rtx_NOT (wmode, new_out);
11860 emit_insn (gen_rtx_SET (new_out, x));
11861 break;
11863 case MINUS:
11864 if (CONST_INT_P (value))
11866 value = GEN_INT (-INTVAL (value));
11867 code = PLUS;
11869 /* Fall through. */
11871 default:
11872 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
11873 emit_insn (gen_rtx_SET (new_out, x));
11874 break;
11877 aarch64_emit_store_exclusive (mode, cond, mem,
11878 gen_lowpart (mode, new_out), model_rtx);
11880 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
11881 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11882 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
11883 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11885 /* Emit any final barrier needed for a __sync operation. */
11886 if (is_sync)
11887 aarch64_emit_post_barrier (model);
11890 static void
11891 aarch64_init_libfuncs (void)
11893 /* Half-precision float operations. The compiler handles all operations
11894 with NULL libfuncs by converting to SFmode. */
11896 /* Conversions. */
11897 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
11898 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
11900 /* Arithmetic. */
11901 set_optab_libfunc (add_optab, HFmode, NULL);
11902 set_optab_libfunc (sdiv_optab, HFmode, NULL);
11903 set_optab_libfunc (smul_optab, HFmode, NULL);
11904 set_optab_libfunc (neg_optab, HFmode, NULL);
11905 set_optab_libfunc (sub_optab, HFmode, NULL);
11907 /* Comparisons. */
11908 set_optab_libfunc (eq_optab, HFmode, NULL);
11909 set_optab_libfunc (ne_optab, HFmode, NULL);
11910 set_optab_libfunc (lt_optab, HFmode, NULL);
11911 set_optab_libfunc (le_optab, HFmode, NULL);
11912 set_optab_libfunc (ge_optab, HFmode, NULL);
11913 set_optab_libfunc (gt_optab, HFmode, NULL);
11914 set_optab_libfunc (unord_optab, HFmode, NULL);
11917 /* Target hook for c_mode_for_suffix. */
11918 static machine_mode
11919 aarch64_c_mode_for_suffix (char suffix)
11921 if (suffix == 'q')
11922 return TFmode;
11924 return VOIDmode;
11927 /* We can only represent floating point constants which will fit in
11928 "quarter-precision" values. These values are characterised by
11929 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
11932 (-1)^s * (n/16) * 2^r
11934 Where:
11935 's' is the sign bit.
11936 'n' is an integer in the range 16 <= n <= 31.
11937 'r' is an integer in the range -3 <= r <= 4. */
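/* For example, s = 0, n = 16, r = -1 encodes (16/16) * 2^-1 = 0.5; the
   representable magnitudes therefore run from 16/16 * 2^-3 = 0.125 up to
   31/16 * 2^4 = 31.0.  */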
11939 /* Return true iff X can be represented as a quarter-precision
11940 floating point immediate operand. Note, we cannot represent 0.0. */
11941 bool
11942 aarch64_float_const_representable_p (rtx x)
11944 /* This represents our current view of how many bits
11945 make up the mantissa. */
11946 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
11947 int exponent;
11948 unsigned HOST_WIDE_INT mantissa, mask;
11949 REAL_VALUE_TYPE r, m;
11950 bool fail;
11952 if (!CONST_DOUBLE_P (x))
11953 return false;
11955 /* We don't support HFmode constants yet. */
11956 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
11957 return false;
11959 r = *CONST_DOUBLE_REAL_VALUE (x);
11961 /* We cannot represent infinities, NaNs or +/-zero. We won't
11962 know if we have +zero until we analyse the mantissa, but we
11963 can reject the other invalid values. */
11964 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
11965 || REAL_VALUE_MINUS_ZERO (r))
11966 return false;
11968 /* Extract exponent. */
11969 r = real_value_abs (&r);
11970 exponent = REAL_EXP (&r);
11972 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
11973 highest (sign) bit, with a fixed binary point at bit point_pos.
11974 m1 holds the low part of the mantissa, m2 the high part.
11975 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
11976 bits for the mantissa, this can fail (low bits will be lost). */
11977 real_ldexp (&m, &r, point_pos - exponent);
11978 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
11980 /* If the low part of the mantissa has bits set we cannot represent
11981 the value. */
11982 if (w.elt (0) != 0)
11983 return false;
11984 /* We have rejected the lower HOST_WIDE_INT, so update our
11985 understanding of how many bits lie in the mantissa and
11986 look only at the high HOST_WIDE_INT. */
11987 mantissa = w.elt (1);
11988 point_pos -= HOST_BITS_PER_WIDE_INT;
11990 /* We can only represent values with a mantissa of the form 1.xxxx. */
11991 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
11992 if ((mantissa & mask) != 0)
11993 return false;
11995 /* Having filtered unrepresentable values, we may now remove all
11996 but the highest 5 bits. */
11997 mantissa >>= point_pos - 5;
11999 /* We cannot represent the value 0.0, so reject it. This is handled
12000 elsewhere. */
12001 if (mantissa == 0)
12002 return false;
12004 /* Then, as bit 4 is always set, we can mask it off, leaving
12005 the mantissa in the range [0, 15]. */
12006 mantissa &= ~(1 << 4);
12007 gcc_assert (mantissa <= 15);
12009 /* GCC internally does not use IEEE754-like encoding (where normalized
12010 significands are in the range [1, 2)).  GCC uses [0.5, 1) (see real.c).
12011 Our mantissa values are shifted 4 places to the left relative to
12012 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12013 by 5 places to correct for GCC's representation. */
12014 exponent = 5 - exponent;
12016 return (exponent >= 0 && exponent <= 7);
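/* Illustrative sanity check of the mapping above: a value (n/16) * 2^r is
   1.xxxx * 2^r in IEEE terms, and since GCC normalizes significands to
   [0.5, 1), REAL_EXP returns r + 1.  The final test 5 - REAL_EXP in [0, 7]
   is therefore equivalent to r in [-3, 4], matching the format described
   before aarch64_float_const_representable_p.  E.g. for 1.0, REAL_EXP is 1
   and 5 - 1 = 4.  */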
12019 char*
12020 aarch64_output_simd_mov_immediate (rtx const_vector,
12021 machine_mode mode,
12022 unsigned width)
12024 bool is_valid;
12025 static char templ[40];
12026 const char *mnemonic;
12027 const char *shift_op;
12028 unsigned int lane_count = 0;
12029 char element_char;
12031 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
12033 /* This will return true to show that CONST_VECTOR is legal for use as
12034 an AdvSIMD MOVI (or, implicitly, MVNI) instruction immediate.  It will
12035 also update INFO to show how the immediate should be generated. */
12036 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
12037 gcc_assert (is_valid);
12039 element_char = sizetochar (info.element_width);
12040 lane_count = width / info.element_width;
12042 mode = GET_MODE_INNER (mode);
12043 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12045 gcc_assert (info.shift == 0 && ! info.mvn);
12046 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12047 move immediate path. */
12048 if (aarch64_float_const_zero_rtx_p (info.value))
12049 info.value = GEN_INT (0);
12050 else
12052 const unsigned int buf_size = 20;
12053 char float_buf[buf_size] = {'\0'};
12054 real_to_decimal_for_mode (float_buf,
12055 CONST_DOUBLE_REAL_VALUE (info.value),
12056 buf_size, buf_size, 1, mode);
12058 if (lane_count == 1)
12059 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
12060 else
12061 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
12062 lane_count, element_char, float_buf);
12063 return templ;
12067 mnemonic = info.mvn ? "mvni" : "movi";
12068 shift_op = info.msl ? "msl" : "lsl";
12070 gcc_assert (CONST_INT_P (info.value));
12071 if (lane_count == 1)
12072 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
12073 mnemonic, UINTVAL (info.value));
12074 else if (info.shift)
12075 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
12076 ", %s %d", mnemonic, lane_count, element_char,
12077 UINTVAL (info.value), shift_op, info.shift);
12078 else
12079 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
12080 mnemonic, lane_count, element_char, UINTVAL (info.value));
12081 return templ;
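/* Illustrative examples of the templates built above (assumed, not taken
   from the sources): a V4SImode vector of 1s produces "movi\t%0.4s, 0x1",
   while a V2DFmode vector of 1.0 takes the floating-point path and
   produces an "fmov\t%0.2d, <decimal constant>" template.  */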
12084 char*
12085 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
12086 machine_mode mode)
12088 machine_mode vmode;
12090 gcc_assert (!VECTOR_MODE_P (mode));
12091 vmode = aarch64_simd_container_mode (mode, 64);
12092 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
12093 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
12096 /* Split operands into moves from op[1] + op[2] into op[0]. */
12098 void
12099 aarch64_split_combinev16qi (rtx operands[3])
12101 unsigned int dest = REGNO (operands[0]);
12102 unsigned int src1 = REGNO (operands[1]);
12103 unsigned int src2 = REGNO (operands[2]);
12104 machine_mode halfmode = GET_MODE (operands[1]);
12105 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
12106 rtx destlo, desthi;
12108 gcc_assert (halfmode == V16QImode);
12110 if (src1 == dest && src2 == dest + halfregs)
12112 /* No-op move. Can't split to nothing; emit something. */
12113 emit_note (NOTE_INSN_DELETED);
12114 return;
12117 /* Preserve register attributes for variable tracking. */
12118 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
12119 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
12120 GET_MODE_SIZE (halfmode));
12122 /* Special case of reversed high/low parts. */
12123 if (reg_overlap_mentioned_p (operands[2], destlo)
12124 && reg_overlap_mentioned_p (operands[1], desthi))
12126 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12127 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
12128 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
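          /* The three XORs above swap the two source registers without a
             temporary, using the identity a ^= b; b ^= a; a ^= b
             (e.g. a = 5, b = 3 gives a = 6, b = 5, then a = 3).  */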
12130 else if (!reg_overlap_mentioned_p (operands[2], destlo))
12132 /* Try to avoid unnecessary moves if part of the result
12133 is in the right place already. */
12134 if (src1 != dest)
12135 emit_move_insn (destlo, operands[1]);
12136 if (src2 != dest + halfregs)
12137 emit_move_insn (desthi, operands[2]);
12139 else
12141 if (src2 != dest + halfregs)
12142 emit_move_insn (desthi, operands[2]);
12143 if (src1 != dest)
12144 emit_move_insn (destlo, operands[1]);
12148 /* vec_perm support. */
12150 #define MAX_VECT_LEN 16
12152 struct expand_vec_perm_d
12154 rtx target, op0, op1;
12155 unsigned char perm[MAX_VECT_LEN];
12156 machine_mode vmode;
12157 unsigned char nelt;
12158 bool one_vector_p;
12159 bool testing_p;
12162 /* Generate a variable permutation. */
12164 static void
12165 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
12167 machine_mode vmode = GET_MODE (target);
12168 bool one_vector_p = rtx_equal_p (op0, op1);
12170 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
12171 gcc_checking_assert (GET_MODE (op0) == vmode);
12172 gcc_checking_assert (GET_MODE (op1) == vmode);
12173 gcc_checking_assert (GET_MODE (sel) == vmode);
12174 gcc_checking_assert (TARGET_SIMD);
12176 if (one_vector_p)
12178 if (vmode == V8QImode)
12180 /* Expand the argument to a V16QI mode by duplicating it. */
12181 rtx pair = gen_reg_rtx (V16QImode);
12182 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
12183 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12185 else
12187 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
12190 else
12192 rtx pair;
12194 if (vmode == V8QImode)
12196 pair = gen_reg_rtx (V16QImode);
12197 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
12198 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12200 else
12202 pair = gen_reg_rtx (OImode);
12203 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
12204 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
12209 void
12210 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
12212 machine_mode vmode = GET_MODE (target);
12213 unsigned int nelt = GET_MODE_NUNITS (vmode);
12214 bool one_vector_p = rtx_equal_p (op0, op1);
12215 rtx mask;
12217 /* The TBL instruction does not use a modulo index, so we must take care
12218 of that ourselves. */
12219 mask = aarch64_simd_gen_const_vector_dup (vmode,
12220 one_vector_p ? nelt - 1 : 2 * nelt - 1);
12221 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
12223 /* For big-endian, we also need to reverse the index within the vector
12224 (but not which vector). */
12225 if (BYTES_BIG_ENDIAN)
12227 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
12228 if (!one_vector_p)
12229 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
12230 sel = expand_simple_binop (vmode, XOR, sel, mask,
12231 NULL, 0, OPTAB_LIB_WIDEN);
12233 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
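/* Worked example (illustrative): for a two-vector V16QImode permute nelt
   is 16, so each selector byte is ANDed with 31; an out-of-range index
   such as 37 becomes 5 and selects element 5 of the first vector, which
   matches the modulo behaviour the middle end expects.  */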
12236 /* Recognize patterns suitable for the TRN instructions. */
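/* E.g. (illustrative) for V4SImode, TRN1 corresponds to the selector
   { 0, 4, 2, 6 } and TRN2 to { 1, 5, 3, 7 }.  */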
12237 static bool
12238 aarch64_evpc_trn (struct expand_vec_perm_d *d)
12240 unsigned int i, odd, mask, nelt = d->nelt;
12241 rtx out, in0, in1, x;
12242 rtx (*gen) (rtx, rtx, rtx);
12243 machine_mode vmode = d->vmode;
12245 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12246 return false;
12248 /* Note that these are little-endian tests.
12249 We correct for big-endian later. */
12250 if (d->perm[0] == 0)
12251 odd = 0;
12252 else if (d->perm[0] == 1)
12253 odd = 1;
12254 else
12255 return false;
12256 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12258 for (i = 0; i < nelt; i += 2)
12260 if (d->perm[i] != i + odd)
12261 return false;
12262 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
12263 return false;
12266 /* Success! */
12267 if (d->testing_p)
12268 return true;
12270 in0 = d->op0;
12271 in1 = d->op1;
12272 if (BYTES_BIG_ENDIAN)
12274 x = in0, in0 = in1, in1 = x;
12275 odd = !odd;
12277 out = d->target;
12279 if (odd)
12281 switch (vmode)
12283 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
12284 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
12285 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
12286 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
12287 case V4SImode: gen = gen_aarch64_trn2v4si; break;
12288 case V2SImode: gen = gen_aarch64_trn2v2si; break;
12289 case V2DImode: gen = gen_aarch64_trn2v2di; break;
12290 case V4HFmode: gen = gen_aarch64_trn2v4hf; break;
12291 case V8HFmode: gen = gen_aarch64_trn2v8hf; break;
12292 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
12293 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
12294 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
12295 default:
12296 return false;
12299 else
12301 switch (vmode)
12303 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
12304 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
12305 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
12306 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
12307 case V4SImode: gen = gen_aarch64_trn1v4si; break;
12308 case V2SImode: gen = gen_aarch64_trn1v2si; break;
12309 case V2DImode: gen = gen_aarch64_trn1v2di; break;
12310 case V4HFmode: gen = gen_aarch64_trn1v4hf; break;
12311 case V8HFmode: gen = gen_aarch64_trn1v8hf; break;
12312 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
12313 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
12314 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
12315 default:
12316 return false;
12320 emit_insn (gen (out, in0, in1));
12321 return true;
12324 /* Recognize patterns suitable for the UZP instructions. */
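/* E.g. (illustrative) for V4SImode, UZP1 corresponds to the selector
   { 0, 2, 4, 6 } and UZP2 to { 1, 3, 5, 7 }.  */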
12325 static bool
12326 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
12328 unsigned int i, odd, mask, nelt = d->nelt;
12329 rtx out, in0, in1, x;
12330 rtx (*gen) (rtx, rtx, rtx);
12331 machine_mode vmode = d->vmode;
12333 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12334 return false;
12336 /* Note that these are little-endian tests.
12337 We correct for big-endian later. */
12338 if (d->perm[0] == 0)
12339 odd = 0;
12340 else if (d->perm[0] == 1)
12341 odd = 1;
12342 else
12343 return false;
12344 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12346 for (i = 0; i < nelt; i++)
12348 unsigned elt = (i * 2 + odd) & mask;
12349 if (d->perm[i] != elt)
12350 return false;
12353 /* Success! */
12354 if (d->testing_p)
12355 return true;
12357 in0 = d->op0;
12358 in1 = d->op1;
12359 if (BYTES_BIG_ENDIAN)
12361 x = in0, in0 = in1, in1 = x;
12362 odd = !odd;
12364 out = d->target;
12366 if (odd)
12368 switch (vmode)
12370 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
12371 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
12372 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
12373 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
12374 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
12375 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
12376 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
12377 case V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
12378 case V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
12379 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
12380 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
12381 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
12382 default:
12383 return false;
12386 else
12388 switch (vmode)
12390 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
12391 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
12392 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
12393 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
12394 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
12395 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
12396 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
12397 case V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
12398 case V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
12399 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
12400 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
12401 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
12402 default:
12403 return false;
12407 emit_insn (gen (out, in0, in1));
12408 return true;
12411 /* Recognize patterns suitable for the ZIP instructions. */
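/* E.g. (illustrative) for V4SImode, ZIP1 corresponds to the selector
   { 0, 4, 1, 5 } and ZIP2 to { 2, 6, 3, 7 }.  */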
12412 static bool
12413 aarch64_evpc_zip (struct expand_vec_perm_d *d)
12415 unsigned int i, high, mask, nelt = d->nelt;
12416 rtx out, in0, in1, x;
12417 rtx (*gen) (rtx, rtx, rtx);
12418 machine_mode vmode = d->vmode;
12420 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12421 return false;
12423 /* Note that these are little-endian tests.
12424 We correct for big-endian later. */
12425 high = nelt / 2;
12426 if (d->perm[0] == high)
12427 /* Do Nothing. */
12429 else if (d->perm[0] == 0)
12430 high = 0;
12431 else
12432 return false;
12433 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12435 for (i = 0; i < nelt / 2; i++)
12437 unsigned elt = (i + high) & mask;
12438 if (d->perm[i * 2] != elt)
12439 return false;
12440 elt = (elt + nelt) & mask;
12441 if (d->perm[i * 2 + 1] != elt)
12442 return false;
12445 /* Success! */
12446 if (d->testing_p)
12447 return true;
12449 in0 = d->op0;
12450 in1 = d->op1;
12451 if (BYTES_BIG_ENDIAN)
12453 x = in0, in0 = in1, in1 = x;
12454 high = !high;
12456 out = d->target;
12458 if (high)
12460 switch (vmode)
12462 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
12463 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
12464 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
12465 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
12466 case V4SImode: gen = gen_aarch64_zip2v4si; break;
12467 case V2SImode: gen = gen_aarch64_zip2v2si; break;
12468 case V2DImode: gen = gen_aarch64_zip2v2di; break;
12469 case V4HFmode: gen = gen_aarch64_zip2v4hf; break;
12470 case V8HFmode: gen = gen_aarch64_zip2v8hf; break;
12471 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
12472 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
12473 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
12474 default:
12475 return false;
12478 else
12480 switch (vmode)
12482 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
12483 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
12484 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
12485 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
12486 case V4SImode: gen = gen_aarch64_zip1v4si; break;
12487 case V2SImode: gen = gen_aarch64_zip1v2si; break;
12488 case V2DImode: gen = gen_aarch64_zip1v2di; break;
12489 case V4HFmode: gen = gen_aarch64_zip1v4hf; break;
12490 case V8HFmode: gen = gen_aarch64_zip1v8hf; break;
12491 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
12492 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
12493 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
12494 default:
12495 return false;
12499 emit_insn (gen (out, in0, in1));
12500 return true;
12503 /* Recognize patterns for the EXT insn. */
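/* E.g. (illustrative) for V4SImode with a single input vector, the
   selector { 1, 2, 3, 0 } is matched and expands to an EXT starting at
   element 1; with two input vectors, { 1, 2, 3, 4 } is matched in the
   same way.  */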
12505 static bool
12506 aarch64_evpc_ext (struct expand_vec_perm_d *d)
12508 unsigned int i, nelt = d->nelt;
12509 rtx (*gen) (rtx, rtx, rtx, rtx);
12510 rtx offset;
12512 unsigned int location = d->perm[0]; /* Always < nelt. */
12514 /* Check if the extracted indices are increasing by one. */
12515 for (i = 1; i < nelt; i++)
12517 unsigned int required = location + i;
12518 if (d->one_vector_p)
12520 /* We'll pass the same vector in twice, so allow indices to wrap. */
12521 required &= (nelt - 1);
12523 if (d->perm[i] != required)
12524 return false;
12527 switch (d->vmode)
12529 case V16QImode: gen = gen_aarch64_extv16qi; break;
12530 case V8QImode: gen = gen_aarch64_extv8qi; break;
12531 case V4HImode: gen = gen_aarch64_extv4hi; break;
12532 case V8HImode: gen = gen_aarch64_extv8hi; break;
12533 case V2SImode: gen = gen_aarch64_extv2si; break;
12534 case V4SImode: gen = gen_aarch64_extv4si; break;
12535 case V4HFmode: gen = gen_aarch64_extv4hf; break;
12536 case V8HFmode: gen = gen_aarch64_extv8hf; break;
12537 case V2SFmode: gen = gen_aarch64_extv2sf; break;
12538 case V4SFmode: gen = gen_aarch64_extv4sf; break;
12539 case V2DImode: gen = gen_aarch64_extv2di; break;
12540 case V2DFmode: gen = gen_aarch64_extv2df; break;
12541 default:
12542 return false;
12545 /* Success! */
12546 if (d->testing_p)
12547 return true;
12549 /* The case where (location == 0) is a no-op for both big- and little-endian,
12550 and is removed by the mid-end at optimization levels -O1 and higher. */
12552 if (BYTES_BIG_ENDIAN && (location != 0))
12554 /* After setup, we want the high elements of the first vector (stored
12555 at the LSB end of the register), and the low elements of the second
12556 vector (stored at the MSB end of the register). So swap. */
12557 std::swap (d->op0, d->op1);
12558 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
12559 location = nelt - location;
12562 offset = GEN_INT (location);
12563 emit_insn (gen (d->target, d->op0, d->op1, offset));
12564 return true;
12567 /* Recognize patterns for the REV insns. */
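/* E.g. (illustrative) a V4SImode selector { 1, 0, 3, 2 } has diff 1 and
   maps to REV64 (reversing the 32-bit elements within each 64-bit chunk),
   while a V8QImode selector { 3, 2, 1, 0, 7, 6, 5, 4 } has diff 3 and
   maps to REV32.  */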
12569 static bool
12570 aarch64_evpc_rev (struct expand_vec_perm_d *d)
12572 unsigned int i, j, diff, nelt = d->nelt;
12573 rtx (*gen) (rtx, rtx);
12575 if (!d->one_vector_p)
12576 return false;
12578 diff = d->perm[0];
12579 switch (diff)
12581 case 7:
12582 switch (d->vmode)
12584 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
12585 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
12586 default:
12587 return false;
12589 break;
12590 case 3:
12591 switch (d->vmode)
12593 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
12594 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
12595 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
12596 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
12597 default:
12598 return false;
12600 break;
12601 case 1:
12602 switch (d->vmode)
12604 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
12605 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
12606 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
12607 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
12608 case V4SImode: gen = gen_aarch64_rev64v4si; break;
12609 case V2SImode: gen = gen_aarch64_rev64v2si; break;
12610 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
12611 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
12612 case V8HFmode: gen = gen_aarch64_rev64v8hf; break;
12613 case V4HFmode: gen = gen_aarch64_rev64v4hf; break;
12614 default:
12615 return false;
12617 break;
12618 default:
12619 return false;
12622 for (i = 0; i < nelt ; i += diff + 1)
12623 for (j = 0; j <= diff; j += 1)
12625 /* This is guaranteed to be true as the value of diff
12626 is 7, 3 or 1, and we should have enough elements in the
12627 queue to generate this.  Getting a vector mask with a
12628 value of diff other than these values implies that
12629 something is wrong by the time we get here. */
12630 gcc_assert (i + j < nelt);
12631 if (d->perm[i + j] != i + diff - j)
12632 return false;
12635 /* Success! */
12636 if (d->testing_p)
12637 return true;
12639 emit_insn (gen (d->target, d->op0));
12640 return true;
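/* Recognize patterns where every selector element is the same, which map
   to a DUP of a single lane, e.g. (illustrative) { 2, 2, 2, 2 } for
   V4SImode becomes a dup of lane 2 of the first operand.  */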
12643 static bool
12644 aarch64_evpc_dup (struct expand_vec_perm_d *d)
12646 rtx (*gen) (rtx, rtx, rtx);
12647 rtx out = d->target;
12648 rtx in0;
12649 machine_mode vmode = d->vmode;
12650 unsigned int i, elt, nelt = d->nelt;
12651 rtx lane;
12653 elt = d->perm[0];
12654 for (i = 1; i < nelt; i++)
12656 if (elt != d->perm[i])
12657 return false;
12660 /* The generic preparation in aarch64_expand_vec_perm_const_1
12661 swaps the operand order and the permute indices if it finds
12662 d->perm[0] to be in the second operand. Thus, we can always
12663 use d->op0 and need not do any extra arithmetic to get the
12664 correct lane number. */
12665 in0 = d->op0;
12666 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
12668 switch (vmode)
12670 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
12671 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
12672 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
12673 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
12674 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
12675 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
12676 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
12677 case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
12678 case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
12679 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
12680 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
12681 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
12682 default:
12683 return false;
12686 emit_insn (gen (out, in0, lane));
12687 return true;
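/* Fall back to a table-based permute (TBL), materializing the selector as
   a byte vector.  Only V8QImode and V16QImode are handled here, since the
   generic code retries any constant permutation with QImode elements.  */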
12690 static bool
12691 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
12693 rtx rperm[MAX_VECT_LEN], sel;
12694 machine_mode vmode = d->vmode;
12695 unsigned int i, nelt = d->nelt;
12697 if (d->testing_p)
12698 return true;
12700 /* Generic code will try constant permutation twice. Once with the
12701 original mode and again with the elements lowered to QImode.
12702 So wait and don't do the selector expansion ourselves. */
12703 if (vmode != V8QImode && vmode != V16QImode)
12704 return false;
12706 for (i = 0; i < nelt; ++i)
12708 int nunits = GET_MODE_NUNITS (vmode);
12710 /* If big-endian and two vectors we end up with a weird mixed-endian
12711 mode on NEON. Reverse the index within each word but not the word
12712 itself. */
12713 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
12714 : d->perm[i]);
12716 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
12717 sel = force_reg (vmode, sel);
12719 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
12720 return true;
12723 static bool
12724 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
12726 /* The pattern matching functions above are written to look for a small
12727 number to begin the sequence (0, 1, N/2). If we begin with an index
12728 from the second operand, we can swap the operands. */
12729 if (d->perm[0] >= d->nelt)
12731 unsigned i, nelt = d->nelt;
12733 gcc_assert (nelt == (nelt & -nelt));
12734 for (i = 0; i < nelt; ++i)
12735 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
12737 std::swap (d->op0, d->op1);
12740 if (TARGET_SIMD)
12742 if (aarch64_evpc_rev (d))
12743 return true;
12744 else if (aarch64_evpc_ext (d))
12745 return true;
12746 else if (aarch64_evpc_dup (d))
12747 return true;
12748 else if (aarch64_evpc_zip (d))
12749 return true;
12750 else if (aarch64_evpc_uzp (d))
12751 return true;
12752 else if (aarch64_evpc_trn (d))
12753 return true;
12754 return aarch64_evpc_tbl (d);
12756 return false;
12759 /* Expand a vec_perm_const pattern. */
12761 bool
12762 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
12764 struct expand_vec_perm_d d;
12765 int i, nelt, which;
12767 d.target = target;
12768 d.op0 = op0;
12769 d.op1 = op1;
12771 d.vmode = GET_MODE (target);
12772 gcc_assert (VECTOR_MODE_P (d.vmode));
12773 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
12774 d.testing_p = false;
12776 for (i = which = 0; i < nelt; ++i)
12778 rtx e = XVECEXP (sel, 0, i);
12779 int ei = INTVAL (e) & (2 * nelt - 1);
12780 which |= (ei < nelt ? 1 : 2);
12781 d.perm[i] = ei;
12784 switch (which)
12786 default:
12787 gcc_unreachable ();
12789 case 3:
12790 d.one_vector_p = false;
12791 if (!rtx_equal_p (op0, op1))
12792 break;
12794 /* The elements of PERM do not suggest that only the first operand
12795 is used, but both operands are identical. Allow easier matching
12796 of the permutation by folding the permutation into the single
12797 input vector. */
12798 /* Fall Through. */
12799 case 2:
12800 for (i = 0; i < nelt; ++i)
12801 d.perm[i] &= nelt - 1;
12802 d.op0 = op1;
12803 d.one_vector_p = true;
12804 break;
12806 case 1:
12807 d.op1 = op0;
12808 d.one_vector_p = true;
12809 break;
12812 return aarch64_expand_vec_perm_const_1 (&d);
12815 static bool
12816 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
12817 const unsigned char *sel)
12819 struct expand_vec_perm_d d;
12820 unsigned int i, nelt, which;
12821 bool ret;
12823 d.vmode = vmode;
12824 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
12825 d.testing_p = true;
12826 memcpy (d.perm, sel, nelt);
12828 /* Calculate whether all elements are in one vector. */
12829 for (i = which = 0; i < nelt; ++i)
12831 unsigned char e = d.perm[i];
12832 gcc_assert (e < 2 * nelt);
12833 which |= (e < nelt ? 1 : 2);
12836 /* If all elements are from the second vector, reindex as if from the
12837 first vector. */
12838 if (which == 2)
12839 for (i = 0; i < nelt; ++i)
12840 d.perm[i] -= nelt;
12842 /* Check whether the mask can be applied to a single vector. */
12843 d.one_vector_p = (which != 3);
12845 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
12846 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
12847 if (!d.one_vector_p)
12848 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
12850 start_sequence ();
12851 ret = aarch64_expand_vec_perm_const_1 (&d);
12852 end_sequence ();
12854 return ret;
12858 aarch64_reverse_mask (enum machine_mode mode)
12860 /* We have to reverse each vector because we don't have
12861 a permuted load that can reverse-load according to ABI rules. */
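  /* E.g. (illustrative) for V4SImode the mask built below is the byte
     selector { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 },
     i.e. the bytes of each 32-bit element are reversed in place.  */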
12862 rtx mask;
12863 rtvec v = rtvec_alloc (16);
12864 int i, j;
12865 int nunits = GET_MODE_NUNITS (mode);
12866 int usize = GET_MODE_UNIT_SIZE (mode);
12868 gcc_assert (BYTES_BIG_ENDIAN);
12869 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
12871 for (i = 0; i < nunits; i++)
12872 for (j = 0; j < usize; j++)
12873 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
12874 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
12875 return force_reg (V16QImode, mask);
12878 /* Implement MODES_TIEABLE_P. In principle we should always return true.
12879 However, due to issues with register allocation it is preferable to avoid
12880 tying integer scalar and FP scalar modes.  Executing integer operations
12881 in general registers is better than treating them as scalar vector
12882 operations. This reduces latency and avoids redundant int<->FP moves.
12883 So tie modes if they are either the same class, or vector modes with
12884 other vector modes, vector structs or any scalar mode.
12887 bool
12888 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
12890 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
12891 return true;
12893 /* We specifically want to allow elements of "structure" modes to
12894 be tieable to the structure. This more general condition allows
12895 other rarer situations too. */
12896 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
12897 return true;
12899 /* Also allow any scalar modes with vectors. */
12900 if (aarch64_vector_mode_supported_p (mode1)
12901 || aarch64_vector_mode_supported_p (mode2))
12902 return true;
12904 return false;
12907 /* Return a new RTX holding the result of moving POINTER forward by
12908 AMOUNT bytes. */
12910 static rtx
12911 aarch64_move_pointer (rtx pointer, int amount)
12913 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
12915 return adjust_automodify_address (pointer, GET_MODE (pointer),
12916 next, amount);
12919 /* Return a new RTX holding the result of moving POINTER forward by the
12920 size of the mode it points to. */
12922 static rtx
12923 aarch64_progress_pointer (rtx pointer)
12925 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
12927 return aarch64_move_pointer (pointer, amount);
12930 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
12931 MODE bytes. */
12933 static void
12934 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
12935 machine_mode mode)
12937 rtx reg = gen_reg_rtx (mode);
12939 /* "Cast" the pointers to the correct mode. */
12940 *src = adjust_address (*src, mode, 0);
12941 *dst = adjust_address (*dst, mode, 0);
12942 /* Emit the memcpy. */
12943 emit_move_insn (reg, *src);
12944 emit_move_insn (*dst, reg);
12945 /* Move the pointers forward. */
12946 *src = aarch64_progress_pointer (*src);
12947 *dst = aarch64_progress_pointer (*dst);
12950 /* Expand movmem, as if from a __builtin_memcpy. Return true if
12951 we succeed, otherwise return false. */
12953 bool
12954 aarch64_expand_movmem (rtx *operands)
12956 unsigned int n;
12957 rtx dst = operands[0];
12958 rtx src = operands[1];
12959 rtx base;
12960 bool speed_p = !optimize_function_for_size_p (cfun);
12962 /* When optimizing for size, give a better estimate of the length of a
12963 memcpy call, but use the default otherwise. */
12964 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
12966 /* We can't do anything smart if the amount to copy is not constant. */
12967 if (!CONST_INT_P (operands[2]))
12968 return false;
12970 n = UINTVAL (operands[2]);
12972 /* Try to keep the number of instructions low. For cases below 16 bytes we
12973 need to make at most two moves. For cases above 16 bytes it will be one
12974 move for each 16 byte chunk, then at most two additional moves. */
12975 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
12976 return false;
12978 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
12979 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
12981 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
12982 src = adjust_automodify_address (src, VOIDmode, base, 0);
12984 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
12985 1-byte chunk. */
12986 if (n < 4)
12988 if (n >= 2)
12990 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
12991 n -= 2;
12994 if (n == 1)
12995 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
12997 return true;
13000 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
13001 4-byte chunk, partially overlapping with the previously copied chunk. */
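  /* E.g. (illustrative) for n == 7 this copies bytes 0-3, then backs the
     pointers up by one and copies bytes 3-6, so the two 4-byte accesses
     overlap on byte 3 instead of falling back to smaller copies.  */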
13002 if (n < 8)
13004 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13005 n -= 4;
13006 if (n > 0)
13008 int move = n - 4;
13010 src = aarch64_move_pointer (src, move);
13011 dst = aarch64_move_pointer (dst, move);
13012 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13014 return true;
13017 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
13018 them, then (if applicable) an 8-byte chunk. */
13019 while (n >= 8)
13021 if (n / 16)
13023 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
13024 n -= 16;
13026 else
13028 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13029 n -= 8;
13033 /* Finish the final bytes of the copy. We can always do this in one
13034 instruction. We either copy the exact amount we need, or partially
13035 overlap with the previous chunk we copied and copy 8 bytes. */
13036 if (n == 0)
13037 return true;
13038 else if (n == 1)
13039 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13040 else if (n == 2)
13041 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13042 else if (n == 4)
13043 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13044 else
13046 if (n == 3)
13048 src = aarch64_move_pointer (src, -1);
13049 dst = aarch64_move_pointer (dst, -1);
13050 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13052 else
13054 int move = n - 8;
13056 src = aarch64_move_pointer (src, move);
13057 dst = aarch64_move_pointer (dst, move);
13058 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13062 return true;
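/* E.g. (illustrative) a 25-byte copy expands to one 16-byte (TImode)
   block, one 8-byte (DImode) block and one trailing byte, i.e. three
   load/store pairs in total.  */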
13065 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
13067 static unsigned HOST_WIDE_INT
13068 aarch64_asan_shadow_offset (void)
13070 return (HOST_WIDE_INT_1 << 36);
13073 static bool
13074 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
13075 unsigned int align,
13076 enum by_pieces_operation op,
13077 bool speed_p)
13079 /* STORE_BY_PIECES can be used when copying a constant string, but
13080 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
13081 For now we always fail this and let the move_by_pieces code copy
13082 the string from read-only memory. */
13083 if (op == STORE_BY_PIECES)
13084 return false;
13086 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
13089 static rtx
13090 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
13091 int code, tree treeop0, tree treeop1)
13093 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13094 rtx op0, op1;
13095 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13096 insn_code icode;
13097 struct expand_operand ops[4];
13099 start_sequence ();
13100 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13102 op_mode = GET_MODE (op0);
13103 if (op_mode == VOIDmode)
13104 op_mode = GET_MODE (op1);
13106 switch (op_mode)
13108 case QImode:
13109 case HImode:
13110 case SImode:
13111 cmp_mode = SImode;
13112 icode = CODE_FOR_cmpsi;
13113 break;
13115 case DImode:
13116 cmp_mode = DImode;
13117 icode = CODE_FOR_cmpdi;
13118 break;
13120 case SFmode:
13121 cmp_mode = SFmode;
13122 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13123 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
13124 break;
13126 case DFmode:
13127 cmp_mode = DFmode;
13128 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13129 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
13130 break;
13132 default:
13133 end_sequence ();
13134 return NULL_RTX;
13137 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
13138 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
13139 if (!op0 || !op1)
13141 end_sequence ();
13142 return NULL_RTX;
13144 *prep_seq = get_insns ();
13145 end_sequence ();
13147 create_fixed_operand (&ops[0], op0);
13148 create_fixed_operand (&ops[1], op1);
13150 start_sequence ();
13151 if (!maybe_expand_insn (icode, 2, ops))
13153 end_sequence ();
13154 return NULL_RTX;
13156 *gen_seq = get_insns ();
13157 end_sequence ();
13159 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
13160 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
13163 static rtx
13164 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
13165 tree treeop0, tree treeop1, int bit_code)
13167 rtx op0, op1, target;
13168 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13169 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13170 insn_code icode;
13171 struct expand_operand ops[6];
13172 int aarch64_cond;
13174 push_to_sequence ((rtx_insn*) *prep_seq);
13175 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13177 op_mode = GET_MODE (op0);
13178 if (op_mode == VOIDmode)
13179 op_mode = GET_MODE (op1);
13181 switch (op_mode)
13183 case QImode:
13184 case HImode:
13185 case SImode:
13186 cmp_mode = SImode;
13187 icode = CODE_FOR_ccmpsi;
13188 break;
13190 case DImode:
13191 cmp_mode = DImode;
13192 icode = CODE_FOR_ccmpdi;
13193 break;
13195 case SFmode:
13196 cmp_mode = SFmode;
13197 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13198 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
13199 break;
13201 case DFmode:
13202 cmp_mode = DFmode;
13203 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13204 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
13205 break;
13207 default:
13208 end_sequence ();
13209 return NULL_RTX;
13212 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
13213 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
13214 if (!op0 || !op1)
13216 end_sequence ();
13217 return NULL_RTX;
13219 *prep_seq = get_insns ();
13220 end_sequence ();
13222 target = gen_rtx_REG (cc_mode, CC_REGNUM);
13223 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
13225 if (bit_code != AND)
13227 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
13228 GET_MODE (XEXP (prev, 0))),
13229 VOIDmode, XEXP (prev, 0), const0_rtx);
13230 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
13233 create_fixed_operand (&ops[0], XEXP (prev, 0));
13234 create_fixed_operand (&ops[1], target);
13235 create_fixed_operand (&ops[2], op0);
13236 create_fixed_operand (&ops[3], op1);
13237 create_fixed_operand (&ops[4], prev);
13238 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
13240 push_to_sequence ((rtx_insn*) *gen_seq);
13241 if (!maybe_expand_insn (icode, 6, ops))
13243 end_sequence ();
13244 return NULL_RTX;
13247 *gen_seq = get_insns ();
13248 end_sequence ();
13250 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
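/* Taken together, aarch64_gen_ccmp_first and aarch64_gen_ccmp_next let the
   middle end expand a chained condition such as "a == 0 && b > 42"
   (illustrative) into a compare followed by a conditional compare,
   roughly:

     cmp   w0, 0
     ccmp  w1, 42, #<nzcv>, eq
     b.gt  ...

   where <nzcv> encodes the flag state substituted when the first
   comparison fails, so that the final condition then evaluates to
   false.  */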
13253 #undef TARGET_GEN_CCMP_FIRST
13254 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
13256 #undef TARGET_GEN_CCMP_NEXT
13257 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
13259 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
13260 instruction fusion of some sort. */
13262 static bool
13263 aarch64_macro_fusion_p (void)
13265 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
13269 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
13270 should be kept together during scheduling. */
13272 static bool
13273 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
13275 rtx set_dest;
13276 rtx prev_set = single_set (prev);
13277 rtx curr_set = single_set (curr);
13278 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
13279 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
13281 if (!aarch64_macro_fusion_p ())
13282 return false;
13284 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
13286 /* We are trying to match:
13287 prev (mov) == (set (reg r0) (const_int imm16))
13288 curr (movk) == (set (zero_extract (reg r0)
13289 (const_int 16)
13290 (const_int 16))
13291 (const_int imm16_1)) */
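      /* In assembly terms this is (illustrative) a pair such as
           mov  x0, 0x1234
           movk x0, 0x5678, lsl 16
         which suitable cores can fuse into a single macro-op.  */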
13293 set_dest = SET_DEST (curr_set);
13295 if (GET_CODE (set_dest) == ZERO_EXTRACT
13296 && CONST_INT_P (SET_SRC (curr_set))
13297 && CONST_INT_P (SET_SRC (prev_set))
13298 && CONST_INT_P (XEXP (set_dest, 2))
13299 && INTVAL (XEXP (set_dest, 2)) == 16
13300 && REG_P (XEXP (set_dest, 0))
13301 && REG_P (SET_DEST (prev_set))
13302 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
13304 return true;
13308 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
13311 /* We're trying to match:
13312 prev (adrp) == (set (reg r1)
13313 (high (symbol_ref ("SYM"))))
13314 curr (add) == (set (reg r0)
13315 (lo_sum (reg r1)
13316 (symbol_ref ("SYM"))))
13317 Note that r0 need not necessarily be the same as r1, especially
13318 during pre-regalloc scheduling. */
13320 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13321 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13323 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
13324 && REG_P (XEXP (SET_SRC (curr_set), 0))
13325 && REGNO (XEXP (SET_SRC (curr_set), 0))
13326 == REGNO (SET_DEST (prev_set))
13327 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
13328 XEXP (SET_SRC (curr_set), 1)))
13329 return true;
13333 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
13336 /* We're trying to match:
13337 prev (movk) == (set (zero_extract (reg r0)
13338 (const_int 16)
13339 (const_int 32))
13340 (const_int imm16_1))
13341 curr (movk) == (set (zero_extract (reg r0)
13342 (const_int 16)
13343 (const_int 48))
13344 (const_int imm16_2)) */
13346 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
13347 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
13348 && REG_P (XEXP (SET_DEST (prev_set), 0))
13349 && REG_P (XEXP (SET_DEST (curr_set), 0))
13350 && REGNO (XEXP (SET_DEST (prev_set), 0))
13351 == REGNO (XEXP (SET_DEST (curr_set), 0))
13352 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
13353 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
13354 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
13355 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
13356 && CONST_INT_P (SET_SRC (prev_set))
13357 && CONST_INT_P (SET_SRC (curr_set)))
13358 return true;
13361 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
13363 /* We're trying to match:
13364 prev (adrp) == (set (reg r0)
13365 (high (symbol_ref ("SYM"))))
13366 curr (ldr) == (set (reg r1)
13367 (mem (lo_sum (reg r0)
13368 (symbol_ref ("SYM")))))
13370 curr (ldr) == (set (reg r1)
13371 (zero_extend (mem
13372 (lo_sum (reg r0)
13373 (symbol_ref ("SYM")))))) */
13374 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13375 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13377 rtx curr_src = SET_SRC (curr_set);
13379 if (GET_CODE (curr_src) == ZERO_EXTEND)
13380 curr_src = XEXP (curr_src, 0);
13382 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
13383 && REG_P (XEXP (XEXP (curr_src, 0), 0))
13384 && REGNO (XEXP (XEXP (curr_src, 0), 0))
13385 == REGNO (SET_DEST (prev_set))
13386 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
13387 XEXP (SET_SRC (prev_set), 0)))
13388 return true;
13392 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
13393 && aarch_crypto_can_dual_issue (prev, curr))
13394 return true;
13396 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
13397 && any_condjump_p (curr))
13399 enum attr_type prev_type = get_attr_type (prev);
13401 /* FIXME: this misses some instructions which are considered simple
13402 arithmetic instructions for ThunderX.  Simple shifts are missed here. */
13403 if (prev_type == TYPE_ALUS_SREG
13404 || prev_type == TYPE_ALUS_IMM
13405 || prev_type == TYPE_LOGICS_REG
13406 || prev_type == TYPE_LOGICS_IMM)
13407 return true;
13410 return false;
13413 /* Return true iff the instruction fusion described by OP is enabled. */
13415 bool
13416 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
13418 return (aarch64_tune_params.fusible_ops & op) != 0;
13421 /* If MEM is in the form of [base+offset], extract the two parts
13422 of the address and store them in BASE and OFFSET; otherwise return false
13423 after clearing BASE and OFFSET. */
13425 bool
13426 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
13428 rtx addr;
13430 gcc_assert (MEM_P (mem));
13432 addr = XEXP (mem, 0);
13434 if (REG_P (addr))
13436 *base = addr;
13437 *offset = const0_rtx;
13438 return true;
13441 if (GET_CODE (addr) == PLUS
13442 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
13444 *base = XEXP (addr, 0);
13445 *offset = XEXP (addr, 1);
13446 return true;
13449 *base = NULL_RTX;
13450 *offset = NULL_RTX;
13452 return false;
13455 /* Types for scheduling fusion. */
13456 enum sched_fusion_type
13458 SCHED_FUSION_NONE = 0,
13459 SCHED_FUSION_LD_SIGN_EXTEND,
13460 SCHED_FUSION_LD_ZERO_EXTEND,
13461 SCHED_FUSION_LD,
13462 SCHED_FUSION_ST,
13463 SCHED_FUSION_NUM
13466 /* If INSN is a load or store whose address is in the form of [base+offset],
13467 extract the two parts and store them in BASE and OFFSET.  Return the
13468 scheduling fusion type of this INSN. */
13470 static enum sched_fusion_type
13471 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
13473 rtx x, dest, src;
13474 enum sched_fusion_type fusion = SCHED_FUSION_LD;
13476 gcc_assert (INSN_P (insn));
13477 x = PATTERN (insn);
13478 if (GET_CODE (x) != SET)
13479 return SCHED_FUSION_NONE;
13481 src = SET_SRC (x);
13482 dest = SET_DEST (x);
13484 machine_mode dest_mode = GET_MODE (dest);
13486 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
13487 return SCHED_FUSION_NONE;
13489 if (GET_CODE (src) == SIGN_EXTEND)
13491 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
13492 src = XEXP (src, 0);
13493 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13494 return SCHED_FUSION_NONE;
13496 else if (GET_CODE (src) == ZERO_EXTEND)
13498 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
13499 src = XEXP (src, 0);
13500 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13501 return SCHED_FUSION_NONE;
13504 if (GET_CODE (src) == MEM && REG_P (dest))
13505 extract_base_offset_in_addr (src, base, offset);
13506 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
13508 fusion = SCHED_FUSION_ST;
13509 extract_base_offset_in_addr (dest, base, offset);
13511 else
13512 return SCHED_FUSION_NONE;
13514 if (*base == NULL_RTX || *offset == NULL_RTX)
13515 fusion = SCHED_FUSION_NONE;
13517 return fusion;
13520 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
13522 Currently we only support fusing ldr and str instructions, so FUSION_PRI
13523 and PRI are only calculated for these instructions.  For other instructions,
13524 FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
13525 types of instruction fusion can be added by returning different priorities.
13527 It's important that irrelevant instructions get the largest FUSION_PRI. */
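/* E.g. (illustrative) "ldr w1, [x2, 4]" and "ldr w3, [x2, 8]" share the
   same fusion type and base register, so they get the same FUSION_PRI and
   nearby PRI values (the smaller offset getting the higher priority),
   which encourages the scheduler to keep them adjacent for later ldp
   formation.  */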
13529 static void
13530 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
13531 int *fusion_pri, int *pri)
13533 int tmp, off_val;
13534 rtx base, offset;
13535 enum sched_fusion_type fusion;
13537 gcc_assert (INSN_P (insn));
13539 tmp = max_pri - 1;
13540 fusion = fusion_load_store (insn, &base, &offset);
13541 if (fusion == SCHED_FUSION_NONE)
13543 *pri = tmp;
13544 *fusion_pri = tmp;
13545 return;
13548 /* Set FUSION_PRI according to fusion type and base register. */
13549 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
13551 /* Calculate PRI. */
13552 tmp /= 2;
13554 /* The INSN with the smaller offset goes first. */
13555 off_val = (int)(INTVAL (offset));
13556 if (off_val >= 0)
13557 tmp -= (off_val & 0xfffff);
13558 else
13559 tmp += ((- off_val) & 0xfffff);
13561 *pri = tmp;
13562 return;
13565 /* Given OPERANDS of consecutive load/store, check if we can merge
13566 them into ldp/stp. LOAD is true if they are load instructions.
13567 MODE is the mode of memory operands. */
13569 bool
13570 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
13571 enum machine_mode mode)
13573 HOST_WIDE_INT offval_1, offval_2, msize;
13574 enum reg_class rclass_1, rclass_2;
13575 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
13577 if (load)
13579 mem_1 = operands[1];
13580 mem_2 = operands[3];
13581 reg_1 = operands[0];
13582 reg_2 = operands[2];
13583 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
13584 if (REGNO (reg_1) == REGNO (reg_2))
13585 return false;
13587 else
13589 mem_1 = operands[0];
13590 mem_2 = operands[2];
13591 reg_1 = operands[1];
13592 reg_2 = operands[3];
13595 /* The mems cannot be volatile. */
13596 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
13597 return false;
13599 /* Check if the addresses are in the form of [base+offset]. */
13600 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
13601 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
13602 return false;
13603 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
13604 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
13605 return false;
13607 /* Check if the bases are the same. */
13608 if (!rtx_equal_p (base_1, base_2))
13609 return false;
13611 offval_1 = INTVAL (offset_1);
13612 offval_2 = INTVAL (offset_2);
13613 msize = GET_MODE_SIZE (mode);
13614 /* Check if the offsets are consecutive. */
13615 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
13616 return false;
13618 /* Check if the addresses are clobbered by load. */
13619 if (load)
13621 if (reg_mentioned_p (reg_1, mem_1))
13622 return false;
13624 /* In increasing order, the last load can clobber the address. */
13625 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
13626 return false;
13629 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
13630 rclass_1 = FP_REGS;
13631 else
13632 rclass_1 = GENERAL_REGS;
13634 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
13635 rclass_2 = FP_REGS;
13636 else
13637 rclass_2 = GENERAL_REGS;
13639 /* Check if the registers are of the same class. */
13640 if (rclass_1 != rclass_2)
13641 return false;
13643 return true;
13646 /* Given OPERANDS of consecutive load/store, check if we can merge
13647 them into ldp/stp by adjusting the offset. LOAD is true if they
13648 are load instructions. MODE is the mode of memory operands.
13650 Given below consecutive stores:
13652 str w1, [xb, 0x100]
13653 str w1, [xb, 0x104]
13654 str w1, [xb, 0x108]
13655 str w1, [xb, 0x10c]
13657 Though the offsets are out of the range supported by stp, we can
13658 still pair them after adjusting the offset, like:
13660 add scratch, xb, 0x100
13661 stp w1, w1, [scratch]
13662 stp w1, w1, [scratch, 0x8]
13664 The peephole patterns detecting this opportunity should guarantee
13665 the scratch register is available. */
13667 bool
13668 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
13669 enum machine_mode mode)
13671 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
13672 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
13673 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
13674 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
13676 if (load)
13678 reg_1 = operands[0];
13679 mem_1 = operands[1];
13680 reg_2 = operands[2];
13681 mem_2 = operands[3];
13682 reg_3 = operands[4];
13683 mem_3 = operands[5];
13684 reg_4 = operands[6];
13685 mem_4 = operands[7];
13686 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
13687 && REG_P (reg_3) && REG_P (reg_4));
13688 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
13689 return false;
13691 else
13693 mem_1 = operands[0];
13694 reg_1 = operands[1];
13695 mem_2 = operands[2];
13696 reg_2 = operands[3];
13697 mem_3 = operands[4];
13698 reg_3 = operands[5];
13699 mem_4 = operands[6];
13700 reg_4 = operands[7];
13702 /* Skip if the memory operand is by itself valid for ldp/stp. */
13703 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
13704 return false;
13706 /* The mems cannot be volatile. */
13707 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
13708 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
13709 return false;
13711 /* Check if the addresses are in the form of [base+offset]. */
13712 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
13713 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
13714 return false;
13715 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
13716 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
13717 return false;
13718 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
13719 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
13720 return false;
13721 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
13722 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
13723 return false;
13725 /* Check if the bases are the same. */
13726 if (!rtx_equal_p (base_1, base_2)
13727 || !rtx_equal_p (base_2, base_3)
13728 || !rtx_equal_p (base_3, base_4))
13729 return false;
13731 offval_1 = INTVAL (offset_1);
13732 offval_2 = INTVAL (offset_2);
13733 offval_3 = INTVAL (offset_3);
13734 offval_4 = INTVAL (offset_4);
13735 msize = GET_MODE_SIZE (mode);
13736 /* Check if the offsets are consecutive. */
13737 if ((offval_1 != (offval_2 + msize)
13738 || offval_1 != (offval_3 + msize * 2)
13739 || offval_1 != (offval_4 + msize * 3))
13740 && (offval_4 != (offval_3 + msize)
13741 || offval_4 != (offval_2 + msize * 2)
13742 || offval_4 != (offval_1 + msize * 3)))
13743 return false;
13745 /* Check if the addresses are clobbered by load. */
13746 if (load)
13748 if (reg_mentioned_p (reg_1, mem_1)
13749 || reg_mentioned_p (reg_2, mem_2)
13750 || reg_mentioned_p (reg_3, mem_3))
13751 return false;
13753 /* In increasing order, the last load can clobber the address. */
13754 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
13755 return false;
13758 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
13759 rclass_1 = FP_REGS;
13760 else
13761 rclass_1 = GENERAL_REGS;
13763 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
13764 rclass_2 = FP_REGS;
13765 else
13766 rclass_2 = GENERAL_REGS;
13768 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
13769 rclass_3 = FP_REGS;
13770 else
13771 rclass_3 = GENERAL_REGS;
13773 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
13774 rclass_4 = FP_REGS;
13775 else
13776 rclass_4 = GENERAL_REGS;
13778 /* Check if the registers are of the same class. */
13779 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
13780 return false;
13782 return true;
13785 /* Given OPERANDS of consecutive load/store, this function pairs them
13786 into ldp/stp after adjusting the offset. It depends on the fact
13787 that addresses of load/store instructions are in increasing order.
13788 MODE is the mode of memory operands. CODE is the rtl operator
13789 which should be applied to all memory operands; it is SIGN_EXTEND,
13790 ZERO_EXTEND or UNKNOWN. */
13792 bool
13793 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
13794 enum machine_mode mode, RTX_CODE code)
13796 rtx base, offset, t1, t2;
13797 rtx mem_1, mem_2, mem_3, mem_4;
13798 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
13800 if (load)
13802 mem_1 = operands[1];
13803 mem_2 = operands[3];
13804 mem_3 = operands[5];
13805 mem_4 = operands[7];
13807 else
13809 mem_1 = operands[0];
13810 mem_2 = operands[2];
13811 mem_3 = operands[4];
13812 mem_4 = operands[6];
13813 gcc_assert (code == UNKNOWN);
13816 extract_base_offset_in_addr (mem_1, &base, &offset);
13817 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
13819 /* Adjust the offset so it can fit in an ldp/stp instruction. */
13820 msize = GET_MODE_SIZE (mode);
13821 stp_off_limit = msize * 0x40;
13822 off_val = INTVAL (offset);
13823 abs_off = (off_val < 0) ? -off_val : off_val;
13824 new_off = abs_off % stp_off_limit;
13825 adj_off = abs_off - new_off;
13827 /* Further adjust to make sure all offsets are OK. */
13828 if ((new_off + msize * 2) >= stp_off_limit)
13830 adj_off += stp_off_limit;
13831 new_off -= stp_off_limit;
13834 /* Make sure the adjustment can be done with ADD/SUB instructions. */
13835 if (adj_off >= 0x1000)
13836 return false;
13838 if (off_val < 0)
13840 adj_off = -adj_off;
13841 new_off = -new_off;
13844 /* Create new memory references. */
13845 mem_1 = change_address (mem_1, VOIDmode,
13846 plus_constant (DImode, operands[8], new_off));
13848 /* Check if the adjusted address is OK for ldp/stp. */
13849 if (!aarch64_mem_pair_operand (mem_1, mode))
13850 return false;
13852 msize = GET_MODE_SIZE (mode);
13853 mem_2 = change_address (mem_2, VOIDmode,
13854 plus_constant (DImode,
13855 operands[8],
13856 new_off + msize));
13857 mem_3 = change_address (mem_3, VOIDmode,
13858 plus_constant (DImode,
13859 operands[8],
13860 new_off + msize * 2));
13861 mem_4 = change_address (mem_4, VOIDmode,
13862 plus_constant (DImode,
13863 operands[8],
13864 new_off + msize * 3));
13866 if (code == ZERO_EXTEND)
13868 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
13869 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
13870 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
13871 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
13873 else if (code == SIGN_EXTEND)
13875 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
13876 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
13877 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
13878 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
13881 if (load)
13883 operands[1] = mem_1;
13884 operands[3] = mem_2;
13885 operands[5] = mem_3;
13886 operands[7] = mem_4;
13888 else
13890 operands[0] = mem_1;
13891 operands[2] = mem_2;
13892 operands[4] = mem_3;
13893 operands[6] = mem_4;
13896 /* Emit adjusting instruction. */
13897 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
13898 /* Emit ldp/stp instructions. */
13899 t1 = gen_rtx_SET (operands[0], operands[1]);
13900 t2 = gen_rtx_SET (operands[2], operands[3]);
13901 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
13902 t1 = gen_rtx_SET (operands[4], operands[5]);
13903 t2 = gen_rtx_SET (operands[6], operands[7]);
13904 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
13905 return true;
13908 /* Return true if a pseudo register should be created and used to hold
13909 the GOT address for PIC code. */
13911 bool
13912 aarch64_use_pseudo_pic_reg (void)
13914 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
13917 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
13919 static int
13920 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
13922 switch (XINT (x, 1))
13924 case UNSPEC_GOTSMALLPIC:
13925 case UNSPEC_GOTSMALLPIC28K:
13926 case UNSPEC_GOTTINYPIC:
13927 return 0;
13928 default:
13929 break;
13932 return default_unspec_may_trap_p (x, flags);
13936 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
13937 return the log2 of that value. Otherwise return -1. */
13939 int
13940 aarch64_fpconst_pow_of_2 (rtx x)
13941 {
13942 const REAL_VALUE_TYPE *r;
13944 if (!CONST_DOUBLE_P (x))
13945 return -1;
13947 r = CONST_DOUBLE_REAL_VALUE (x);
13949 if (REAL_VALUE_NEGATIVE (*r)
13950 || REAL_VALUE_ISNAN (*r)
13951 || REAL_VALUE_ISINF (*r)
13952 || !real_isinteger (r, DFmode))
13953 return -1;
13955 return exact_log2 (real_to_integer (r));
13956 }
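/* Example results for aarch64_fpconst_pow_of_2 (illustrative, not
   exhaustive): a CONST_DOUBLE holding 8.0 gives 3 and 1.0 gives 0, while
   6.0, -4.0 and 0.5 all give -1, since they are not positive integral
   powers of 2. */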
13958 /* If X is a vector of equal CONST_DOUBLE values and that value is
13959 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
13961 int
13962 aarch64_vec_fpconst_pow_of_2 (rtx x)
13963 {
13964 if (GET_CODE (x) != CONST_VECTOR)
13965 return -1;
13967 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
13968 return -1;
13970 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
13971 if (firstval <= 0)
13972 return -1;
13974 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
13975 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
13976 return -1;
13978 return firstval;
13979 }
13981 /* Implement TARGET_PROMOTED_TYPE to promote __fp16 to float. */
13982 static tree
13983 aarch64_promoted_type (const_tree t)
13984 {
13985 if (SCALAR_FLOAT_TYPE_P (t) && TYPE_PRECISION (t) == 16)
13986 return float_type_node;
13987 return NULL_TREE;
13988 }
13990 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
13992 static bool
13993 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
13994 optimization_type opt_type)
13995 {
13996 switch (op)
13997 {
13998 case rsqrt_optab:
13999 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
14001 default:
14002 return true;
14003 }
14004 }
14006 #undef TARGET_ADDRESS_COST
14007 #define TARGET_ADDRESS_COST aarch64_address_cost
14009 /* This hook determines whether unnamed bitfields affect the alignment
14010 of the containing structure. The hook returns true if the structure
14011 should inherit the alignment requirements of an unnamed bitfield's
14012 type. */
14013 #undef TARGET_ALIGN_ANON_BITFIELD
14014 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
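/* Illustrative consequence of returning true here (type and layout chosen
   only as an example): in
   struct s { char c; long long : 1; };
   the unnamed bit-field contributes the alignment of long long, so the
   structure is expected to be 8-byte aligned under the AAPCS64 rather than
   having the alignment of char alone. */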
14016 #undef TARGET_ASM_ALIGNED_DI_OP
14017 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
14019 #undef TARGET_ASM_ALIGNED_HI_OP
14020 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
14022 #undef TARGET_ASM_ALIGNED_SI_OP
14023 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
14025 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
14026 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
14027 hook_bool_const_tree_hwi_hwi_const_tree_true
14029 #undef TARGET_ASM_FILE_START
14030 #define TARGET_ASM_FILE_START aarch64_start_file
14032 #undef TARGET_ASM_OUTPUT_MI_THUNK
14033 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
14035 #undef TARGET_ASM_SELECT_RTX_SECTION
14036 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
14038 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
14039 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
14041 #undef TARGET_BUILD_BUILTIN_VA_LIST
14042 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
14044 #undef TARGET_CALLEE_COPIES
14045 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
14047 #undef TARGET_CAN_ELIMINATE
14048 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
14050 #undef TARGET_CAN_INLINE_P
14051 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
14053 #undef TARGET_CANNOT_FORCE_CONST_MEM
14054 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
14056 #undef TARGET_CASE_VALUES_THRESHOLD
14057 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
14059 #undef TARGET_CONDITIONAL_REGISTER_USAGE
14060 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
14062 /* Only the least significant bit is used for initialization guard
14063 variables. */
14064 #undef TARGET_CXX_GUARD_MASK_BIT
14065 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
14067 #undef TARGET_C_MODE_FOR_SUFFIX
14068 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
14070 #ifdef TARGET_BIG_ENDIAN_DEFAULT
14071 #undef TARGET_DEFAULT_TARGET_FLAGS
14072 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
14073 #endif
14075 #undef TARGET_CLASS_MAX_NREGS
14076 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
14078 #undef TARGET_BUILTIN_DECL
14079 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
14081 #undef TARGET_BUILTIN_RECIPROCAL
14082 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
14084 #undef TARGET_EXPAND_BUILTIN
14085 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
14087 #undef TARGET_EXPAND_BUILTIN_VA_START
14088 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
14090 #undef TARGET_FOLD_BUILTIN
14091 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
14093 #undef TARGET_FUNCTION_ARG
14094 #define TARGET_FUNCTION_ARG aarch64_function_arg
14096 #undef TARGET_FUNCTION_ARG_ADVANCE
14097 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
14099 #undef TARGET_FUNCTION_ARG_BOUNDARY
14100 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
14102 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
14103 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
14105 #undef TARGET_FUNCTION_VALUE
14106 #define TARGET_FUNCTION_VALUE aarch64_function_value
14108 #undef TARGET_FUNCTION_VALUE_REGNO_P
14109 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
14111 #undef TARGET_FRAME_POINTER_REQUIRED
14112 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
14114 #undef TARGET_GIMPLE_FOLD_BUILTIN
14115 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
14117 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
14118 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
14120 #undef TARGET_INIT_BUILTINS
14121 #define TARGET_INIT_BUILTINS aarch64_init_builtins
14123 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
14124 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
14125 aarch64_ira_change_pseudo_allocno_class
14127 #undef TARGET_LEGITIMATE_ADDRESS_P
14128 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
14130 #undef TARGET_LEGITIMATE_CONSTANT_P
14131 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
14133 #undef TARGET_LIBGCC_CMP_RETURN_MODE
14134 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
14136 #undef TARGET_LRA_P
14137 #define TARGET_LRA_P hook_bool_void_true
14139 #undef TARGET_MANGLE_TYPE
14140 #define TARGET_MANGLE_TYPE aarch64_mangle_type
14142 #undef TARGET_MEMORY_MOVE_COST
14143 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
14145 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
14146 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
14148 #undef TARGET_MUST_PASS_IN_STACK
14149 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
14151 /* This target hook should return true if accesses to volatile bitfields
14152 should use the narrowest mode possible. It should return false if these
14153 accesses should use the bitfield container type. */
14154 #undef TARGET_NARROW_VOLATILE_BITFIELD
14155 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
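/* Illustrative consequence of returning false here: a volatile
   unsigned-int bit-field is accessed through its 32-bit container, so
   reading a 3-bit field is expected to use a full 32-bit load rather than
   the narrowest byte or halfword access that would cover it. */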
14157 #undef TARGET_OPTION_OVERRIDE
14158 #define TARGET_OPTION_OVERRIDE aarch64_override_options
14160 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
14161 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
14162 aarch64_override_options_after_change
14164 #undef TARGET_OPTION_SAVE
14165 #define TARGET_OPTION_SAVE aarch64_option_save
14167 #undef TARGET_OPTION_RESTORE
14168 #define TARGET_OPTION_RESTORE aarch64_option_restore
14170 #undef TARGET_OPTION_PRINT
14171 #define TARGET_OPTION_PRINT aarch64_option_print
14173 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
14174 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
14176 #undef TARGET_SET_CURRENT_FUNCTION
14177 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
14179 #undef TARGET_PASS_BY_REFERENCE
14180 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
14182 #undef TARGET_PREFERRED_RELOAD_CLASS
14183 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
14185 #undef TARGET_SCHED_REASSOCIATION_WIDTH
14186 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
14188 #undef TARGET_PROMOTED_TYPE
14189 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
14191 #undef TARGET_SECONDARY_RELOAD
14192 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
14194 #undef TARGET_SHIFT_TRUNCATION_MASK
14195 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
14197 #undef TARGET_SETUP_INCOMING_VARARGS
14198 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
14200 #undef TARGET_STRUCT_VALUE_RTX
14201 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
14203 #undef TARGET_REGISTER_MOVE_COST
14204 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
14206 #undef TARGET_RETURN_IN_MEMORY
14207 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
14209 #undef TARGET_RETURN_IN_MSB
14210 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
14212 #undef TARGET_RTX_COSTS
14213 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
14215 #undef TARGET_SCHED_ISSUE_RATE
14216 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
14218 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
14219 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
14220 aarch64_sched_first_cycle_multipass_dfa_lookahead
14222 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
14223 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
14224 aarch64_first_cycle_multipass_dfa_lookahead_guard
14226 #undef TARGET_TRAMPOLINE_INIT
14227 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
14229 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
14230 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
14232 #undef TARGET_VECTOR_MODE_SUPPORTED_P
14233 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
14235 #undef TARGET_ARRAY_MODE_SUPPORTED_P
14236 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
14238 #undef TARGET_VECTORIZE_ADD_STMT_COST
14239 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
14241 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
14242 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
14243 aarch64_builtin_vectorization_cost
14245 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
14246 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
14248 #undef TARGET_VECTORIZE_BUILTINS
14249 #define TARGET_VECTORIZE_BUILTINS
14251 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
14252 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
14253 aarch64_builtin_vectorized_function
14255 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
14256 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
14257 aarch64_autovectorize_vector_sizes
14259 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
14260 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
14261 aarch64_atomic_assign_expand_fenv
14263 /* Section anchor support. */
14265 #undef TARGET_MIN_ANCHOR_OFFSET
14266 #define TARGET_MIN_ANCHOR_OFFSET -256
14268 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
14269 byte offset; we can do much more for larger data types, but have no way
14270 to determine the size of the access. We assume accesses are aligned. */
14271 #undef TARGET_MAX_ANCHOR_OFFSET
14272 #define TARGET_MAX_ANCHOR_OFFSET 4095
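/* These bounds track the AArch64 addressing modes (offered here as the
   assumed rationale rather than stated in the original comments): unscaled
   LDUR/STUR accept signed 9-bit offsets of -256..255, matching
   TARGET_MIN_ANCHOR_OFFSET above, while byte loads such as LDRB accept
   unsigned 12-bit offsets of 0..4095, matching the 4k-1 limit. */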
14274 #undef TARGET_VECTOR_ALIGNMENT
14275 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
14277 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
14278 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
14279 aarch64_simd_vector_alignment_reachable
14281 /* vec_perm support. */
14283 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
14284 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
14285 aarch64_vectorize_vec_perm_const_ok
14287 #undef TARGET_INIT_LIBFUNCS
14288 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
14290 #undef TARGET_FIXED_CONDITION_CODE_REGS
14291 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
14293 #undef TARGET_FLAGS_REGNUM
14294 #define TARGET_FLAGS_REGNUM CC_REGNUM
14296 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
14297 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
14299 #undef TARGET_ASAN_SHADOW_OFFSET
14300 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
14302 #undef TARGET_LEGITIMIZE_ADDRESS
14303 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
14305 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
14306 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
14307 aarch64_use_by_pieces_infrastructure_p
14309 #undef TARGET_CAN_USE_DOLOOP_P
14310 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
14312 #undef TARGET_SCHED_MACRO_FUSION_P
14313 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
14315 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
14316 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
14318 #undef TARGET_SCHED_FUSION_PRIORITY
14319 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
14321 #undef TARGET_UNSPEC_MAY_TRAP_P
14322 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
14324 #undef TARGET_USE_PSEUDO_PIC_REG
14325 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
14327 #undef TARGET_PRINT_OPERAND
14328 #define TARGET_PRINT_OPERAND aarch64_print_operand
14330 #undef TARGET_PRINT_OPERAND_ADDRESS
14331 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
14333 #undef TARGET_OPTAB_SUPPORTED_P
14334 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
14336 #undef TARGET_OMIT_STRUCT_RETURN_REG
14337 #define TARGET_OMIT_STRUCT_RETURN_REG true
14339 struct gcc_target targetm = TARGET_INITIALIZER;
14341 #include "gt-aarch64.h"