1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2017 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_STRING
24 #include "coretypes.h"
35 #include "stringpool.h"
40 #include "diagnostic.h"
41 #include "insn-attr.h"
43 #include "fold-const.h"
44 #include "stor-layout.h"
52 #include "langhooks.h"
57 #include "gimple-iterator.h"
58 #include "tree-vectorizer.h"
59 #include "aarch64-cost-tables.h"
63 #include "tm-constrs.h"
64 #include "sched-int.h"
65 #include "target-globals.h"
66 #include "common/common-target.h"
68 #include "selftest-rtl.h"
70 /* This file should be included last. */
71 #include "target-def.h"
73 /* Defined for convenience: size of a pointer in bytes
   (POINTER_SIZE is measured in bits).  */
74 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
76 /* Classifies an address.
79 A simple base register plus immediate offset.
82 A base register indexed by immediate offset with writeback.
85 A base register indexed by (optionally scaled) register.
88 A base register indexed by (optionally scaled) zero-extended register.
91 A base register indexed by (optionally scaled) sign-extended register.
94 A LO_SUM rtx with a base register and "LO12" symbol relocation.
97 A constant symbolic address, in pc-relative literal pool. */
99 enum aarch64_address_type
{
109 struct aarch64_address_info
{
110 enum aarch64_address_type type
;
114 enum aarch64_symbol_type symbol_type
;
117 struct simd_immediate_info
126 /* The current code model. */
127 enum aarch64_code_model aarch64_cmodel
;
/* The AArch64 target supports thread-local storage; the TLS code
   generation paths below (e.g. __tls_get_addr) rely on this.  */
130 #undef TARGET_HAVE_TLS
131 #define TARGET_HAVE_TLS 1
134 static bool aarch64_composite_type_p (const_tree
, machine_mode
);
135 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode
,
137 machine_mode
*, int *,
139 static void aarch64_elf_asm_constructor (rtx
, int) ATTRIBUTE_UNUSED
;
140 static void aarch64_elf_asm_destructor (rtx
, int) ATTRIBUTE_UNUSED
;
141 static void aarch64_override_options_after_change (void);
142 static bool aarch64_vector_mode_supported_p (machine_mode
);
143 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode
,
144 const unsigned char *sel
);
145 static int aarch64_address_cost (rtx
, machine_mode
, addr_space_t
, bool);
146 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode
,
151 aarch64_simd_container_mode (machine_mode mode
, unsigned width
);
153 /* Major revision number of the ARM Architecture implemented by the target. */
154 unsigned aarch64_architecture_version
;
156 /* The processor for which instructions should be scheduled. */
157 enum aarch64_processor aarch64_tune
= cortexa53
;
159 /* Mask to specify which instruction scheduling options should be used. */
160 unsigned long aarch64_tune_flags
= 0;
162 /* Global flag for PC relative loads. */
163 bool aarch64_pcrelative_literal_loads
;
165 /* Support for command line parsing of boolean flags in the tuning
167 struct aarch64_flag_desc
173 #define AARCH64_FUSION_PAIR(name, internal_name) \
174 { name, AARCH64_FUSE_##internal_name },
175 static const struct aarch64_flag_desc aarch64_fusible_pairs
[] =
177 { "none", AARCH64_FUSE_NOTHING
},
178 #include "aarch64-fusion-pairs.def"
179 { "all", AARCH64_FUSE_ALL
},
180 { NULL
, AARCH64_FUSE_NOTHING
}
183 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
184 { name, AARCH64_EXTRA_TUNE_##internal_name },
185 static const struct aarch64_flag_desc aarch64_tuning_flags
[] =
187 { "none", AARCH64_EXTRA_TUNE_NONE
},
188 #include "aarch64-tuning-flags.def"
189 { "all", AARCH64_EXTRA_TUNE_ALL
},
190 { NULL
, AARCH64_EXTRA_TUNE_NONE
}
193 /* Tuning parameters. */
195 static const struct cpu_addrcost_table generic_addrcost_table
=
205 0, /* register_offset */
206 0, /* register_sextend */
207 0, /* register_zextend */
211 static const struct cpu_addrcost_table exynosm1_addrcost_table
=
221 1, /* register_offset */
222 1, /* register_sextend */
223 2, /* register_zextend */
227 static const struct cpu_addrcost_table xgene1_addrcost_table
=
237 0, /* register_offset */
238 1, /* register_sextend */
239 1, /* register_zextend */
243 static const struct cpu_addrcost_table thunderx2t99_addrcost_table
=
253 2, /* register_offset */
254 3, /* register_sextend */
255 3, /* register_zextend */
259 static const struct cpu_regmove_cost generic_regmove_cost
=
262 /* Avoid the use of slow int<->fp moves for spilling by setting
263 their cost higher than memmov_cost. */
269 static const struct cpu_regmove_cost cortexa57_regmove_cost
=
272 /* Avoid the use of slow int<->fp moves for spilling by setting
273 their cost higher than memmov_cost. */
279 static const struct cpu_regmove_cost cortexa53_regmove_cost
=
282 /* Avoid the use of slow int<->fp moves for spilling by setting
283 their cost higher than memmov_cost. */
289 static const struct cpu_regmove_cost exynosm1_regmove_cost
=
292 /* Avoid the use of slow int<->fp moves for spilling by setting
293 their cost higher than memmov_cost (actual, 4 and 9). */
299 static const struct cpu_regmove_cost thunderx_regmove_cost
=
307 static const struct cpu_regmove_cost xgene1_regmove_cost
=
310 /* Avoid the use of slow int<->fp moves for spilling by setting
311 their cost higher than memmov_cost. */
317 static const struct cpu_regmove_cost qdf24xx_regmove_cost
=
320 /* Avoid the use of int<->fp moves for spilling. */
326 static const struct cpu_regmove_cost thunderx2t99_regmove_cost
=
329 /* Avoid the use of int<->fp moves for spilling. */
335 /* Generic costs for vector insn classes. */
336 static const struct cpu_vector_cost generic_vector_cost
=
338 1, /* scalar_int_stmt_cost */
339 1, /* scalar_fp_stmt_cost */
340 1, /* scalar_load_cost */
341 1, /* scalar_store_cost */
342 1, /* vec_int_stmt_cost */
343 1, /* vec_fp_stmt_cost */
344 2, /* vec_permute_cost */
345 1, /* vec_to_scalar_cost */
346 1, /* scalar_to_vec_cost */
347 1, /* vec_align_load_cost */
348 1, /* vec_unalign_load_cost */
349 1, /* vec_unalign_store_cost */
350 1, /* vec_store_cost */
351 3, /* cond_taken_branch_cost */
352 1 /* cond_not_taken_branch_cost */
355 /* ThunderX costs for vector insn classes. */
356 static const struct cpu_vector_cost thunderx_vector_cost
=
358 1, /* scalar_int_stmt_cost */
359 1, /* scalar_fp_stmt_cost */
360 3, /* scalar_load_cost */
361 1, /* scalar_store_cost */
362 4, /* vec_int_stmt_cost */
363 1, /* vec_fp_stmt_cost */
364 4, /* vec_permute_cost */
365 2, /* vec_to_scalar_cost */
366 2, /* scalar_to_vec_cost */
367 3, /* vec_align_load_cost */
368 5, /* vec_unalign_load_cost */
369 5, /* vec_unalign_store_cost */
370 1, /* vec_store_cost */
371 3, /* cond_taken_branch_cost */
372 3 /* cond_not_taken_branch_cost */
375 /* Generic costs for vector insn classes. */
376 static const struct cpu_vector_cost cortexa57_vector_cost
=
378 1, /* scalar_int_stmt_cost */
379 1, /* scalar_fp_stmt_cost */
380 4, /* scalar_load_cost */
381 1, /* scalar_store_cost */
382 2, /* vec_int_stmt_cost */
383 2, /* vec_fp_stmt_cost */
384 3, /* vec_permute_cost */
385 8, /* vec_to_scalar_cost */
386 8, /* scalar_to_vec_cost */
387 4, /* vec_align_load_cost */
388 4, /* vec_unalign_load_cost */
389 1, /* vec_unalign_store_cost */
390 1, /* vec_store_cost */
391 1, /* cond_taken_branch_cost */
392 1 /* cond_not_taken_branch_cost */
395 static const struct cpu_vector_cost exynosm1_vector_cost
=
397 1, /* scalar_int_stmt_cost */
398 1, /* scalar_fp_stmt_cost */
399 5, /* scalar_load_cost */
400 1, /* scalar_store_cost */
401 3, /* vec_int_stmt_cost */
402 3, /* vec_fp_stmt_cost */
403 3, /* vec_permute_cost */
404 3, /* vec_to_scalar_cost */
405 3, /* scalar_to_vec_cost */
406 5, /* vec_align_load_cost */
407 5, /* vec_unalign_load_cost */
408 1, /* vec_unalign_store_cost */
409 1, /* vec_store_cost */
410 1, /* cond_taken_branch_cost */
411 1 /* cond_not_taken_branch_cost */
414 /* Generic costs for vector insn classes. */
415 static const struct cpu_vector_cost xgene1_vector_cost
=
417 1, /* scalar_int_stmt_cost */
418 1, /* scalar_fp_stmt_cost */
419 5, /* scalar_load_cost */
420 1, /* scalar_store_cost */
421 2, /* vec_int_stmt_cost */
422 2, /* vec_fp_stmt_cost */
423 2, /* vec_permute_cost */
424 4, /* vec_to_scalar_cost */
425 4, /* scalar_to_vec_cost */
426 10, /* vec_align_load_cost */
427 10, /* vec_unalign_load_cost */
428 2, /* vec_unalign_store_cost */
429 2, /* vec_store_cost */
430 2, /* cond_taken_branch_cost */
431 1 /* cond_not_taken_branch_cost */
434 /* Costs for vector insn classes for Vulcan. */
435 static const struct cpu_vector_cost thunderx2t99_vector_cost
=
437 1, /* scalar_int_stmt_cost */
438 6, /* scalar_fp_stmt_cost */
439 4, /* scalar_load_cost */
440 1, /* scalar_store_cost */
441 5, /* vec_int_stmt_cost */
442 6, /* vec_fp_stmt_cost */
443 3, /* vec_permute_cost */
444 6, /* vec_to_scalar_cost */
445 5, /* scalar_to_vec_cost */
446 8, /* vec_align_load_cost */
447 8, /* vec_unalign_load_cost */
448 4, /* vec_unalign_store_cost */
449 4, /* vec_store_cost */
450 2, /* cond_taken_branch_cost */
451 1 /* cond_not_taken_branch_cost */
454 /* Generic costs for branch instructions. */
455 static const struct cpu_branch_cost generic_branch_cost
=
457 1, /* Predictable. */
458 3 /* Unpredictable. */
461 /* Generic approximation modes. */
462 static const cpu_approx_modes generic_approx_modes
=
464 AARCH64_APPROX_NONE
, /* division */
465 AARCH64_APPROX_NONE
, /* sqrt */
466 AARCH64_APPROX_NONE
/* recip_sqrt */
469 /* Approximation modes for Exynos M1. */
470 static const cpu_approx_modes exynosm1_approx_modes
=
472 AARCH64_APPROX_NONE
, /* division */
473 AARCH64_APPROX_ALL
, /* sqrt */
474 AARCH64_APPROX_ALL
/* recip_sqrt */
477 /* Approximation modes for X-Gene 1. */
478 static const cpu_approx_modes xgene1_approx_modes
=
480 AARCH64_APPROX_NONE
, /* division */
481 AARCH64_APPROX_NONE
, /* sqrt */
482 AARCH64_APPROX_ALL
/* recip_sqrt */
485 /* Generic prefetch settings (which disable prefetch). */
486 static const cpu_prefetch_tune generic_prefetch_tune
=
489 -1, /* l1_cache_size */
490 -1, /* l1_cache_line_size */
491 -1, /* l2_cache_size */
492 -1 /* default_opt_level */
495 static const cpu_prefetch_tune exynosm1_prefetch_tune
=
498 -1, /* l1_cache_size */
499 64, /* l1_cache_line_size */
500 -1, /* l2_cache_size */
501 -1 /* default_opt_level */
504 static const cpu_prefetch_tune qdf24xx_prefetch_tune
=
507 32, /* l1_cache_size */
508 64, /* l1_cache_line_size */
509 1024, /* l2_cache_size */
510 3 /* default_opt_level */
513 static const cpu_prefetch_tune thunderxt88_prefetch_tune
=
516 32, /* l1_cache_size */
517 128, /* l1_cache_line_size */
518 16*1024, /* l2_cache_size */
519 3 /* default_opt_level */
522 static const cpu_prefetch_tune thunderx_prefetch_tune
=
525 32, /* l1_cache_size */
526 128, /* l1_cache_line_size */
527 -1, /* l2_cache_size */
528 -1 /* default_opt_level */
531 static const cpu_prefetch_tune thunderx2t99_prefetch_tune
=
534 32, /* l1_cache_size */
535 64, /* l1_cache_line_size */
536 256, /* l2_cache_size */
537 -1 /* default_opt_level */
540 static const struct tune_params generic_tunings
=
542 &cortexa57_extra_costs
,
543 &generic_addrcost_table
,
544 &generic_regmove_cost
,
545 &generic_vector_cost
,
546 &generic_branch_cost
,
547 &generic_approx_modes
,
550 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
551 8, /* function_align. */
554 2, /* int_reassoc_width. */
555 4, /* fp_reassoc_width. */
556 1, /* vec_reassoc_width. */
557 2, /* min_div_recip_mul_sf. */
558 2, /* min_div_recip_mul_df. */
559 0, /* max_case_values. */
560 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
561 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
562 &generic_prefetch_tune
565 static const struct tune_params cortexa35_tunings
=
567 &cortexa53_extra_costs
,
568 &generic_addrcost_table
,
569 &cortexa53_regmove_cost
,
570 &generic_vector_cost
,
571 &generic_branch_cost
,
572 &generic_approx_modes
,
575 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
576 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
577 16, /* function_align. */
580 2, /* int_reassoc_width. */
581 4, /* fp_reassoc_width. */
582 1, /* vec_reassoc_width. */
583 2, /* min_div_recip_mul_sf. */
584 2, /* min_div_recip_mul_df. */
585 0, /* max_case_values. */
586 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
587 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
588 &generic_prefetch_tune
591 static const struct tune_params cortexa53_tunings
=
593 &cortexa53_extra_costs
,
594 &generic_addrcost_table
,
595 &cortexa53_regmove_cost
,
596 &generic_vector_cost
,
597 &generic_branch_cost
,
598 &generic_approx_modes
,
601 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
602 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
603 16, /* function_align. */
606 2, /* int_reassoc_width. */
607 4, /* fp_reassoc_width. */
608 1, /* vec_reassoc_width. */
609 2, /* min_div_recip_mul_sf. */
610 2, /* min_div_recip_mul_df. */
611 0, /* max_case_values. */
612 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
613 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
614 &generic_prefetch_tune
617 static const struct tune_params cortexa57_tunings
=
619 &cortexa57_extra_costs
,
620 &generic_addrcost_table
,
621 &cortexa57_regmove_cost
,
622 &cortexa57_vector_cost
,
623 &generic_branch_cost
,
624 &generic_approx_modes
,
627 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
628 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
629 16, /* function_align. */
632 2, /* int_reassoc_width. */
633 4, /* fp_reassoc_width. */
634 1, /* vec_reassoc_width. */
635 2, /* min_div_recip_mul_sf. */
636 2, /* min_div_recip_mul_df. */
637 0, /* max_case_values. */
638 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
639 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
), /* tune_flags. */
640 &generic_prefetch_tune
643 static const struct tune_params cortexa72_tunings
=
645 &cortexa57_extra_costs
,
646 &generic_addrcost_table
,
647 &cortexa57_regmove_cost
,
648 &cortexa57_vector_cost
,
649 &generic_branch_cost
,
650 &generic_approx_modes
,
653 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
654 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
655 16, /* function_align. */
658 2, /* int_reassoc_width. */
659 4, /* fp_reassoc_width. */
660 1, /* vec_reassoc_width. */
661 2, /* min_div_recip_mul_sf. */
662 2, /* min_div_recip_mul_df. */
663 0, /* max_case_values. */
664 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
665 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
666 &generic_prefetch_tune
669 static const struct tune_params cortexa73_tunings
=
671 &cortexa57_extra_costs
,
672 &generic_addrcost_table
,
673 &cortexa57_regmove_cost
,
674 &cortexa57_vector_cost
,
675 &generic_branch_cost
,
676 &generic_approx_modes
,
677 4, /* memmov_cost. */
679 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
680 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
681 16, /* function_align. */
684 2, /* int_reassoc_width. */
685 4, /* fp_reassoc_width. */
686 1, /* vec_reassoc_width. */
687 2, /* min_div_recip_mul_sf. */
688 2, /* min_div_recip_mul_df. */
689 0, /* max_case_values. */
690 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
691 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
692 &generic_prefetch_tune
697 static const struct tune_params exynosm1_tunings
=
699 &exynosm1_extra_costs
,
700 &exynosm1_addrcost_table
,
701 &exynosm1_regmove_cost
,
702 &exynosm1_vector_cost
,
703 &generic_branch_cost
,
704 &exynosm1_approx_modes
,
707 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
708 4, /* function_align. */
711 2, /* int_reassoc_width. */
712 4, /* fp_reassoc_width. */
713 1, /* vec_reassoc_width. */
714 2, /* min_div_recip_mul_sf. */
715 2, /* min_div_recip_mul_df. */
716 48, /* max_case_values. */
717 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
718 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
719 &exynosm1_prefetch_tune
722 static const struct tune_params thunderxt88_tunings
=
724 &thunderx_extra_costs
,
725 &generic_addrcost_table
,
726 &thunderx_regmove_cost
,
727 &thunderx_vector_cost
,
728 &generic_branch_cost
,
729 &generic_approx_modes
,
732 AARCH64_FUSE_CMP_BRANCH
, /* fusible_ops */
733 8, /* function_align. */
736 2, /* int_reassoc_width. */
737 4, /* fp_reassoc_width. */
738 1, /* vec_reassoc_width. */
739 2, /* min_div_recip_mul_sf. */
740 2, /* min_div_recip_mul_df. */
741 0, /* max_case_values. */
742 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
743 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
), /* tune_flags. */
744 &thunderxt88_prefetch_tune
747 static const struct tune_params thunderx_tunings
=
749 &thunderx_extra_costs
,
750 &generic_addrcost_table
,
751 &thunderx_regmove_cost
,
752 &thunderx_vector_cost
,
753 &generic_branch_cost
,
754 &generic_approx_modes
,
757 AARCH64_FUSE_CMP_BRANCH
, /* fusible_ops */
758 8, /* function_align. */
761 2, /* int_reassoc_width. */
762 4, /* fp_reassoc_width. */
763 1, /* vec_reassoc_width. */
764 2, /* min_div_recip_mul_sf. */
765 2, /* min_div_recip_mul_df. */
766 0, /* max_case_values. */
767 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
768 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
769 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
), /* tune_flags. */
770 &thunderx_prefetch_tune
773 static const struct tune_params xgene1_tunings
=
776 &xgene1_addrcost_table
,
777 &xgene1_regmove_cost
,
779 &generic_branch_cost
,
780 &xgene1_approx_modes
,
783 AARCH64_FUSE_NOTHING
, /* fusible_ops */
784 16, /* function_align. */
786 16, /* loop_align. */
787 2, /* int_reassoc_width. */
788 4, /* fp_reassoc_width. */
789 1, /* vec_reassoc_width. */
790 2, /* min_div_recip_mul_sf. */
791 2, /* min_div_recip_mul_df. */
792 0, /* max_case_values. */
793 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
794 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
795 &generic_prefetch_tune
798 static const struct tune_params qdf24xx_tunings
=
800 &qdf24xx_extra_costs
,
801 &generic_addrcost_table
,
802 &qdf24xx_regmove_cost
,
803 &generic_vector_cost
,
804 &generic_branch_cost
,
805 &generic_approx_modes
,
808 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
809 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
810 16, /* function_align. */
812 16, /* loop_align. */
813 2, /* int_reassoc_width. */
814 4, /* fp_reassoc_width. */
815 1, /* vec_reassoc_width. */
816 2, /* min_div_recip_mul_sf. */
817 2, /* min_div_recip_mul_df. */
818 0, /* max_case_values. */
819 tune_params::AUTOPREFETCHER_STRONG
, /* autoprefetcher_model. */
820 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
821 &qdf24xx_prefetch_tune
824 static const struct tune_params thunderx2t99_tunings
=
826 &thunderx2t99_extra_costs
,
827 &thunderx2t99_addrcost_table
,
828 &thunderx2t99_regmove_cost
,
829 &thunderx2t99_vector_cost
,
830 &generic_branch_cost
,
831 &generic_approx_modes
,
832 4, /* memmov_cost. */
834 (AARCH64_FUSE_CMP_BRANCH
| AARCH64_FUSE_AES_AESMC
835 | AARCH64_FUSE_ALU_BRANCH
), /* fusible_ops */
836 16, /* function_align. */
838 16, /* loop_align. */
839 3, /* int_reassoc_width. */
840 2, /* fp_reassoc_width. */
841 2, /* vec_reassoc_width. */
842 2, /* min_div_recip_mul_sf. */
843 2, /* min_div_recip_mul_df. */
844 0, /* max_case_values. */
845 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
846 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
847 &thunderx2t99_prefetch_tune
850 /* Support for fine-grained override of the tuning structures. */
851 struct aarch64_tuning_override_function
854 void (*parse_override
)(const char*, struct tune_params
*);
857 static void aarch64_parse_fuse_string (const char*, struct tune_params
*);
858 static void aarch64_parse_tune_string (const char*, struct tune_params
*);
860 static const struct aarch64_tuning_override_function
861 aarch64_tuning_override_functions
[] =
863 { "fuse", aarch64_parse_fuse_string
},
864 { "tune", aarch64_parse_tune_string
},
868 /* A processor implementing AArch64. */
871 const char *const name
;
872 enum aarch64_processor ident
;
873 enum aarch64_processor sched_core
;
874 enum aarch64_arch arch
;
875 unsigned architecture_version
;
876 const unsigned long flags
;
877 const struct tune_params
*const tune
;
880 /* Architectures implementing AArch64. */
881 static const struct processor all_architectures
[] =
883 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
884 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
885 #include "aarch64-arches.def"
886 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
889 /* Processor cores implementing AArch64. */
890 static const struct processor all_cores
[] =
892 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
893 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
894 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
895 FLAGS, &COSTS##_tunings},
896 #include "aarch64-cores.def"
897 {"generic", generic
, cortexa53
, AARCH64_ARCH_8A
, 8,
898 AARCH64_FL_FOR_ARCH8
, &generic_tunings
},
899 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
903 /* Target specification. These are populated by the -march, -mtune, -mcpu
904 handling code or by target attributes. */
905 static const struct processor
*selected_arch
;
906 static const struct processor
*selected_cpu
;
907 static const struct processor
*selected_tune
;
909 /* The current tuning set. */
910 struct tune_params aarch64_tune_params
= generic_tunings
;
912 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
914 /* An ISA extension in the co-processor and main instruction set space. */
915 struct aarch64_option_extension
917 const char *const name
;
918 const unsigned long flags_on
;
919 const unsigned long flags_off
;
922 typedef enum aarch64_cond_code
924 AARCH64_EQ
= 0, AARCH64_NE
, AARCH64_CS
, AARCH64_CC
, AARCH64_MI
, AARCH64_PL
,
925 AARCH64_VS
, AARCH64_VC
, AARCH64_HI
, AARCH64_LS
, AARCH64_GE
, AARCH64_LT
,
926 AARCH64_GT
, AARCH64_LE
, AARCH64_AL
, AARCH64_NV
930 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
932 /* The condition codes of the processor, and the inverse function. */
933 static const char * const aarch64_condition_codes
[] =
935 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
936 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
939 /* Generate code to enable conditional branches in functions over 1 MiB. */
941 aarch64_gen_far_branch (rtx
* operands
, int pos_label
, const char * dest
,
942 const char * branch_format
)
944 rtx_code_label
* tmp_label
= gen_label_rtx ();
947 ASM_GENERATE_INTERNAL_LABEL (label_buf
, dest
,
948 CODE_LABEL_NUMBER (tmp_label
));
949 const char *label_ptr
= targetm
.strip_name_encoding (label_buf
);
950 rtx dest_label
= operands
[pos_label
];
951 operands
[pos_label
] = tmp_label
;
953 snprintf (buffer
, sizeof (buffer
), "%s%s", branch_format
, label_ptr
);
954 output_asm_insn (buffer
, operands
);
956 snprintf (buffer
, sizeof (buffer
), "b\t%%l%d\n%s:", pos_label
, label_ptr
);
957 operands
[pos_label
] = dest_label
;
958 output_asm_insn (buffer
, operands
);
963 aarch64_err_no_fpadvsimd (machine_mode mode
, const char *msg
)
965 const char *mc
= FLOAT_MODE_P (mode
) ? "floating-point" : "vector";
966 if (TARGET_GENERAL_REGS_ONLY
)
967 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc
, msg
);
969 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc
, msg
);
972 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
973 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
974 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
975 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
976 cost (in this case the best class is the lowest cost one). Using ALL_REGS
977 irrespectively of its cost results in bad allocations with many redundant
978 int<->FP moves which are expensive on various cores.
979 To avoid this we don't allow ALL_REGS as the allocno class, but force a
980 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
981 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
982 Otherwise set the allocno class depending on the mode.
983 The result of this is that it is no longer inefficient to have a higher
984 memory move cost than the register move cost.
988 aarch64_ira_change_pseudo_allocno_class (int regno
, reg_class_t allocno_class
,
989 reg_class_t best_class
)
993 if (allocno_class
!= ALL_REGS
)
994 return allocno_class
;
996 if (best_class
!= ALL_REGS
)
999 mode
= PSEUDO_REGNO_MODE (regno
);
1000 return FLOAT_MODE_P (mode
) || VECTOR_MODE_P (mode
) ? FP_REGS
: GENERAL_REGS
;
1004 aarch64_min_divisions_for_recip_mul (machine_mode mode
)
1006 if (GET_MODE_UNIT_SIZE (mode
) == 4)
1007 return aarch64_tune_params
.min_div_recip_mul_sf
;
1008 return aarch64_tune_params
.min_div_recip_mul_df
;
1012 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED
,
1015 if (VECTOR_MODE_P (mode
))
1016 return aarch64_tune_params
.vec_reassoc_width
;
1017 if (INTEGRAL_MODE_P (mode
))
1018 return aarch64_tune_params
.int_reassoc_width
;
1019 if (FLOAT_MODE_P (mode
))
1020 return aarch64_tune_params
.fp_reassoc_width
;
1024 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1026 aarch64_dbx_register_number (unsigned regno
)
1028 if (GP_REGNUM_P (regno
))
1029 return AARCH64_DWARF_R0
+ regno
- R0_REGNUM
;
1030 else if (regno
== SP_REGNUM
)
1031 return AARCH64_DWARF_SP
;
1032 else if (FP_REGNUM_P (regno
))
1033 return AARCH64_DWARF_V0
+ regno
- V0_REGNUM
;
1035 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1036 equivalent DWARF register. */
1037 return DWARF_FRAME_REGISTERS
;
1040 /* Return TRUE if MODE is any of the large INT modes. */
1042 aarch64_vect_struct_mode_p (machine_mode mode
)
1044 return mode
== OImode
|| mode
== CImode
|| mode
== XImode
;
1047 /* Return TRUE if MODE is any of the vector modes. */
1049 aarch64_vector_mode_p (machine_mode mode
)
1051 return aarch64_vector_mode_supported_p (mode
)
1052 || aarch64_vect_struct_mode_p (mode
);
1055 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1057 aarch64_array_mode_supported_p (machine_mode mode
,
1058 unsigned HOST_WIDE_INT nelems
)
1061 && (AARCH64_VALID_SIMD_QREG_MODE (mode
)
1062 || AARCH64_VALID_SIMD_DREG_MODE (mode
))
1063 && (nelems
>= 2 && nelems
<= 4))
1069 /* Implement HARD_REGNO_NREGS. */
1072 aarch64_hard_regno_nregs (unsigned regno
, machine_mode mode
)
1074 switch (aarch64_regno_regclass (regno
))
1078 return (GET_MODE_SIZE (mode
) + UNITS_PER_VREG
- 1) / UNITS_PER_VREG
;
1080 return (GET_MODE_SIZE (mode
) + UNITS_PER_WORD
- 1) / UNITS_PER_WORD
;
1085 /* Implement HARD_REGNO_MODE_OK. */
1088 aarch64_hard_regno_mode_ok (unsigned regno
, machine_mode mode
)
1090 if (GET_MODE_CLASS (mode
) == MODE_CC
)
1091 return regno
== CC_REGNUM
;
1093 if (regno
== SP_REGNUM
)
1094 /* The purpose of comparing with ptr_mode is to support the
1095 global register variable associated with the stack pointer
1096 register via the syntax of asm ("wsp") in ILP32. */
1097 return mode
== Pmode
|| mode
== ptr_mode
;
1099 if (regno
== FRAME_POINTER_REGNUM
|| regno
== ARG_POINTER_REGNUM
)
1100 return mode
== Pmode
;
1102 if (GP_REGNUM_P (regno
) && ! aarch64_vect_struct_mode_p (mode
))
1105 if (FP_REGNUM_P (regno
))
1107 if (aarch64_vect_struct_mode_p (mode
))
1109 (regno
+ aarch64_hard_regno_nregs (regno
, mode
) - 1) <= V31_REGNUM
;
1117 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1119 aarch64_hard_regno_caller_save_mode (unsigned regno
, unsigned nregs
,
1122 /* Handle modes that fit within single registers. */
1123 if (nregs
== 1 && GET_MODE_SIZE (mode
) <= 16)
1125 if (GET_MODE_SIZE (mode
) >= 4)
1130 /* Fall back to generic for multi-reg and very large modes. */
1132 return choose_hard_reg_mode (regno
, nregs
, false);
1135 /* Return true if calls to DECL should be treated as
1136 long-calls (ie called via a register). */
1138 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED
)
1143 /* Return true if calls to symbol-ref SYM should be treated as
1144 long-calls (ie called via a register). */
1146 aarch64_is_long_call_p (rtx sym
)
1148 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym
));
1151 /* Return true if calls to symbol-ref SYM should not go through
1155 aarch64_is_noplt_call_p (rtx sym
)
1157 const_tree decl
= SYMBOL_REF_DECL (sym
);
1162 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl
)))
1163 && !targetm
.binds_local_p (decl
))
1169 /* Return true if the offsets to a zero/sign-extract operation
1170 represent an expression that matches an extend operation. The
1171 operands represent the parameters from
1173 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1175 aarch64_is_extend_from_extract (machine_mode mode
, rtx mult_imm
,
1178 HOST_WIDE_INT mult_val
, extract_val
;
1180 if (! CONST_INT_P (mult_imm
) || ! CONST_INT_P (extract_imm
))
1183 mult_val
= INTVAL (mult_imm
);
1184 extract_val
= INTVAL (extract_imm
);
1187 && extract_val
< GET_MODE_BITSIZE (mode
)
1188 && exact_log2 (extract_val
& ~7) > 0
1189 && (extract_val
& 7) <= 4
1190 && mult_val
== (1 << (extract_val
& 7)))
1196 /* Emit an insn that's a simple single-set. Both the operands must be
1197 known to be valid. */
1198 inline static rtx_insn
*
1199 emit_set_insn (rtx x
, rtx y
)
1201 return emit_insn (gen_rtx_SET (x
, y
));
1204 /* X and Y are two things to compare using CODE. Emit the compare insn and
1205 return the rtx for register 0 in the proper mode. */
1207 aarch64_gen_compare_reg (RTX_CODE code
, rtx x
, rtx y
)
1209 machine_mode mode
= SELECT_CC_MODE (code
, x
, y
);
1210 rtx cc_reg
= gen_rtx_REG (mode
, CC_REGNUM
);
1212 emit_set_insn (cc_reg
, gen_rtx_COMPARE (mode
, x
, y
));
1216 /* Build the SYMBOL_REF for __tls_get_addr. */
1218 static GTY(()) rtx tls_get_addr_libfunc
;
1221 aarch64_tls_get_addr (void)
1223 if (!tls_get_addr_libfunc
)
1224 tls_get_addr_libfunc
= init_one_libfunc ("__tls_get_addr");
1225 return tls_get_addr_libfunc
;
1228 /* Return the TLS model to use for ADDR. */
1230 static enum tls_model
1231 tls_symbolic_operand_type (rtx addr
)
1233 enum tls_model tls_kind
= TLS_MODEL_NONE
;
1236 if (GET_CODE (addr
) == CONST
)
1238 split_const (addr
, &sym
, &addend
);
1239 if (GET_CODE (sym
) == SYMBOL_REF
)
1240 tls_kind
= SYMBOL_REF_TLS_MODEL (sym
);
1242 else if (GET_CODE (addr
) == SYMBOL_REF
)
1243 tls_kind
= SYMBOL_REF_TLS_MODEL (addr
);
1248 /* We'll allow lo_sum's in addresses in our legitimate addresses
1249 so that combine would take care of combining addresses where
1250 necessary, but for generation purposes, we'll generate the address
1253 tmp = hi (symbol_ref); adrp x1, foo
1254 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1258 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1259 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1263 Load TLS symbol, depending on TLS mechanism and TLS access model.
1265 Global Dynamic - Traditional TLS:
1266 adrp tmp, :tlsgd:imm
1267 add dest, tmp, #:tlsgd_lo12:imm
1270 Global Dynamic - TLS Descriptors:
1271 adrp dest, :tlsdesc:imm
1272 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1273 add dest, dest, #:tlsdesc_lo12:imm
1280 adrp tmp, :gottprel:imm
1281 ldr dest, [tmp, #:gottprel_lo12:imm]
1286 add t0, tp, #:tprel_hi12:imm, lsl #12
1287 add t0, t0, #:tprel_lo12_nc:imm
1291 aarch64_load_symref_appropriately (rtx dest
, rtx imm
,
1292 enum aarch64_symbol_type type
)
1296 case SYMBOL_SMALL_ABSOLUTE
:
1298 /* In ILP32, the mode of dest can be either SImode or DImode. */
1300 machine_mode mode
= GET_MODE (dest
);
1302 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
1304 if (can_create_pseudo_p ())
1305 tmp_reg
= gen_reg_rtx (mode
);
1307 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
1308 emit_insn (gen_add_losym (dest
, tmp_reg
, imm
));
1312 case SYMBOL_TINY_ABSOLUTE
:
1313 emit_insn (gen_rtx_SET (dest
, imm
));
1316 case SYMBOL_SMALL_GOT_28K
:
1318 machine_mode mode
= GET_MODE (dest
);
1319 rtx gp_rtx
= pic_offset_table_rtx
;
1323 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1324 here before rtl expand. Tree IVOPT will generate rtl pattern to
1325 decide rtx costs, in which case pic_offset_table_rtx is not
1326 initialized. For that case no need to generate the first adrp
1327 instruction as the final cost for global variable access is
1331 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1332 using the page base as GOT base, the first page may be wasted,
1333 in the worst scenario, there is only 28K space for GOT).
1335 The generate instruction sequence for accessing global variable
1338 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1340 Only one instruction needed. But we must initialize
1341 pic_offset_table_rtx properly. We generate initialize insn for
1342 every global access, and allow CSE to remove all redundant.
1344 The final instruction sequences will look like the following
1345 for multiply global variables access.
1347 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1349 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1350 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1351 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1354 rtx s
= gen_rtx_SYMBOL_REF (Pmode
, "_GLOBAL_OFFSET_TABLE_");
1355 crtl
->uses_pic_offset_table
= 1;
1356 emit_move_insn (gp_rtx
, gen_rtx_HIGH (Pmode
, s
));
1358 if (mode
!= GET_MODE (gp_rtx
))
1359 gp_rtx
= gen_lowpart (mode
, gp_rtx
);
1363 if (mode
== ptr_mode
)
1366 insn
= gen_ldr_got_small_28k_di (dest
, gp_rtx
, imm
);
1368 insn
= gen_ldr_got_small_28k_si (dest
, gp_rtx
, imm
);
1370 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
1374 gcc_assert (mode
== Pmode
);
1376 insn
= gen_ldr_got_small_28k_sidi (dest
, gp_rtx
, imm
);
1377 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
1380 /* The operand is expected to be MEM. Whenever the related insn
1381 pattern changed, above code which calculate mem should be
1383 gcc_assert (GET_CODE (mem
) == MEM
);
1384 MEM_READONLY_P (mem
) = 1;
1385 MEM_NOTRAP_P (mem
) = 1;
1390 case SYMBOL_SMALL_GOT_4G
:
1392 /* In ILP32, the mode of dest can be either SImode or DImode,
1393 while the got entry is always of SImode size. The mode of
1394 dest depends on how dest is used: if dest is assigned to a
1395 pointer (e.g. in the memory), it has SImode; it may have
1396 DImode if dest is dereferenced to access the memeory.
1397 This is why we have to handle three different ldr_got_small
1398 patterns here (two patterns for ILP32). */
1403 machine_mode mode
= GET_MODE (dest
);
1405 if (can_create_pseudo_p ())
1406 tmp_reg
= gen_reg_rtx (mode
);
1408 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
1409 if (mode
== ptr_mode
)
1412 insn
= gen_ldr_got_small_di (dest
, tmp_reg
, imm
);
1414 insn
= gen_ldr_got_small_si (dest
, tmp_reg
, imm
);
1416 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
1420 gcc_assert (mode
== Pmode
);
1422 insn
= gen_ldr_got_small_sidi (dest
, tmp_reg
, imm
);
1423 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
1426 gcc_assert (GET_CODE (mem
) == MEM
);
1427 MEM_READONLY_P (mem
) = 1;
1428 MEM_NOTRAP_P (mem
) = 1;
1433 case SYMBOL_SMALL_TLSGD
:
1436 machine_mode mode
= GET_MODE (dest
);
1437 rtx result
= gen_rtx_REG (mode
, R0_REGNUM
);
1441 aarch64_emit_call_insn (gen_tlsgd_small_si (result
, imm
));
1443 aarch64_emit_call_insn (gen_tlsgd_small_di (result
, imm
));
1444 insns
= get_insns ();
1447 RTL_CONST_CALL_P (insns
) = 1;
1448 emit_libcall_block (insns
, dest
, result
, imm
);
1452 case SYMBOL_SMALL_TLSDESC
:
1454 machine_mode mode
= GET_MODE (dest
);
1455 rtx x0
= gen_rtx_REG (mode
, R0_REGNUM
);
1458 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
1460 /* In ILP32, the got entry is always of SImode size. Unlike
1461 small GOT, the dest is fixed at reg 0. */
1463 emit_insn (gen_tlsdesc_small_si (imm
));
1465 emit_insn (gen_tlsdesc_small_di (imm
));
1466 tp
= aarch64_load_tp (NULL
);
1469 tp
= gen_lowpart (mode
, tp
);
1471 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, x0
)));
1472 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1476 case SYMBOL_SMALL_TLSIE
:
1478 /* In ILP32, the mode of dest can be either SImode or DImode,
1479 while the got entry is always of SImode size. The mode of
1480 dest depends on how dest is used: if dest is assigned to a
1481 pointer (e.g. in the memory), it has SImode; it may have
1482 DImode if dest is dereferenced to access the memeory.
1483 This is why we have to handle three different tlsie_small
1484 patterns here (two patterns for ILP32). */
1485 machine_mode mode
= GET_MODE (dest
);
1486 rtx tmp_reg
= gen_reg_rtx (mode
);
1487 rtx tp
= aarch64_load_tp (NULL
);
1489 if (mode
== ptr_mode
)
1492 emit_insn (gen_tlsie_small_di (tmp_reg
, imm
));
1495 emit_insn (gen_tlsie_small_si (tmp_reg
, imm
));
1496 tp
= gen_lowpart (mode
, tp
);
1501 gcc_assert (mode
== Pmode
);
1502 emit_insn (gen_tlsie_small_sidi (tmp_reg
, imm
));
1505 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, tmp_reg
)));
1506 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1510 case SYMBOL_TLSLE12
:
1511 case SYMBOL_TLSLE24
:
1512 case SYMBOL_TLSLE32
:
1513 case SYMBOL_TLSLE48
:
1515 machine_mode mode
= GET_MODE (dest
);
1516 rtx tp
= aarch64_load_tp (NULL
);
1519 tp
= gen_lowpart (mode
, tp
);
1523 case SYMBOL_TLSLE12
:
1524 emit_insn ((mode
== DImode
? gen_tlsle12_di
: gen_tlsle12_si
)
1527 case SYMBOL_TLSLE24
:
1528 emit_insn ((mode
== DImode
? gen_tlsle24_di
: gen_tlsle24_si
)
1531 case SYMBOL_TLSLE32
:
1532 emit_insn ((mode
== DImode
? gen_tlsle32_di
: gen_tlsle32_si
)
1534 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
1537 case SYMBOL_TLSLE48
:
1538 emit_insn ((mode
== DImode
? gen_tlsle48_di
: gen_tlsle48_si
)
1540 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
1547 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1551 case SYMBOL_TINY_GOT
:
1552 emit_insn (gen_ldr_got_tiny (dest
, imm
));
1555 case SYMBOL_TINY_TLSIE
:
1557 machine_mode mode
= GET_MODE (dest
);
1558 rtx tp
= aarch64_load_tp (NULL
);
1560 if (mode
== ptr_mode
)
1563 emit_insn (gen_tlsie_tiny_di (dest
, imm
, tp
));
1566 tp
= gen_lowpart (mode
, tp
);
1567 emit_insn (gen_tlsie_tiny_si (dest
, imm
, tp
));
1572 gcc_assert (mode
== Pmode
);
1573 emit_insn (gen_tlsie_tiny_sidi (dest
, imm
, tp
));
1576 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1585 /* Emit a move from SRC to DEST. Assume that the move expanders can
1586 handle all moves if !can_create_pseudo_p (). The distinction is
1587 important because, unlike emit_move_insn, the move expanders know
1588 how to force Pmode objects into the constant pool even when the
1589 constant pool address is not itself legitimate. */
1591 aarch64_emit_move (rtx dest
, rtx src
)
1593 return (can_create_pseudo_p ()
1594 ? emit_move_insn (dest
, src
)
1595 : emit_move_insn_1 (dest
, src
));
1598 /* Split a 128-bit move operation into two 64-bit move operations,
1599 taking care to handle partial overlap of register to register
1600 copies. Special cases are needed when moving between GP regs and
1601 FP regs. SRC can be a register, constant or memory; DST a register
1602 or memory. If either operand is memory it must not have any side
1605 aarch64_split_128bit_move (rtx dst
, rtx src
)
1610 machine_mode mode
= GET_MODE (dst
);
1612 gcc_assert (mode
== TImode
|| mode
== TFmode
);
1613 gcc_assert (!(side_effects_p (src
) || side_effects_p (dst
)));
1614 gcc_assert (mode
== GET_MODE (src
) || GET_MODE (src
) == VOIDmode
);
1616 if (REG_P (dst
) && REG_P (src
))
1618 int src_regno
= REGNO (src
);
1619 int dst_regno
= REGNO (dst
);
1621 /* Handle FP <-> GP regs. */
1622 if (FP_REGNUM_P (dst_regno
) && GP_REGNUM_P (src_regno
))
1624 src_lo
= gen_lowpart (word_mode
, src
);
1625 src_hi
= gen_highpart (word_mode
, src
);
1629 emit_insn (gen_aarch64_movtilow_di (dst
, src_lo
));
1630 emit_insn (gen_aarch64_movtihigh_di (dst
, src_hi
));
1634 emit_insn (gen_aarch64_movtflow_di (dst
, src_lo
));
1635 emit_insn (gen_aarch64_movtfhigh_di (dst
, src_hi
));
1639 else if (GP_REGNUM_P (dst_regno
) && FP_REGNUM_P (src_regno
))
1641 dst_lo
= gen_lowpart (word_mode
, dst
);
1642 dst_hi
= gen_highpart (word_mode
, dst
);
1646 emit_insn (gen_aarch64_movdi_tilow (dst_lo
, src
));
1647 emit_insn (gen_aarch64_movdi_tihigh (dst_hi
, src
));
1651 emit_insn (gen_aarch64_movdi_tflow (dst_lo
, src
));
1652 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi
, src
));
1658 dst_lo
= gen_lowpart (word_mode
, dst
);
1659 dst_hi
= gen_highpart (word_mode
, dst
);
1660 src_lo
= gen_lowpart (word_mode
, src
);
1661 src_hi
= gen_highpart_mode (word_mode
, mode
, src
);
1663 /* At most one pairing may overlap. */
1664 if (reg_overlap_mentioned_p (dst_lo
, src_hi
))
1666 aarch64_emit_move (dst_hi
, src_hi
);
1667 aarch64_emit_move (dst_lo
, src_lo
);
1671 aarch64_emit_move (dst_lo
, src_lo
);
1672 aarch64_emit_move (dst_hi
, src_hi
);
1677 aarch64_split_128bit_move_p (rtx dst
, rtx src
)
1679 return (! REG_P (src
)
1680 || ! (FP_REGNUM_P (REGNO (dst
)) && FP_REGNUM_P (REGNO (src
))));
1683 /* Split a complex SIMD combine. */
1686 aarch64_split_simd_combine (rtx dst
, rtx src1
, rtx src2
)
1688 machine_mode src_mode
= GET_MODE (src1
);
1689 machine_mode dst_mode
= GET_MODE (dst
);
1691 gcc_assert (VECTOR_MODE_P (dst_mode
));
1692 gcc_assert (register_operand (dst
, dst_mode
)
1693 && register_operand (src1
, src_mode
)
1694 && register_operand (src2
, src_mode
));
1696 rtx (*gen
) (rtx
, rtx
, rtx
);
1701 gen
= gen_aarch64_simd_combinev8qi
;
1704 gen
= gen_aarch64_simd_combinev4hi
;
1707 gen
= gen_aarch64_simd_combinev2si
;
1710 gen
= gen_aarch64_simd_combinev4hf
;
1713 gen
= gen_aarch64_simd_combinev2sf
;
1716 gen
= gen_aarch64_simd_combinedi
;
1719 gen
= gen_aarch64_simd_combinedf
;
1725 emit_insn (gen (dst
, src1
, src2
));
1729 /* Split a complex SIMD move. */
1732 aarch64_split_simd_move (rtx dst
, rtx src
)
1734 machine_mode src_mode
= GET_MODE (src
);
1735 machine_mode dst_mode
= GET_MODE (dst
);
1737 gcc_assert (VECTOR_MODE_P (dst_mode
));
1739 if (REG_P (dst
) && REG_P (src
))
1741 rtx (*gen
) (rtx
, rtx
);
1743 gcc_assert (VECTOR_MODE_P (src_mode
));
1748 gen
= gen_aarch64_split_simd_movv16qi
;
1751 gen
= gen_aarch64_split_simd_movv8hi
;
1754 gen
= gen_aarch64_split_simd_movv4si
;
1757 gen
= gen_aarch64_split_simd_movv2di
;
1760 gen
= gen_aarch64_split_simd_movv8hf
;
1763 gen
= gen_aarch64_split_simd_movv4sf
;
1766 gen
= gen_aarch64_split_simd_movv2df
;
1772 emit_insn (gen (dst
, src
));
1778 aarch64_zero_extend_const_eq (machine_mode xmode
, rtx x
,
1779 machine_mode ymode
, rtx y
)
1781 rtx r
= simplify_const_unary_operation (ZERO_EXTEND
, xmode
, y
, ymode
);
1782 gcc_assert (r
!= NULL
);
1783 return rtx_equal_p (x
, r
);
1788 aarch64_force_temporary (machine_mode mode
, rtx x
, rtx value
)
1790 if (can_create_pseudo_p ())
1791 return force_reg (mode
, value
);
1794 x
= aarch64_emit_move (x
, value
);
1801 aarch64_add_offset (machine_mode mode
, rtx temp
, rtx reg
, HOST_WIDE_INT offset
)
1803 if (!aarch64_plus_immediate (GEN_INT (offset
), mode
))
1806 /* Load the full offset into a register. This
1807 might be improvable in the future. */
1808 high
= GEN_INT (offset
);
1810 high
= aarch64_force_temporary (mode
, temp
, high
);
1811 reg
= aarch64_force_temporary (mode
, temp
,
1812 gen_rtx_PLUS (mode
, high
, reg
));
1814 return plus_constant (mode
, reg
, offset
);
1818 aarch64_internal_mov_immediate (rtx dest
, rtx imm
, bool generate
,
1822 unsigned HOST_WIDE_INT val
, val2
, mask
;
1823 int one_match
, zero_match
;
1828 if (aarch64_move_imm (val
, mode
))
1831 emit_insn (gen_rtx_SET (dest
, imm
));
1835 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
1836 (with XXXX non-zero). In that case check to see if the move can be done in
1838 val2
= val
& 0xffffffff;
1840 && aarch64_move_imm (val2
, SImode
)
1841 && (((val
>> 32) & 0xffff) == 0 || (val
>> 48) == 0))
1844 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
1846 /* Check if we have to emit a second instruction by checking to see
1847 if any of the upper 32 bits of the original DI mode value is set. */
1851 i
= (val
>> 48) ? 48 : 32;
1854 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
1855 GEN_INT ((val
>> i
) & 0xffff)));
1860 if ((val
>> 32) == 0 || mode
== SImode
)
1864 emit_insn (gen_rtx_SET (dest
, GEN_INT (val
& 0xffff)));
1866 emit_insn (gen_insv_immsi (dest
, GEN_INT (16),
1867 GEN_INT ((val
>> 16) & 0xffff)));
1869 emit_insn (gen_insv_immdi (dest
, GEN_INT (16),
1870 GEN_INT ((val
>> 16) & 0xffff)));
1875 /* Remaining cases are all for DImode. */
1878 zero_match
= ((val
& mask
) == 0) + ((val
& (mask
<< 16)) == 0) +
1879 ((val
& (mask
<< 32)) == 0) + ((val
& (mask
<< 48)) == 0);
1880 one_match
= ((~val
& mask
) == 0) + ((~val
& (mask
<< 16)) == 0) +
1881 ((~val
& (mask
<< 32)) == 0) + ((~val
& (mask
<< 48)) == 0);
1883 if (zero_match
!= 2 && one_match
!= 2)
1885 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1886 For a 64-bit bitmask try whether changing 16 bits to all ones or
1887 zeroes creates a valid bitmask. To check any repeated bitmask,
1888 try using 16 bits from the other 32-bit half of val. */
1890 for (i
= 0; i
< 64; i
+= 16, mask
<<= 16)
1893 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
1896 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
1898 val2
= val2
& ~mask
;
1899 val2
= val2
| (((val2
>> 32) | (val2
<< 32)) & mask
);
1900 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
1907 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
1908 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
1909 GEN_INT ((val
>> i
) & 0xffff)));
1915 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1916 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1917 otherwise skip zero bits. */
1921 val2
= one_match
> zero_match
? ~val
: val
;
1922 i
= (val2
& mask
) != 0 ? 0 : (val2
& (mask
<< 16)) != 0 ? 16 : 32;
1925 emit_insn (gen_rtx_SET (dest
, GEN_INT (one_match
> zero_match
1926 ? (val
| ~(mask
<< i
))
1927 : (val
& (mask
<< i
)))));
1928 for (i
+= 16; i
< 64; i
+= 16)
1930 if ((val2
& (mask
<< i
)) == 0)
1933 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
1934 GEN_INT ((val
>> i
) & 0xffff)));
1943 aarch64_expand_mov_immediate (rtx dest
, rtx imm
)
1945 machine_mode mode
= GET_MODE (dest
);
1947 gcc_assert (mode
== SImode
|| mode
== DImode
);
1949 /* Check on what type of symbol it is. */
1950 if (GET_CODE (imm
) == SYMBOL_REF
1951 || GET_CODE (imm
) == LABEL_REF
1952 || GET_CODE (imm
) == CONST
)
1954 rtx mem
, base
, offset
;
1955 enum aarch64_symbol_type sty
;
1957 /* If we have (const (plus symbol offset)), separate out the offset
1958 before we start classifying the symbol. */
1959 split_const (imm
, &base
, &offset
);
1961 sty
= aarch64_classify_symbol (base
, offset
);
1964 case SYMBOL_FORCE_TO_MEM
:
1965 if (offset
!= const0_rtx
1966 && targetm
.cannot_force_const_mem (mode
, imm
))
1968 gcc_assert (can_create_pseudo_p ());
1969 base
= aarch64_force_temporary (mode
, dest
, base
);
1970 base
= aarch64_add_offset (mode
, NULL
, base
, INTVAL (offset
));
1971 aarch64_emit_move (dest
, base
);
1975 mem
= force_const_mem (ptr_mode
, imm
);
1978 /* If we aren't generating PC relative literals, then
1979 we need to expand the literal pool access carefully.
1980 This is something that needs to be done in a number
1981 of places, so could well live as a separate function. */
1982 if (!aarch64_pcrelative_literal_loads
)
1984 gcc_assert (can_create_pseudo_p ());
1985 base
= gen_reg_rtx (ptr_mode
);
1986 aarch64_expand_mov_immediate (base
, XEXP (mem
, 0));
1987 if (ptr_mode
!= Pmode
)
1988 base
= convert_memory_address (Pmode
, base
);
1989 mem
= gen_rtx_MEM (ptr_mode
, base
);
1992 if (mode
!= ptr_mode
)
1993 mem
= gen_rtx_ZERO_EXTEND (mode
, mem
);
1995 emit_insn (gen_rtx_SET (dest
, mem
));
1999 case SYMBOL_SMALL_TLSGD
:
2000 case SYMBOL_SMALL_TLSDESC
:
2001 case SYMBOL_SMALL_TLSIE
:
2002 case SYMBOL_SMALL_GOT_28K
:
2003 case SYMBOL_SMALL_GOT_4G
:
2004 case SYMBOL_TINY_GOT
:
2005 case SYMBOL_TINY_TLSIE
:
2006 if (offset
!= const0_rtx
)
2008 gcc_assert(can_create_pseudo_p ());
2009 base
= aarch64_force_temporary (mode
, dest
, base
);
2010 base
= aarch64_add_offset (mode
, NULL
, base
, INTVAL (offset
));
2011 aarch64_emit_move (dest
, base
);
2016 case SYMBOL_SMALL_ABSOLUTE
:
2017 case SYMBOL_TINY_ABSOLUTE
:
2018 case SYMBOL_TLSLE12
:
2019 case SYMBOL_TLSLE24
:
2020 case SYMBOL_TLSLE32
:
2021 case SYMBOL_TLSLE48
:
2022 aarch64_load_symref_appropriately (dest
, imm
, sty
);
2030 if (!CONST_INT_P (imm
))
2032 if (GET_CODE (imm
) == HIGH
)
2033 emit_insn (gen_rtx_SET (dest
, imm
));
2036 rtx mem
= force_const_mem (mode
, imm
);
2038 emit_insn (gen_rtx_SET (dest
, mem
));
2044 aarch64_internal_mov_immediate (dest
, imm
, true, GET_MODE (dest
));
2047 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
2048 temporary value if necessary. FRAME_RELATED_P should be true if
2049 the RTX_FRAME_RELATED flag should be set and CFA adjustments added
2050 to the generated instructions. If SCRATCHREG is known to hold
2051 abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
2054 Since this function may be used to adjust the stack pointer, we must
2055 ensure that it cannot cause transient stack deallocation (for example
2056 by first incrementing SP and then decrementing when adjusting by a
2057 large immediate). */
2060 aarch64_add_constant_internal (machine_mode mode
, int regnum
, int scratchreg
,
2061 HOST_WIDE_INT delta
, bool frame_related_p
,
2064 HOST_WIDE_INT mdelta
= abs_hwi (delta
);
2065 rtx this_rtx
= gen_rtx_REG (mode
, regnum
);
2071 /* Single instruction adjustment. */
2072 if (aarch64_uimm12_shift (mdelta
))
2074 insn
= emit_insn (gen_add2_insn (this_rtx
, GEN_INT (delta
)));
2075 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2079 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2080 Only do this if mdelta is not a 16-bit move as adjusting using a move
2082 if (mdelta
< 0x1000000 && !aarch64_move_imm (mdelta
, mode
))
2084 HOST_WIDE_INT low_off
= mdelta
& 0xfff;
2086 low_off
= delta
< 0 ? -low_off
: low_off
;
2087 insn
= emit_insn (gen_add2_insn (this_rtx
, GEN_INT (low_off
)));
2088 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2089 insn
= emit_insn (gen_add2_insn (this_rtx
, GEN_INT (delta
- low_off
)));
2090 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2094 /* Emit a move immediate if required and an addition/subtraction. */
2095 rtx scratch_rtx
= gen_rtx_REG (mode
, scratchreg
);
2097 aarch64_internal_mov_immediate (scratch_rtx
, GEN_INT (mdelta
), true, mode
);
2098 insn
= emit_insn (delta
< 0 ? gen_sub2_insn (this_rtx
, scratch_rtx
)
2099 : gen_add2_insn (this_rtx
, scratch_rtx
));
2100 if (frame_related_p
)
2102 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2103 rtx adj
= plus_constant (mode
, this_rtx
, delta
);
2104 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, gen_rtx_SET (this_rtx
, adj
));
2109 aarch64_add_constant (machine_mode mode
, int regnum
, int scratchreg
,
2110 HOST_WIDE_INT delta
)
2112 aarch64_add_constant_internal (mode
, regnum
, scratchreg
, delta
, false, true);
2116 aarch64_add_sp (int scratchreg
, HOST_WIDE_INT delta
, bool emit_move_imm
)
2118 aarch64_add_constant_internal (Pmode
, SP_REGNUM
, scratchreg
, delta
,
2119 true, emit_move_imm
);
2123 aarch64_sub_sp (int scratchreg
, HOST_WIDE_INT delta
, bool frame_related_p
)
2125 aarch64_add_constant_internal (Pmode
, SP_REGNUM
, scratchreg
, -delta
,
2126 frame_related_p
, true);
2130 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED
,
2131 tree exp ATTRIBUTE_UNUSED
)
2133 /* Currently, always true. */
2137 /* Implement TARGET_PASS_BY_REFERENCE. */
2140 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED
,
2143 bool named ATTRIBUTE_UNUSED
)
2146 machine_mode dummymode
;
2149 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2150 size
= (mode
== BLKmode
&& type
)
2151 ? int_size_in_bytes (type
) : (int) GET_MODE_SIZE (mode
);
2153 /* Aggregates are passed by reference based on their size. */
2154 if (type
&& AGGREGATE_TYPE_P (type
))
2156 size
= int_size_in_bytes (type
);
2159 /* Variable sized arguments are always returned by reference. */
2163 /* Can this be a candidate to be passed in fp/simd register(s)? */
2164 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
2169 /* Arguments which are variable sized or larger than 2 registers are
2170 passed by reference unless they are a homogenous floating point
2172 return size
> 2 * UNITS_PER_WORD
;
2175 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2177 aarch64_return_in_msb (const_tree valtype
)
2179 machine_mode dummy_mode
;
2182 /* Never happens in little-endian mode. */
2183 if (!BYTES_BIG_ENDIAN
)
2186 /* Only composite types smaller than or equal to 16 bytes can
2187 be potentially returned in registers. */
2188 if (!aarch64_composite_type_p (valtype
, TYPE_MODE (valtype
))
2189 || int_size_in_bytes (valtype
) <= 0
2190 || int_size_in_bytes (valtype
) > 16)
2193 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2194 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2195 is always passed/returned in the least significant bits of fp/simd
2197 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype
), valtype
,
2198 &dummy_mode
, &dummy_int
, NULL
))
2204 /* Implement TARGET_FUNCTION_VALUE.
2205 Define how to find the value returned by a function. */
2208 aarch64_function_value (const_tree type
, const_tree func
,
2209 bool outgoing ATTRIBUTE_UNUSED
)
2214 machine_mode ag_mode
;
2216 mode
= TYPE_MODE (type
);
2217 if (INTEGRAL_TYPE_P (type
))
2218 mode
= promote_function_mode (type
, mode
, &unsignedp
, func
, 1);
2220 if (aarch64_return_in_msb (type
))
2222 HOST_WIDE_INT size
= int_size_in_bytes (type
);
2224 if (size
% UNITS_PER_WORD
!= 0)
2226 size
+= UNITS_PER_WORD
- size
% UNITS_PER_WORD
;
2227 mode
= mode_for_size (size
* BITS_PER_UNIT
, MODE_INT
, 0);
2231 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
2232 &ag_mode
, &count
, NULL
))
2234 if (!aarch64_composite_type_p (type
, mode
))
2236 gcc_assert (count
== 1 && mode
== ag_mode
);
2237 return gen_rtx_REG (mode
, V0_REGNUM
);
2244 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (count
));
2245 for (i
= 0; i
< count
; i
++)
2247 rtx tmp
= gen_rtx_REG (ag_mode
, V0_REGNUM
+ i
);
2248 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
2249 GEN_INT (i
* GET_MODE_SIZE (ag_mode
)));
2250 XVECEXP (par
, 0, i
) = tmp
;
2256 return gen_rtx_REG (mode
, R0_REGNUM
);
2259 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2260 Return true if REGNO is the number of a hard register in which the values
2261 of called function may come back. */
2264 aarch64_function_value_regno_p (const unsigned int regno
)
2266 /* Maximum of 16 bytes can be returned in the general registers. Examples
2267 of 16-byte return values are: 128-bit integers and 16-byte small
2268 structures (excluding homogeneous floating-point aggregates). */
2269 if (regno
== R0_REGNUM
|| regno
== R1_REGNUM
)
2272 /* Up to four fp/simd registers can return a function value, e.g. a
2273 homogeneous floating-point aggregate having four members. */
2274 if (regno
>= V0_REGNUM
&& regno
< V0_REGNUM
+ HA_MAX_NUM_FLDS
)
2275 return TARGET_FLOAT
;
2280 /* Implement TARGET_RETURN_IN_MEMORY.
2282 If the type T of the result of a function is such that
2284 would require that arg be passed as a value in a register (or set of
2285 registers) according to the parameter passing rules, then the result
2286 is returned in the same registers as would be used for such an
2290 aarch64_return_in_memory (const_tree type
, const_tree fndecl ATTRIBUTE_UNUSED
)
2293 machine_mode ag_mode
;
2296 if (!AGGREGATE_TYPE_P (type
)
2297 && TREE_CODE (type
) != COMPLEX_TYPE
2298 && TREE_CODE (type
) != VECTOR_TYPE
)
2299 /* Simple scalar types always returned in registers. */
2302 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
),
2309 /* Types larger than 2 registers returned in memory. */
2310 size
= int_size_in_bytes (type
);
2311 return (size
< 0 || size
> 2 * UNITS_PER_WORD
);
2315 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v
, machine_mode mode
,
2316 const_tree type
, int *nregs
)
2318 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
2319 return aarch64_vfp_is_call_or_return_candidate (mode
,
2321 &pcum
->aapcs_vfp_rmode
,
2326 /* Given MODE and TYPE of a function argument, return the alignment in
2327 bits. The idea is to suppress any stronger alignment requested by
2328 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2329 This is a helper function for local use only. */
2332 aarch64_function_arg_alignment (machine_mode mode
, const_tree type
)
2335 return GET_MODE_ALIGNMENT (mode
);
2337 if (integer_zerop (TYPE_SIZE (type
)))
2340 gcc_assert (TYPE_MODE (type
) == mode
);
2342 if (!AGGREGATE_TYPE_P (type
))
2343 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type
));
2345 if (TREE_CODE (type
) == ARRAY_TYPE
)
2346 return TYPE_ALIGN (TREE_TYPE (type
));
2348 unsigned int alignment
= 0;
2349 for (tree field
= TYPE_FIELDS (type
); field
; field
= DECL_CHAIN (field
))
2350 if (TREE_CODE (field
) == FIELD_DECL
)
2351 alignment
= std::max (alignment
, DECL_ALIGN (field
));
2356 /* Layout a function argument according to the AAPCS64 rules. The rule
2357 numbers refer to the rule numbers in the AAPCS64. */
2360 aarch64_layout_arg (cumulative_args_t pcum_v
, machine_mode mode
,
2362 bool named ATTRIBUTE_UNUSED
)
2364 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
2365 int ncrn
, nvrn
, nregs
;
2366 bool allocate_ncrn
, allocate_nvrn
;
2369 /* We need to do this once per argument. */
2370 if (pcum
->aapcs_arg_processed
)
2373 pcum
->aapcs_arg_processed
= true;
2375 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
2377 = ROUND_UP (type
? int_size_in_bytes (type
) : GET_MODE_SIZE (mode
),
2380 allocate_ncrn
= (type
) ? !(FLOAT_TYPE_P (type
)) : !FLOAT_MODE_P (mode
);
2381 allocate_nvrn
= aarch64_vfp_is_call_candidate (pcum_v
,
2386 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
2387 The following code thus handles passing by SIMD/FP registers first. */
2389 nvrn
= pcum
->aapcs_nvrn
;
2391 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
2392 and homogenous short-vector aggregates (HVA). */
2396 aarch64_err_no_fpadvsimd (mode
, "argument");
2398 if (nvrn
+ nregs
<= NUM_FP_ARG_REGS
)
2400 pcum
->aapcs_nextnvrn
= nvrn
+ nregs
;
2401 if (!aarch64_composite_type_p (type
, mode
))
2403 gcc_assert (nregs
== 1);
2404 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
2410 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
2411 for (i
= 0; i
< nregs
; i
++)
2413 rtx tmp
= gen_rtx_REG (pcum
->aapcs_vfp_rmode
,
2414 V0_REGNUM
+ nvrn
+ i
);
2415 tmp
= gen_rtx_EXPR_LIST
2417 GEN_INT (i
* GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
)));
2418 XVECEXP (par
, 0, i
) = tmp
;
2420 pcum
->aapcs_reg
= par
;
2426 /* C.3 NSRN is set to 8. */
2427 pcum
->aapcs_nextnvrn
= NUM_FP_ARG_REGS
;
2432 ncrn
= pcum
->aapcs_ncrn
;
2433 nregs
= size
/ UNITS_PER_WORD
;
2435 /* C6 - C9. though the sign and zero extension semantics are
2436 handled elsewhere. This is the case where the argument fits
2437 entirely general registers. */
2438 if (allocate_ncrn
&& (ncrn
+ nregs
<= NUM_ARG_REGS
))
2441 gcc_assert (nregs
== 0 || nregs
== 1 || nregs
== 2);
2443 /* C.8 if the argument has an alignment of 16 then the NGRN is
2444 rounded up to the next even number. */
2447 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
2448 comparison is there because for > 16 * BITS_PER_UNIT
2449 alignment nregs should be > 2 and therefore it should be
2450 passed by reference rather than value. */
2451 && aarch64_function_arg_alignment (mode
, type
) == 16 * BITS_PER_UNIT
)
2454 gcc_assert (ncrn
+ nregs
<= NUM_ARG_REGS
);
2457 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2458 A reg is still generated for it, but the caller should be smart
2459 enough not to use it. */
2460 if (nregs
== 0 || nregs
== 1 || GET_MODE_CLASS (mode
) == MODE_INT
)
2461 pcum
->aapcs_reg
= gen_rtx_REG (mode
, R0_REGNUM
+ ncrn
);
2467 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
2468 for (i
= 0; i
< nregs
; i
++)
2470 rtx tmp
= gen_rtx_REG (word_mode
, R0_REGNUM
+ ncrn
+ i
);
2471 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
2472 GEN_INT (i
* UNITS_PER_WORD
));
2473 XVECEXP (par
, 0, i
) = tmp
;
2475 pcum
->aapcs_reg
= par
;
2478 pcum
->aapcs_nextncrn
= ncrn
+ nregs
;
2483 pcum
->aapcs_nextncrn
= NUM_ARG_REGS
;
2485 /* The argument is passed on stack; record the needed number of words for
2486 this argument and align the total size if necessary. */
2488 pcum
->aapcs_stack_words
= size
/ UNITS_PER_WORD
;
2490 if (aarch64_function_arg_alignment (mode
, type
) == 16 * BITS_PER_UNIT
)
2491 pcum
->aapcs_stack_size
= ROUND_UP (pcum
->aapcs_stack_size
,
2492 16 / UNITS_PER_WORD
);
2496 /* Implement TARGET_FUNCTION_ARG. */
2499 aarch64_function_arg (cumulative_args_t pcum_v
, machine_mode mode
,
2500 const_tree type
, bool named
)
2502 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
2503 gcc_assert (pcum
->pcs_variant
== ARM_PCS_AAPCS64
);
2505 if (mode
== VOIDmode
)
2508 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
2509 return pcum
->aapcs_reg
;
2513 aarch64_init_cumulative_args (CUMULATIVE_ARGS
*pcum
,
2514 const_tree fntype ATTRIBUTE_UNUSED
,
2515 rtx libname ATTRIBUTE_UNUSED
,
2516 const_tree fndecl ATTRIBUTE_UNUSED
,
2517 unsigned n_named ATTRIBUTE_UNUSED
)
2519 pcum
->aapcs_ncrn
= 0;
2520 pcum
->aapcs_nvrn
= 0;
2521 pcum
->aapcs_nextncrn
= 0;
2522 pcum
->aapcs_nextnvrn
= 0;
2523 pcum
->pcs_variant
= ARM_PCS_AAPCS64
;
2524 pcum
->aapcs_reg
= NULL_RTX
;
2525 pcum
->aapcs_arg_processed
= false;
2526 pcum
->aapcs_stack_words
= 0;
2527 pcum
->aapcs_stack_size
= 0;
2530 && fndecl
&& TREE_PUBLIC (fndecl
)
2531 && fntype
&& fntype
!= error_mark_node
)
2533 const_tree type
= TREE_TYPE (fntype
);
2534 machine_mode mode ATTRIBUTE_UNUSED
; /* To pass pointer as argument. */
2535 int nregs ATTRIBUTE_UNUSED
; /* Likewise. */
2536 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
), type
,
2537 &mode
, &nregs
, NULL
))
2538 aarch64_err_no_fpadvsimd (TYPE_MODE (type
), "return type");
2544 aarch64_function_arg_advance (cumulative_args_t pcum_v
,
2549 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
2550 if (pcum
->pcs_variant
== ARM_PCS_AAPCS64
)
2552 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
2553 gcc_assert ((pcum
->aapcs_reg
!= NULL_RTX
)
2554 != (pcum
->aapcs_stack_words
!= 0));
2555 pcum
->aapcs_arg_processed
= false;
2556 pcum
->aapcs_ncrn
= pcum
->aapcs_nextncrn
;
2557 pcum
->aapcs_nvrn
= pcum
->aapcs_nextnvrn
;
2558 pcum
->aapcs_stack_size
+= pcum
->aapcs_stack_words
;
2559 pcum
->aapcs_stack_words
= 0;
2560 pcum
->aapcs_reg
= NULL_RTX
;
2565 aarch64_function_arg_regno_p (unsigned regno
)
2567 return ((GP_REGNUM_P (regno
) && regno
< R0_REGNUM
+ NUM_ARG_REGS
)
2568 || (FP_REGNUM_P (regno
) && regno
< V0_REGNUM
+ NUM_FP_ARG_REGS
));
2571 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2572 PARM_BOUNDARY bits of alignment, but will be given anything up
2573 to STACK_BOUNDARY bits if the type requires it. This makes sure
2574 that both before and after the layout of each argument, the Next
2575 Stacked Argument Address (NSAA) will have a minimum alignment of
2579 aarch64_function_arg_boundary (machine_mode mode
, const_tree type
)
2581 unsigned int alignment
= aarch64_function_arg_alignment (mode
, type
);
2582 return MIN (MAX (alignment
, PARM_BOUNDARY
), STACK_BOUNDARY
);
2585 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2587 Return true if an argument passed on the stack should be padded upwards,
2588 i.e. if the least-significant byte of the stack slot has useful data.
2590 Small aggregate types are placed in the lowest memory address.
2592 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2595 aarch64_pad_arg_upward (machine_mode mode
, const_tree type
)
2597 /* On little-endian targets, the least significant byte of every stack
2598 argument is passed at the lowest byte address of the stack slot. */
2599 if (!BYTES_BIG_ENDIAN
)
2602 /* Otherwise, integral, floating-point and pointer types are padded downward:
2603 the least significant byte of a stack argument is passed at the highest
2604 byte address of the stack slot. */
2606 ? (INTEGRAL_TYPE_P (type
) || SCALAR_FLOAT_TYPE_P (type
)
2607 || POINTER_TYPE_P (type
))
2608 : (SCALAR_INT_MODE_P (mode
) || SCALAR_FLOAT_MODE_P (mode
)))
2611 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2615 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2617 It specifies padding for the last (may also be the only)
2618 element of a block move between registers and memory. If
2619 assuming the block is in the memory, padding upward means that
2620 the last element is padded after its highest significant byte,
2621 while in downward padding, the last element is padded at the
2622 its least significant byte side.
2624 Small aggregates and small complex types are always padded
2627 We don't need to worry about homogeneous floating-point or
2628 short-vector aggregates; their move is not affected by the
2629 padding direction determined here. Regardless of endianness,
2630 each element of such an aggregate is put in the least
2631 significant bits of a fp/simd register.
2633 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2634 register has useful data, and return the opposite if the most
2635 significant byte does. */
2638 aarch64_pad_reg_upward (machine_mode mode
, const_tree type
,
2639 bool first ATTRIBUTE_UNUSED
)
2642 /* Small composite types are always padded upward. */
2643 if (BYTES_BIG_ENDIAN
&& aarch64_composite_type_p (type
, mode
))
2645 HOST_WIDE_INT size
= (type
? int_size_in_bytes (type
)
2646 : GET_MODE_SIZE (mode
));
2647 if (size
< 2 * UNITS_PER_WORD
)
2651 /* Otherwise, use the default padding. */
2652 return !BYTES_BIG_ENDIAN
;
2656 aarch64_libgcc_cmp_return_mode (void)
2661 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2663 /* We use the 12-bit shifted immediate arithmetic instructions so values
2664 must be multiple of (1 << 12), i.e. 4096. */
2665 #define ARITH_FACTOR 4096
2667 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2668 #error Cannot use simple address calculation for stack probing
2671 /* The pair of scratch registers used for stack probing. */
2672 #define PROBE_STACK_FIRST_REG 9
2673 #define PROBE_STACK_SECOND_REG 10
2675 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2676 inclusive. These are offsets from the current stack pointer. */
2679 aarch64_emit_probe_stack_range (HOST_WIDE_INT first
, HOST_WIDE_INT size
)
2681 rtx reg1
= gen_rtx_REG (Pmode
, PROBE_STACK_FIRST_REG
);
2683 /* See the same assertion on PROBE_INTERVAL above. */
2684 gcc_assert ((first
% ARITH_FACTOR
) == 0);
2686 /* See if we have a constant small number of probes to generate. If so,
2687 that's the easy case. */
2688 if (size
<= PROBE_INTERVAL
)
2690 const HOST_WIDE_INT base
= ROUND_UP (size
, ARITH_FACTOR
);
2692 emit_set_insn (reg1
,
2693 plus_constant (Pmode
,
2694 stack_pointer_rtx
, -(first
+ base
)));
2695 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- size
));
2698 /* The run-time loop is made up of 8 insns in the generic case while the
2699 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
2700 else if (size
<= 4 * PROBE_INTERVAL
)
2702 HOST_WIDE_INT i
, rem
;
2704 emit_set_insn (reg1
,
2705 plus_constant (Pmode
,
2707 -(first
+ PROBE_INTERVAL
)));
2708 emit_stack_probe (reg1
);
2710 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2711 it exceeds SIZE. If only two probes are needed, this will not
2712 generate any code. Then probe at FIRST + SIZE. */
2713 for (i
= 2 * PROBE_INTERVAL
; i
< size
; i
+= PROBE_INTERVAL
)
2715 emit_set_insn (reg1
,
2716 plus_constant (Pmode
, reg1
, -PROBE_INTERVAL
));
2717 emit_stack_probe (reg1
);
2720 rem
= size
- (i
- PROBE_INTERVAL
);
2723 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
2725 emit_set_insn (reg1
, plus_constant (Pmode
, reg1
, -base
));
2726 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- rem
));
2729 emit_stack_probe (plus_constant (Pmode
, reg1
, -rem
));
2732 /* Otherwise, do the same as above, but in a loop. Note that we must be
2733 extra careful with variables wrapping around because we might be at
2734 the very top (or the very bottom) of the address space and we have
2735 to be able to handle this case properly; in particular, we use an
2736 equality test for the loop condition. */
2739 rtx reg2
= gen_rtx_REG (Pmode
, PROBE_STACK_SECOND_REG
);
2741 /* Step 1: round SIZE to the previous multiple of the interval. */
2743 HOST_WIDE_INT rounded_size
= size
& -PROBE_INTERVAL
;
2746 /* Step 2: compute initial and final value of the loop counter. */
2748 /* TEST_ADDR = SP + FIRST. */
2749 emit_set_insn (reg1
,
2750 plus_constant (Pmode
, stack_pointer_rtx
, -first
));
2752 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2753 HOST_WIDE_INT adjustment
= - (first
+ rounded_size
);
2754 if (! aarch64_uimm12_shift (adjustment
))
2756 aarch64_internal_mov_immediate (reg2
, GEN_INT (adjustment
),
2758 emit_set_insn (reg2
, gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, reg2
));
2762 emit_set_insn (reg2
,
2763 plus_constant (Pmode
, stack_pointer_rtx
, adjustment
));
2770 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2773 while (TEST_ADDR != LAST_ADDR)
2775 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2776 until it is equal to ROUNDED_SIZE. */
2778 emit_insn (gen_probe_stack_range (reg1
, reg1
, reg2
));
2781 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2782 that SIZE is equal to ROUNDED_SIZE. */
2784 if (size
!= rounded_size
)
2786 HOST_WIDE_INT rem
= size
- rounded_size
;
2790 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
2792 emit_set_insn (reg2
, plus_constant (Pmode
, reg2
, -base
));
2793 emit_stack_probe (plus_constant (Pmode
, reg2
, base
- rem
));
2796 emit_stack_probe (plus_constant (Pmode
, reg2
, -rem
));
2800 /* Make sure nothing is scheduled before we are done. */
2801 emit_insn (gen_blockage ());
2804 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2805 absolute addresses. */
2808 aarch64_output_probe_stack_range (rtx reg1
, rtx reg2
)
2810 static int labelno
= 0;
2814 ASM_GENERATE_INTERNAL_LABEL (loop_lab
, "LPSRL", labelno
++);
2817 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_lab
);
2819 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2821 xops
[1] = GEN_INT (PROBE_INTERVAL
);
2822 output_asm_insn ("sub\t%0, %0, %1", xops
);
2824 /* Probe at TEST_ADDR. */
2825 output_asm_insn ("str\txzr, [%0]", xops
);
2827 /* Test if TEST_ADDR == LAST_ADDR. */
2829 output_asm_insn ("cmp\t%0, %1", xops
);
2832 fputs ("\tb.ne\t", asm_out_file
);
2833 assemble_name_raw (asm_out_file
, loop_lab
);
2834 fputc ('\n', asm_out_file
);
2840 aarch64_frame_pointer_required (void)
2842 /* In aarch64_override_options_after_change
2843 flag_omit_leaf_frame_pointer turns off the frame pointer by
2844 default. Turn it back on now if we've not got a leaf
2846 if (flag_omit_leaf_frame_pointer
2847 && (!crtl
->is_leaf
|| df_regs_ever_live_p (LR_REGNUM
)))
2850 /* Force a frame pointer for EH returns so the return address is at FP+8. */
2851 if (crtl
->calls_eh_return
)
2857 /* Mark the registers that need to be saved by the callee and calculate
2858 the size of the callee-saved registers area and frame record (both FP
2859 and LR may be omitted). */
2861 aarch64_layout_frame (void)
2863 HOST_WIDE_INT offset
= 0;
2864 int regno
, last_fp_reg
= INVALID_REGNUM
;
2866 if (reload_completed
&& cfun
->machine
->frame
.laid_out
)
2869 #define SLOT_NOT_REQUIRED (-2)
2870 #define SLOT_REQUIRED (-1)
2872 cfun
->machine
->frame
.wb_candidate1
= INVALID_REGNUM
;
2873 cfun
->machine
->frame
.wb_candidate2
= INVALID_REGNUM
;
2875 /* First mark all the registers that really need to be saved... */
2876 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
2877 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
2879 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
2880 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
2882 /* ... that includes the eh data registers (if needed)... */
2883 if (crtl
->calls_eh_return
)
2884 for (regno
= 0; EH_RETURN_DATA_REGNO (regno
) != INVALID_REGNUM
; regno
++)
2885 cfun
->machine
->frame
.reg_offset
[EH_RETURN_DATA_REGNO (regno
)]
2888 /* ... and any callee saved register that dataflow says is live. */
2889 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
2890 if (df_regs_ever_live_p (regno
)
2891 && (regno
== R30_REGNUM
2892 || !call_used_regs
[regno
]))
2893 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
2895 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
2896 if (df_regs_ever_live_p (regno
)
2897 && !call_used_regs
[regno
])
2899 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
2900 last_fp_reg
= regno
;
2903 if (frame_pointer_needed
)
2905 /* FP and LR are placed in the linkage record. */
2906 cfun
->machine
->frame
.reg_offset
[R29_REGNUM
] = 0;
2907 cfun
->machine
->frame
.wb_candidate1
= R29_REGNUM
;
2908 cfun
->machine
->frame
.reg_offset
[R30_REGNUM
] = UNITS_PER_WORD
;
2909 cfun
->machine
->frame
.wb_candidate2
= R30_REGNUM
;
2910 offset
+= 2 * UNITS_PER_WORD
;
2913 /* Now assign stack slots for them. */
2914 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
2915 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
2917 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
2918 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
2919 cfun
->machine
->frame
.wb_candidate1
= regno
;
2920 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
)
2921 cfun
->machine
->frame
.wb_candidate2
= regno
;
2922 offset
+= UNITS_PER_WORD
;
2925 HOST_WIDE_INT max_int_offset
= offset
;
2926 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
2927 bool has_align_gap
= offset
!= max_int_offset
;
2929 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
2930 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
2932 /* If there is an alignment gap between integer and fp callee-saves,
2933 allocate the last fp register to it if possible. */
2934 if (regno
== last_fp_reg
&& has_align_gap
&& (offset
& 8) == 0)
2936 cfun
->machine
->frame
.reg_offset
[regno
] = max_int_offset
;
2940 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
2941 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
2942 cfun
->machine
->frame
.wb_candidate1
= regno
;
2943 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
2944 && cfun
->machine
->frame
.wb_candidate1
>= V0_REGNUM
)
2945 cfun
->machine
->frame
.wb_candidate2
= regno
;
2946 offset
+= UNITS_PER_WORD
;
2949 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
2951 cfun
->machine
->frame
.saved_regs_size
= offset
;
2953 HOST_WIDE_INT varargs_and_saved_regs_size
2954 = offset
+ cfun
->machine
->frame
.saved_varargs_size
;
2956 cfun
->machine
->frame
.hard_fp_offset
2957 = ROUND_UP (varargs_and_saved_regs_size
+ get_frame_size (),
2958 STACK_BOUNDARY
/ BITS_PER_UNIT
);
2960 cfun
->machine
->frame
.frame_size
2961 = ROUND_UP (cfun
->machine
->frame
.hard_fp_offset
2962 + crtl
->outgoing_args_size
,
2963 STACK_BOUNDARY
/ BITS_PER_UNIT
);
2965 cfun
->machine
->frame
.locals_offset
= cfun
->machine
->frame
.saved_varargs_size
;
2967 cfun
->machine
->frame
.initial_adjust
= 0;
2968 cfun
->machine
->frame
.final_adjust
= 0;
2969 cfun
->machine
->frame
.callee_adjust
= 0;
2970 cfun
->machine
->frame
.callee_offset
= 0;
2972 HOST_WIDE_INT max_push_offset
= 0;
2973 if (cfun
->machine
->frame
.wb_candidate2
!= INVALID_REGNUM
)
2974 max_push_offset
= 512;
2975 else if (cfun
->machine
->frame
.wb_candidate1
!= INVALID_REGNUM
)
2976 max_push_offset
= 256;
2978 if (cfun
->machine
->frame
.frame_size
< max_push_offset
2979 && crtl
->outgoing_args_size
== 0)
2981 /* Simple, small frame with no outgoing arguments:
2982 stp reg1, reg2, [sp, -frame_size]!
2983 stp reg3, reg4, [sp, 16] */
2984 cfun
->machine
->frame
.callee_adjust
= cfun
->machine
->frame
.frame_size
;
2986 else if ((crtl
->outgoing_args_size
2987 + cfun
->machine
->frame
.saved_regs_size
< 512)
2988 && !(cfun
->calls_alloca
2989 && cfun
->machine
->frame
.hard_fp_offset
< max_push_offset
))
2991 /* Frame with small outgoing arguments:
2992 sub sp, sp, frame_size
2993 stp reg1, reg2, [sp, outgoing_args_size]
2994 stp reg3, reg4, [sp, outgoing_args_size + 16] */
2995 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.frame_size
;
2996 cfun
->machine
->frame
.callee_offset
2997 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.hard_fp_offset
;
2999 else if (cfun
->machine
->frame
.hard_fp_offset
< max_push_offset
)
3001 /* Frame with large outgoing arguments but a small local area:
3002 stp reg1, reg2, [sp, -hard_fp_offset]!
3003 stp reg3, reg4, [sp, 16]
3004 sub sp, sp, outgoing_args_size */
3005 cfun
->machine
->frame
.callee_adjust
= cfun
->machine
->frame
.hard_fp_offset
;
3006 cfun
->machine
->frame
.final_adjust
3007 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.callee_adjust
;
3009 else if (!frame_pointer_needed
3010 && varargs_and_saved_regs_size
< max_push_offset
)
3012 /* Frame with large local area and outgoing arguments (this pushes the
3013 callee-saves first, followed by the locals and outgoing area):
3014 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
3015 stp reg3, reg4, [sp, 16]
3016 sub sp, sp, frame_size - varargs_and_saved_regs_size */
3017 cfun
->machine
->frame
.callee_adjust
= varargs_and_saved_regs_size
;
3018 cfun
->machine
->frame
.final_adjust
3019 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.callee_adjust
;
3020 cfun
->machine
->frame
.hard_fp_offset
= cfun
->machine
->frame
.callee_adjust
;
3021 cfun
->machine
->frame
.locals_offset
= cfun
->machine
->frame
.hard_fp_offset
;
3025 /* Frame with large local area and outgoing arguments using frame pointer:
3026 sub sp, sp, hard_fp_offset
3027 stp x29, x30, [sp, 0]
3029 stp reg3, reg4, [sp, 16]
3030 sub sp, sp, outgoing_args_size */
3031 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.hard_fp_offset
;
3032 cfun
->machine
->frame
.final_adjust
3033 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.initial_adjust
;
3036 cfun
->machine
->frame
.laid_out
= true;
3039 /* Return true if the register REGNO is saved on entry to
3040 the current function. */
3043 aarch64_register_saved_on_entry (int regno
)
3045 return cfun
->machine
->frame
.reg_offset
[regno
] >= 0;
/* Return the next register up from REGNO up to LIMIT for the callee
   to save.  Returns a value greater than LIMIT when no further saved
   register exists in the range.  */
static unsigned
aarch64_next_callee_save (unsigned regno, unsigned limit)
{
  while (regno <= limit && !aarch64_register_saved_on_entry (regno))
    regno++;
  return regno;
}
3059 /* Push the register number REGNO of mode MODE to the stack with write-back
3060 adjusting the stack by ADJUSTMENT. */
3063 aarch64_pushwb_single_reg (machine_mode mode
, unsigned regno
,
3064 HOST_WIDE_INT adjustment
)
3066 rtx base_rtx
= stack_pointer_rtx
;
3069 reg
= gen_rtx_REG (mode
, regno
);
3070 mem
= gen_rtx_PRE_MODIFY (Pmode
, base_rtx
,
3071 plus_constant (Pmode
, base_rtx
, -adjustment
));
3072 mem
= gen_rtx_MEM (mode
, mem
);
3074 insn
= emit_move_insn (mem
, reg
);
3075 RTX_FRAME_RELATED_P (insn
) = 1;
3078 /* Generate and return an instruction to store the pair of registers
3079 REG and REG2 of mode MODE to location BASE with write-back adjusting
3080 the stack location BASE by ADJUSTMENT. */
3083 aarch64_gen_storewb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
3084 HOST_WIDE_INT adjustment
)
3089 return gen_storewb_pairdi_di (base
, base
, reg
, reg2
,
3090 GEN_INT (-adjustment
),
3091 GEN_INT (UNITS_PER_WORD
- adjustment
));
3093 return gen_storewb_pairdf_di (base
, base
, reg
, reg2
,
3094 GEN_INT (-adjustment
),
3095 GEN_INT (UNITS_PER_WORD
- adjustment
));
3101 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3102 stack pointer by ADJUSTMENT. */
3105 aarch64_push_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
)
3108 machine_mode mode
= (regno1
<= R30_REGNUM
) ? DImode
: DFmode
;
3110 if (regno2
== INVALID_REGNUM
)
3111 return aarch64_pushwb_single_reg (mode
, regno1
, adjustment
);
3113 rtx reg1
= gen_rtx_REG (mode
, regno1
);
3114 rtx reg2
= gen_rtx_REG (mode
, regno2
);
3116 insn
= emit_insn (aarch64_gen_storewb_pair (mode
, stack_pointer_rtx
, reg1
,
3118 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 2)) = 1;
3119 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
3120 RTX_FRAME_RELATED_P (insn
) = 1;
3123 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
3124 adjusting it by ADJUSTMENT afterwards. */
3127 aarch64_gen_loadwb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
3128 HOST_WIDE_INT adjustment
)
3133 return gen_loadwb_pairdi_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
3134 GEN_INT (UNITS_PER_WORD
));
3136 return gen_loadwb_pairdf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
3137 GEN_INT (UNITS_PER_WORD
));
3143 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3144 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3148 aarch64_pop_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
,
3151 machine_mode mode
= (regno1
<= R30_REGNUM
) ? DImode
: DFmode
;
3152 rtx reg1
= gen_rtx_REG (mode
, regno1
);
3154 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg1
, *cfi_ops
);
3156 if (regno2
== INVALID_REGNUM
)
3158 rtx mem
= plus_constant (Pmode
, stack_pointer_rtx
, adjustment
);
3159 mem
= gen_rtx_POST_MODIFY (Pmode
, stack_pointer_rtx
, mem
);
3160 emit_move_insn (reg1
, gen_rtx_MEM (mode
, mem
));
3164 rtx reg2
= gen_rtx_REG (mode
, regno2
);
3165 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
3166 emit_insn (aarch64_gen_loadwb_pair (mode
, stack_pointer_rtx
, reg1
,
3171 /* Generate and return a store pair instruction of mode MODE to store
3172 register REG1 to MEM1 and register REG2 to MEM2. */
3175 aarch64_gen_store_pair (machine_mode mode
, rtx mem1
, rtx reg1
, rtx mem2
,
3181 return gen_store_pairdi (mem1
, reg1
, mem2
, reg2
);
3184 return gen_store_pairdf (mem1
, reg1
, mem2
, reg2
);
3191 /* Generate and regurn a load pair isntruction of mode MODE to load register
3192 REG1 from MEM1 and register REG2 from MEM2. */
3195 aarch64_gen_load_pair (machine_mode mode
, rtx reg1
, rtx mem1
, rtx reg2
,
3201 return gen_load_pairdi (reg1
, mem1
, reg2
, mem2
);
3204 return gen_load_pairdf (reg1
, mem1
, reg2
, mem2
);
3211 /* Return TRUE if return address signing should be enabled for the current
3212 function, otherwise return FALSE. */
3215 aarch64_return_address_signing_enabled (void)
3217 /* This function should only be called after frame laid out. */
3218 gcc_assert (cfun
->machine
->frame
.laid_out
);
3220 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
3221 if it's LR is pushed onto stack. */
3222 return (aarch64_ra_sign_scope
== AARCH64_FUNCTION_ALL
3223 || (aarch64_ra_sign_scope
== AARCH64_FUNCTION_NON_LEAF
3224 && cfun
->machine
->frame
.reg_offset
[LR_REGNUM
] >= 0));
3227 /* Emit code to save the callee-saved registers from register number START
3228 to LIMIT to the stack at the location starting at offset START_OFFSET,
3229 skipping any write-back candidates if SKIP_WB is true. */
3232 aarch64_save_callee_saves (machine_mode mode
, HOST_WIDE_INT start_offset
,
3233 unsigned start
, unsigned limit
, bool skip_wb
)
3236 rtx (*gen_mem_ref
) (machine_mode
, rtx
) = (frame_pointer_needed
3237 ? gen_frame_mem
: gen_rtx_MEM
);
3241 for (regno
= aarch64_next_callee_save (start
, limit
);
3243 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
3246 HOST_WIDE_INT offset
;
3249 && (regno
== cfun
->machine
->frame
.wb_candidate1
3250 || regno
== cfun
->machine
->frame
.wb_candidate2
))
3253 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
3256 reg
= gen_rtx_REG (mode
, regno
);
3257 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
3258 mem
= gen_mem_ref (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
3261 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
3264 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
3265 && ((cfun
->machine
->frame
.reg_offset
[regno
] + UNITS_PER_WORD
)
3266 == cfun
->machine
->frame
.reg_offset
[regno2
]))
3269 rtx reg2
= gen_rtx_REG (mode
, regno2
);
3272 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
3273 mem2
= gen_mem_ref (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
3275 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
,
3278 /* The first part of a frame-related parallel insn is
3279 always assumed to be relevant to the frame
3280 calculations; subsequent parts, are only
3281 frame-related if explicitly marked. */
3282 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
3286 insn
= emit_move_insn (mem
, reg
);
3288 RTX_FRAME_RELATED_P (insn
) = 1;
3292 /* Emit code to restore the callee registers of mode MODE from register
3293 number START up to and including LIMIT. Restore from the stack offset
3294 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3295 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3298 aarch64_restore_callee_saves (machine_mode mode
,
3299 HOST_WIDE_INT start_offset
, unsigned start
,
3300 unsigned limit
, bool skip_wb
, rtx
*cfi_ops
)
3302 rtx base_rtx
= stack_pointer_rtx
;
3303 rtx (*gen_mem_ref
) (machine_mode
, rtx
) = (frame_pointer_needed
3304 ? gen_frame_mem
: gen_rtx_MEM
);
3307 HOST_WIDE_INT offset
;
3309 for (regno
= aarch64_next_callee_save (start
, limit
);
3311 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
3313 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
3319 && (regno
== cfun
->machine
->frame
.wb_candidate1
3320 || regno
== cfun
->machine
->frame
.wb_candidate2
))
3323 reg
= gen_rtx_REG (mode
, regno
);
3324 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
3325 mem
= gen_mem_ref (mode
, plus_constant (Pmode
, base_rtx
, offset
));
3327 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
3330 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
3331 && ((cfun
->machine
->frame
.reg_offset
[regno
] + UNITS_PER_WORD
)
3332 == cfun
->machine
->frame
.reg_offset
[regno2
]))
3334 rtx reg2
= gen_rtx_REG (mode
, regno2
);
3337 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
3338 mem2
= gen_mem_ref (mode
, plus_constant (Pmode
, base_rtx
, offset
));
3339 emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
3341 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
3345 emit_move_insn (reg
, mem
);
3346 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg
, *cfi_ops
);
3351 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED
,
3352 HOST_WIDE_INT offset
)
3354 return offset
>= -256 && offset
< 256;
3358 offset_12bit_unsigned_scaled_p (machine_mode mode
, HOST_WIDE_INT offset
)
3361 && offset
< 4096 * GET_MODE_SIZE (mode
)
3362 && offset
% GET_MODE_SIZE (mode
) == 0);
3366 aarch64_offset_7bit_signed_scaled_p (machine_mode mode
, HOST_WIDE_INT offset
)
3368 return (offset
>= -64 * GET_MODE_SIZE (mode
)
3369 && offset
< 64 * GET_MODE_SIZE (mode
)
3370 && offset
% GET_MODE_SIZE (mode
) == 0);
3373 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
3376 aarch64_get_separate_components (void)
3378 aarch64_layout_frame ();
3380 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
3381 bitmap_clear (components
);
3383 /* The registers we need saved to the frame. */
3384 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
3385 if (aarch64_register_saved_on_entry (regno
))
3387 HOST_WIDE_INT offset
= cfun
->machine
->frame
.reg_offset
[regno
];
3388 if (!frame_pointer_needed
)
3389 offset
+= cfun
->machine
->frame
.frame_size
3390 - cfun
->machine
->frame
.hard_fp_offset
;
3391 /* Check that we can access the stack slot of the register with one
3392 direct load with no adjustments needed. */
3393 if (offset_12bit_unsigned_scaled_p (DImode
, offset
))
3394 bitmap_set_bit (components
, regno
);
3397 /* Don't mess with the hard frame pointer. */
3398 if (frame_pointer_needed
)
3399 bitmap_clear_bit (components
, HARD_FRAME_POINTER_REGNUM
);
3401 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
3402 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
3403 /* If aarch64_layout_frame has chosen registers to store/restore with
3404 writeback don't interfere with them to avoid having to output explicit
3405 stack adjustment instructions. */
3406 if (reg2
!= INVALID_REGNUM
)
3407 bitmap_clear_bit (components
, reg2
);
3408 if (reg1
!= INVALID_REGNUM
)
3409 bitmap_clear_bit (components
, reg1
);
3411 bitmap_clear_bit (components
, LR_REGNUM
);
3412 bitmap_clear_bit (components
, SP_REGNUM
);
3417 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
3420 aarch64_components_for_bb (basic_block bb
)
3422 bitmap in
= DF_LIVE_IN (bb
);
3423 bitmap gen
= &DF_LIVE_BB_INFO (bb
)->gen
;
3424 bitmap kill
= &DF_LIVE_BB_INFO (bb
)->kill
;
3426 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
3427 bitmap_clear (components
);
3429 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
3430 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
3431 if ((!call_used_regs
[regno
])
3432 && (bitmap_bit_p (in
, regno
)
3433 || bitmap_bit_p (gen
, regno
)
3434 || bitmap_bit_p (kill
, regno
)))
3435 bitmap_set_bit (components
, regno
);
3440 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3441 Nothing to do for aarch64. */
3444 aarch64_disqualify_components (sbitmap
, edge
, sbitmap
, bool)
3448 /* Return the next set bit in BMP from START onwards. Return the total number
3449 of bits in BMP if no set bit is found at or after START. */
3452 aarch64_get_next_set_bit (sbitmap bmp
, unsigned int start
)
3454 unsigned int nbits
= SBITMAP_SIZE (bmp
);
3458 gcc_assert (start
< nbits
);
3459 for (unsigned int i
= start
; i
< nbits
; i
++)
3460 if (bitmap_bit_p (bmp
, i
))
3466 /* Do the work for aarch64_emit_prologue_components and
3467 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
3468 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3469 for these components or the epilogue sequence. That is, it determines
3470 whether we should emit stores or loads and what kind of CFA notes to attach
3471 to the insns. Otherwise the logic for the two sequences is very
3475 aarch64_process_components (sbitmap components
, bool prologue_p
)
3477 rtx ptr_reg
= gen_rtx_REG (Pmode
, frame_pointer_needed
3478 ? HARD_FRAME_POINTER_REGNUM
3479 : STACK_POINTER_REGNUM
);
3481 unsigned last_regno
= SBITMAP_SIZE (components
);
3482 unsigned regno
= aarch64_get_next_set_bit (components
, R0_REGNUM
);
3483 rtx_insn
*insn
= NULL
;
3485 while (regno
!= last_regno
)
3487 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
3488 so DFmode for the vector registers is enough. */
3489 machine_mode mode
= GP_REGNUM_P (regno
) ? DImode
: DFmode
;
3490 rtx reg
= gen_rtx_REG (mode
, regno
);
3491 HOST_WIDE_INT offset
= cfun
->machine
->frame
.reg_offset
[regno
];
3492 if (!frame_pointer_needed
)
3493 offset
+= cfun
->machine
->frame
.frame_size
3494 - cfun
->machine
->frame
.hard_fp_offset
;
3495 rtx addr
= plus_constant (Pmode
, ptr_reg
, offset
);
3496 rtx mem
= gen_frame_mem (mode
, addr
);
3498 rtx set
= prologue_p
? gen_rtx_SET (mem
, reg
) : gen_rtx_SET (reg
, mem
);
3499 unsigned regno2
= aarch64_get_next_set_bit (components
, regno
+ 1);
3500 /* No more registers to handle after REGNO.
3501 Emit a single save/restore and exit. */
3502 if (regno2
== last_regno
)
3504 insn
= emit_insn (set
);
3505 RTX_FRAME_RELATED_P (insn
) = 1;
3507 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
3509 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
3513 HOST_WIDE_INT offset2
= cfun
->machine
->frame
.reg_offset
[regno2
];
3514 /* The next register is not of the same class or its offset is not
3515 mergeable with the current one into a pair. */
3516 if (!satisfies_constraint_Ump (mem
)
3517 || GP_REGNUM_P (regno
) != GP_REGNUM_P (regno2
)
3518 || (offset2
- cfun
->machine
->frame
.reg_offset
[regno
])
3519 != GET_MODE_SIZE (mode
))
3521 insn
= emit_insn (set
);
3522 RTX_FRAME_RELATED_P (insn
) = 1;
3524 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
3526 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
3532 /* REGNO2 can be saved/restored in a pair with REGNO. */
3533 rtx reg2
= gen_rtx_REG (mode
, regno2
);
3534 if (!frame_pointer_needed
)
3535 offset2
+= cfun
->machine
->frame
.frame_size
3536 - cfun
->machine
->frame
.hard_fp_offset
;
3537 rtx addr2
= plus_constant (Pmode
, ptr_reg
, offset2
);
3538 rtx mem2
= gen_frame_mem (mode
, addr2
);
3539 rtx set2
= prologue_p
? gen_rtx_SET (mem2
, reg2
)
3540 : gen_rtx_SET (reg2
, mem2
);
3543 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
, reg2
));
3545 insn
= emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
3547 RTX_FRAME_RELATED_P (insn
) = 1;
3550 add_reg_note (insn
, REG_CFA_OFFSET
, set
);
3551 add_reg_note (insn
, REG_CFA_OFFSET
, set2
);
3555 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
3556 add_reg_note (insn
, REG_CFA_RESTORE
, reg2
);
3559 regno
= aarch64_get_next_set_bit (components
, regno2
+ 1);
3563 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
3566 aarch64_emit_prologue_components (sbitmap components
)
3568 aarch64_process_components (components
, true);
3571 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
3574 aarch64_emit_epilogue_components (sbitmap components
)
3576 aarch64_process_components (components
, false);
3579 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
3582 aarch64_set_handled_components (sbitmap components
)
3584 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
3585 if (bitmap_bit_p (components
, regno
))
3586 cfun
->machine
->reg_is_wrapped_separately
[regno
] = true;
/* AArch64 stack frames generated by this compiler look like:

	+-------------------------------+
	|                               |
	|  incoming stack arguments     |
	|                               |
	+-------------------------------+
	|                               | <-- incoming stack pointer (aligned)
	|  callee-allocated save area   |
	|  for register varargs         |
	|                               |
	+-------------------------------+
	|  local variables              | <-- frame_pointer_rtx
	|                               |
	+-------------------------------+
	|  padding0                     | \
	+-------------------------------+  |
	|  callee-saved registers       |  | frame.saved_regs_size
	+-------------------------------+  |
	|  LR'                          |  |
	+-------------------------------+  |
	|  FP'                          | / <- hard_frame_pointer_rtx (aligned)
	+-------------------------------+
	|  dynamic allocation           |
	+-------------------------------+
	|  padding                      |
	+-------------------------------+
	|  outgoing stack arguments     | <-- arg_pointer
	|                               |
	+-------------------------------+
	|                               | <-- stack_pointer_rtx (aligned)

   Dynamic stack allocations via alloca() decrease stack_pointer_rtx
   but leave frame_pointer_rtx and hard_frame_pointer_rtx
   unchanged.  */
3625 /* Generate the prologue instructions for entry into a function.
3626 Establish the stack frame by decreasing the stack pointer with a
3627 properly calculated size and, if necessary, create a frame record
3628 filled with the values of LR and previous frame pointer. The
3629 current FP is also set up if it is in use. */
3632 aarch64_expand_prologue (void)
3634 aarch64_layout_frame ();
3636 HOST_WIDE_INT frame_size
= cfun
->machine
->frame
.frame_size
;
3637 HOST_WIDE_INT initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
3638 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
3639 HOST_WIDE_INT final_adjust
= cfun
->machine
->frame
.final_adjust
;
3640 HOST_WIDE_INT callee_offset
= cfun
->machine
->frame
.callee_offset
;
3641 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
3642 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
3645 /* Sign return address for functions. */
3646 if (aarch64_return_address_signing_enabled ())
3648 insn
= emit_insn (gen_pacisp ());
3649 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
3650 RTX_FRAME_RELATED_P (insn
) = 1;
3653 if (flag_stack_usage_info
)
3654 current_function_static_stack_size
= frame_size
;
3656 if (flag_stack_check
== STATIC_BUILTIN_STACK_CHECK
)
3658 if (crtl
->is_leaf
&& !cfun
->calls_alloca
)
3660 if (frame_size
> PROBE_INTERVAL
&& frame_size
> STACK_CHECK_PROTECT
)
3661 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT
,
3662 frame_size
- STACK_CHECK_PROTECT
);
3664 else if (frame_size
> 0)
3665 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT
, frame_size
);
3668 aarch64_sub_sp (IP0_REGNUM
, initial_adjust
, true);
3670 if (callee_adjust
!= 0)
3671 aarch64_push_regs (reg1
, reg2
, callee_adjust
);
3673 if (frame_pointer_needed
)
3675 if (callee_adjust
== 0)
3676 aarch64_save_callee_saves (DImode
, callee_offset
, R29_REGNUM
,
3678 insn
= emit_insn (gen_add3_insn (hard_frame_pointer_rtx
,
3680 GEN_INT (callee_offset
)));
3681 RTX_FRAME_RELATED_P (insn
) = 1;
3682 emit_insn (gen_stack_tie (stack_pointer_rtx
, hard_frame_pointer_rtx
));
3685 aarch64_save_callee_saves (DImode
, callee_offset
, R0_REGNUM
, R30_REGNUM
,
3686 callee_adjust
!= 0 || frame_pointer_needed
);
3687 aarch64_save_callee_saves (DFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
3688 callee_adjust
!= 0 || frame_pointer_needed
);
3689 aarch64_sub_sp (IP1_REGNUM
, final_adjust
, !frame_pointer_needed
);
3692 /* Return TRUE if we can use a simple_return insn.
3694 This function checks whether the callee saved stack is empty, which
3695 means no restore actions are need. The pro_and_epilogue will use
3696 this to check whether shrink-wrapping opt is feasible. */
3699 aarch64_use_return_insn_p (void)
3701 if (!reload_completed
)
3707 aarch64_layout_frame ();
3709 return cfun
->machine
->frame
.frame_size
== 0;
3712 /* Generate the epilogue instructions for returning from a function.
3713 This is almost exactly the reverse of the prolog sequence, except
3714 that we need to insert barriers to avoid scheduling loads that read
3715 from a deallocated stack, and we optimize the unwind records by
3716 emitting them all together if possible. */
3718 aarch64_expand_epilogue (bool for_sibcall
)
3720 aarch64_layout_frame ();
3722 HOST_WIDE_INT initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
3723 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
3724 HOST_WIDE_INT final_adjust
= cfun
->machine
->frame
.final_adjust
;
3725 HOST_WIDE_INT callee_offset
= cfun
->machine
->frame
.callee_offset
;
3726 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
3727 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
3731 /* We need to add memory barrier to prevent read from deallocated stack. */
3732 bool need_barrier_p
= (get_frame_size ()
3733 + cfun
->machine
->frame
.saved_varargs_size
) != 0;
3735 /* Emit a barrier to prevent loads from a deallocated stack. */
3736 if (final_adjust
> crtl
->outgoing_args_size
|| cfun
->calls_alloca
3737 || crtl
->calls_eh_return
)
3739 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
3740 need_barrier_p
= false;
3743 /* Restore the stack pointer from the frame pointer if it may not
3744 be the same as the stack pointer. */
3745 if (frame_pointer_needed
&& (final_adjust
|| cfun
->calls_alloca
))
3747 insn
= emit_insn (gen_add3_insn (stack_pointer_rtx
,
3748 hard_frame_pointer_rtx
,
3749 GEN_INT (-callee_offset
)));
3750 /* If writeback is used when restoring callee-saves, the CFA
3751 is restored on the instruction doing the writeback. */
3752 RTX_FRAME_RELATED_P (insn
) = callee_adjust
== 0;
3755 aarch64_add_sp (IP1_REGNUM
, final_adjust
, df_regs_ever_live_p (IP1_REGNUM
));
3757 aarch64_restore_callee_saves (DImode
, callee_offset
, R0_REGNUM
, R30_REGNUM
,
3758 callee_adjust
!= 0, &cfi_ops
);
3759 aarch64_restore_callee_saves (DFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
3760 callee_adjust
!= 0, &cfi_ops
);
3763 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
3765 if (callee_adjust
!= 0)
3766 aarch64_pop_regs (reg1
, reg2
, callee_adjust
, &cfi_ops
);
3768 if (callee_adjust
!= 0 || initial_adjust
> 65536)
3770 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3771 insn
= get_last_insn ();
3772 rtx new_cfa
= plus_constant (Pmode
, stack_pointer_rtx
, initial_adjust
);
3773 REG_NOTES (insn
) = alloc_reg_note (REG_CFA_DEF_CFA
, new_cfa
, cfi_ops
);
3774 RTX_FRAME_RELATED_P (insn
) = 1;
3778 aarch64_add_sp (IP0_REGNUM
, initial_adjust
, df_regs_ever_live_p (IP0_REGNUM
));
3782 /* Emit delayed restores and reset the CFA to be SP. */
3783 insn
= get_last_insn ();
3784 cfi_ops
= alloc_reg_note (REG_CFA_DEF_CFA
, stack_pointer_rtx
, cfi_ops
);
3785 REG_NOTES (insn
) = cfi_ops
;
3786 RTX_FRAME_RELATED_P (insn
) = 1;
3789 /* We prefer to emit the combined return/authenticate instruction RETAA,
3790 however there are three cases in which we must instead emit an explicit
3791 authentication instruction.
3793 1) Sibcalls don't return in a normal way, so if we're about to call one
3794 we must authenticate.
3796 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3797 generating code for !TARGET_ARMV8_3 we can't use it and must
3798 explicitly authenticate.
3800 3) On an eh_return path we make extra stack adjustments to update the
3801 canonical frame address to be the exception handler's CFA. We want
3802 to authenticate using the CFA of the function which calls eh_return.
3804 if (aarch64_return_address_signing_enabled ()
3805 && (for_sibcall
|| !TARGET_ARMV8_3
|| crtl
->calls_eh_return
))
3807 insn
= emit_insn (gen_autisp ());
3808 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
3809 RTX_FRAME_RELATED_P (insn
) = 1;
3812 /* Stack adjustment for exception handler. */
3813 if (crtl
->calls_eh_return
)
3815 /* We need to unwind the stack by the offset computed by
3816 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3817 to be SP; letting the CFA move during this adjustment
3818 is just as correct as retaining the CFA from the body
3819 of the function. Therefore, do nothing special. */
3820 emit_insn (gen_add2_insn (stack_pointer_rtx
, EH_RETURN_STACKADJ_RTX
));
3823 emit_use (gen_rtx_REG (DImode
, LR_REGNUM
));
3825 emit_jump_insn (ret_rtx
);
3828 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
3829 normally or return to a previous frame after unwinding.
3831 An EH return uses a single shared return sequence. The epilogue is
3832 exactly like a normal epilogue except that it has an extra input
3833 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3834 that must be applied after the frame has been destroyed. An extra label
3835 is inserted before the epilogue which initializes this register to zero,
3836 and this is the entry point for a normal return.
3838 An actual EH return updates the return address, initializes the stack
3839 adjustment and jumps directly into the epilogue (bypassing the zeroing
3840 of the adjustment). Since the return address is typically saved on the
3841 stack when a function makes a call, the saved LR must be updated outside
3844 This poses problems as the store is generated well before the epilogue,
3845 so the offset of LR is not known yet. Also optimizations will remove the
3846 store as it appears dead, even after the epilogue is generated (as the
3847 base or offset for loading LR is different in many cases).
3849 To avoid these problems this implementation forces the frame pointer
3850 in eh_return functions so that the location of LR is fixed and known early.
3851 It also marks the store volatile, so no optimization is permitted to
3852 remove the store. */
3854 aarch64_eh_return_handler_rtx (void)
3856 rtx tmp
= gen_frame_mem (Pmode
,
3857 plus_constant (Pmode
, hard_frame_pointer_rtx
, UNITS_PER_WORD
));
3859 /* Mark the store volatile, so no optimization is permitted to remove it. */
3860 MEM_VOLATILE_P (tmp
) = true;
3864 /* Output code to add DELTA to the first argument, and then jump
3865 to FUNCTION. Used for C++ multiple inheritance. */
3867 aarch64_output_mi_thunk (FILE *file
, tree thunk ATTRIBUTE_UNUSED
,
3868 HOST_WIDE_INT delta
,
3869 HOST_WIDE_INT vcall_offset
,
3872 /* The this pointer is always in x0. Note that this differs from
3873 Arm where the this pointer maybe bumped to r1 if r0 is required
3874 to return a pointer to an aggregate. On AArch64 a result value
3875 pointer will be in x8. */
3876 int this_regno
= R0_REGNUM
;
3877 rtx this_rtx
, temp0
, temp1
, addr
, funexp
;
3880 reload_completed
= 1;
3881 emit_note (NOTE_INSN_PROLOGUE_END
);
3883 if (vcall_offset
== 0)
3884 aarch64_add_constant (Pmode
, this_regno
, IP1_REGNUM
, delta
);
3887 gcc_assert ((vcall_offset
& (POINTER_BYTES
- 1)) == 0);
3889 this_rtx
= gen_rtx_REG (Pmode
, this_regno
);
3890 temp0
= gen_rtx_REG (Pmode
, IP0_REGNUM
);
3891 temp1
= gen_rtx_REG (Pmode
, IP1_REGNUM
);
3896 if (delta
>= -256 && delta
< 256)
3897 addr
= gen_rtx_PRE_MODIFY (Pmode
, this_rtx
,
3898 plus_constant (Pmode
, this_rtx
, delta
));
3900 aarch64_add_constant (Pmode
, this_regno
, IP1_REGNUM
, delta
);
3903 if (Pmode
== ptr_mode
)
3904 aarch64_emit_move (temp0
, gen_rtx_MEM (ptr_mode
, addr
));
3906 aarch64_emit_move (temp0
,
3907 gen_rtx_ZERO_EXTEND (Pmode
,
3908 gen_rtx_MEM (ptr_mode
, addr
)));
3910 if (vcall_offset
>= -256 && vcall_offset
< 4096 * POINTER_BYTES
)
3911 addr
= plus_constant (Pmode
, temp0
, vcall_offset
);
3914 aarch64_internal_mov_immediate (temp1
, GEN_INT (vcall_offset
), true,
3916 addr
= gen_rtx_PLUS (Pmode
, temp0
, temp1
);
3919 if (Pmode
== ptr_mode
)
3920 aarch64_emit_move (temp1
, gen_rtx_MEM (ptr_mode
,addr
));
3922 aarch64_emit_move (temp1
,
3923 gen_rtx_SIGN_EXTEND (Pmode
,
3924 gen_rtx_MEM (ptr_mode
, addr
)));
3926 emit_insn (gen_add2_insn (this_rtx
, temp1
));
3929 /* Generate a tail call to the target function. */
3930 if (!TREE_USED (function
))
3932 assemble_external (function
);
3933 TREE_USED (function
) = 1;
3935 funexp
= XEXP (DECL_RTL (function
), 0);
3936 funexp
= gen_rtx_MEM (FUNCTION_MODE
, funexp
);
3937 insn
= emit_call_insn (gen_sibcall (funexp
, const0_rtx
, NULL_RTX
));
3938 SIBLING_CALL_P (insn
) = 1;
3940 insn
= get_insns ();
3941 shorten_branches (insn
);
3942 final_start_function (insn
, file
, 1);
3943 final (insn
, file
, 1);
3944 final_end_function ();
3946 /* Stop pretending to be a post-reload pass. */
3947 reload_completed
= 0;
3951 aarch64_tls_referenced_p (rtx x
)
3953 if (!TARGET_HAVE_TLS
)
3955 subrtx_iterator::array_type array
;
3956 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
3958 const_rtx x
= *iter
;
3959 if (GET_CODE (x
) == SYMBOL_REF
&& SYMBOL_REF_TLS_MODEL (x
) != 0)
3961 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3962 TLS offsets, not real symbol references. */
3963 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
3964 iter
.skip_subrtxes ();
3970 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3971 a left shift of 0 or 12 bits. */
3973 aarch64_uimm12_shift (HOST_WIDE_INT val
)
3975 return ((val
& (((HOST_WIDE_INT
) 0xfff) << 0)) == val
3976 || (val
& (((HOST_WIDE_INT
) 0xfff) << 12)) == val
3981 /* Return true if val is an immediate that can be loaded into a
3982 register by a MOVZ instruction. */
3984 aarch64_movw_imm (HOST_WIDE_INT val
, machine_mode mode
)
3986 if (GET_MODE_SIZE (mode
) > 4)
3988 if ((val
& (((HOST_WIDE_INT
) 0xffff) << 32)) == val
3989 || (val
& (((HOST_WIDE_INT
) 0xffff) << 48)) == val
)
3994 /* Ignore sign extension. */
3995 val
&= (HOST_WIDE_INT
) 0xffffffff;
3997 return ((val
& (((HOST_WIDE_INT
) 0xffff) << 0)) == val
3998 || (val
& (((HOST_WIDE_INT
) 0xffff) << 16)) == val
);
4001 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
4003 static const unsigned HOST_WIDE_INT bitmask_imm_mul
[] =
4005 0x0000000100000001ull
,
4006 0x0001000100010001ull
,
4007 0x0101010101010101ull
,
4008 0x1111111111111111ull
,
4009 0x5555555555555555ull
,
4013 /* Return true if val is a valid bitmask immediate. */
4016 aarch64_bitmask_imm (HOST_WIDE_INT val_in
, machine_mode mode
)
4018 unsigned HOST_WIDE_INT val
, tmp
, mask
, first_one
, next_one
;
4021 /* Check for a single sequence of one bits and return quickly if so.
4022 The special cases of all ones and all zeroes returns false. */
4023 val
= (unsigned HOST_WIDE_INT
) val_in
;
4024 tmp
= val
+ (val
& -val
);
4026 if (tmp
== (tmp
& -tmp
))
4027 return (val
+ 1) > 1;
4029 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
4031 val
= (val
<< 32) | (val
& 0xffffffff);
4033 /* Invert if the immediate doesn't start with a zero bit - this means we
4034 only need to search for sequences of one bits. */
4038 /* Find the first set bit and set tmp to val with the first sequence of one
4039 bits removed. Return success if there is a single sequence of ones. */
4040 first_one
= val
& -val
;
4041 tmp
= val
& (val
+ first_one
);
4046 /* Find the next set bit and compute the difference in bit position. */
4047 next_one
= tmp
& -tmp
;
4048 bits
= clz_hwi (first_one
) - clz_hwi (next_one
);
4051 /* Check the bit position difference is a power of 2, and that the first
4052 sequence of one bits fits within 'bits' bits. */
4053 if ((mask
>> bits
) != 0 || bits
!= (bits
& -bits
))
4056 /* Check the sequence of one bits is repeated 64/bits times. */
4057 return val
== mask
* bitmask_imm_mul
[__builtin_clz (bits
) - 26];
4060 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
4061 Assumed precondition: VAL_IN Is not zero. */
4063 unsigned HOST_WIDE_INT
4064 aarch64_and_split_imm1 (HOST_WIDE_INT val_in
)
4066 int lowest_bit_set
= ctz_hwi (val_in
);
4067 int highest_bit_set
= floor_log2 (val_in
);
4068 gcc_assert (val_in
!= 0);
4070 return ((HOST_WIDE_INT_UC (2) << highest_bit_set
) -
4071 (HOST_WIDE_INT_1U
<< lowest_bit_set
));
4074 /* Create constant where bits outside of lowest bit set to highest bit set
4077 unsigned HOST_WIDE_INT
4078 aarch64_and_split_imm2 (HOST_WIDE_INT val_in
)
4080 return val_in
| ~aarch64_and_split_imm1 (val_in
);
4083 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
4086 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in
, machine_mode mode
)
4088 if (aarch64_bitmask_imm (val_in
, mode
))
4091 if (aarch64_move_imm (val_in
, mode
))
4094 unsigned HOST_WIDE_INT imm2
= aarch64_and_split_imm2 (val_in
);
4096 return aarch64_bitmask_imm (imm2
, mode
);
4099 /* Return true if val is an immediate that can be loaded into a
4100 register in a single instruction. */
4102 aarch64_move_imm (HOST_WIDE_INT val
, machine_mode mode
)
4104 if (aarch64_movw_imm (val
, mode
) || aarch64_movw_imm (~val
, mode
))
4106 return aarch64_bitmask_imm (val
, mode
);
4110 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED
, rtx x
)
4114 if (GET_CODE (x
) == HIGH
)
4117 split_const (x
, &base
, &offset
);
4118 if (GET_CODE (base
) == SYMBOL_REF
|| GET_CODE (base
) == LABEL_REF
)
4120 if (aarch64_classify_symbol (base
, offset
)
4121 != SYMBOL_FORCE_TO_MEM
)
4124 /* Avoid generating a 64-bit relocation in ILP32; leave
4125 to aarch64_expand_mov_immediate to handle it properly. */
4126 return mode
!= ptr_mode
;
4129 return aarch64_tls_referenced_p (x
);
4132 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4133 The expansion for a table switch is quite expensive due to the number
4134 of instructions, the table lookup and hard to predict indirect jump.
4135 When optimizing for speed, and -O3 enabled, use the per-core tuning if
4136 set, otherwise use tables for > 16 cases as a tradeoff between size and
4137 performance. When optimizing for size, use the default setting. */
4140 aarch64_case_values_threshold (void)
4142 /* Use the specified limit for the number of cases before using jump
4143 tables at higher optimization levels. */
4145 && selected_cpu
->tune
->max_case_values
!= 0)
4146 return selected_cpu
->tune
->max_case_values
;
4148 return optimize_size
? default_case_values_threshold () : 17;
4151 /* Return true if register REGNO is a valid index register.
4152 STRICT_P is true if REG_OK_STRICT is in effect. */
4155 aarch64_regno_ok_for_index_p (int regno
, bool strict_p
)
4157 if (!HARD_REGISTER_NUM_P (regno
))
4165 regno
= reg_renumber
[regno
];
4167 return GP_REGNUM_P (regno
);
4170 /* Return true if register REGNO is a valid base register for mode MODE.
4171 STRICT_P is true if REG_OK_STRICT is in effect. */
4174 aarch64_regno_ok_for_base_p (int regno
, bool strict_p
)
4176 if (!HARD_REGISTER_NUM_P (regno
))
4184 regno
= reg_renumber
[regno
];
4187 /* The fake registers will be eliminated to either the stack or
4188 hard frame pointer, both of which are usually valid base registers.
4189 Reload deals with the cases where the eliminated form isn't valid. */
4190 return (GP_REGNUM_P (regno
)
4191 || regno
== SP_REGNUM
4192 || regno
== FRAME_POINTER_REGNUM
4193 || regno
== ARG_POINTER_REGNUM
);
4196 /* Return true if X is a valid base register for mode MODE.
4197 STRICT_P is true if REG_OK_STRICT is in effect. */
4200 aarch64_base_register_rtx_p (rtx x
, bool strict_p
)
4202 if (!strict_p
&& GET_CODE (x
) == SUBREG
)
4205 return (REG_P (x
) && aarch64_regno_ok_for_base_p (REGNO (x
), strict_p
));
4208 /* Return true if address offset is a valid index. If it is, fill in INFO
4209 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
4212 aarch64_classify_index (struct aarch64_address_info
*info
, rtx x
,
4213 machine_mode mode
, bool strict_p
)
4215 enum aarch64_address_type type
;
4220 if ((REG_P (x
) || GET_CODE (x
) == SUBREG
)
4221 && GET_MODE (x
) == Pmode
)
4223 type
= ADDRESS_REG_REG
;
4227 /* (sign_extend:DI (reg:SI)) */
4228 else if ((GET_CODE (x
) == SIGN_EXTEND
4229 || GET_CODE (x
) == ZERO_EXTEND
)
4230 && GET_MODE (x
) == DImode
4231 && GET_MODE (XEXP (x
, 0)) == SImode
)
4233 type
= (GET_CODE (x
) == SIGN_EXTEND
)
4234 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
4235 index
= XEXP (x
, 0);
4238 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4239 else if (GET_CODE (x
) == MULT
4240 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
4241 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
4242 && GET_MODE (XEXP (x
, 0)) == DImode
4243 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
4244 && CONST_INT_P (XEXP (x
, 1)))
4246 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
4247 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
4248 index
= XEXP (XEXP (x
, 0), 0);
4249 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
4251 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4252 else if (GET_CODE (x
) == ASHIFT
4253 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
4254 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
4255 && GET_MODE (XEXP (x
, 0)) == DImode
4256 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
4257 && CONST_INT_P (XEXP (x
, 1)))
4259 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
4260 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
4261 index
= XEXP (XEXP (x
, 0), 0);
4262 shift
= INTVAL (XEXP (x
, 1));
4264 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4265 else if ((GET_CODE (x
) == SIGN_EXTRACT
4266 || GET_CODE (x
) == ZERO_EXTRACT
)
4267 && GET_MODE (x
) == DImode
4268 && GET_CODE (XEXP (x
, 0)) == MULT
4269 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
4270 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
4272 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
4273 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
4274 index
= XEXP (XEXP (x
, 0), 0);
4275 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
4276 if (INTVAL (XEXP (x
, 1)) != 32 + shift
4277 || INTVAL (XEXP (x
, 2)) != 0)
4280 /* (and:DI (mult:DI (reg:DI) (const_int scale))
4281 (const_int 0xffffffff<<shift)) */
4282 else if (GET_CODE (x
) == AND
4283 && GET_MODE (x
) == DImode
4284 && GET_CODE (XEXP (x
, 0)) == MULT
4285 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
4286 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
4287 && CONST_INT_P (XEXP (x
, 1)))
4289 type
= ADDRESS_REG_UXTW
;
4290 index
= XEXP (XEXP (x
, 0), 0);
4291 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
4292 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
4295 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4296 else if ((GET_CODE (x
) == SIGN_EXTRACT
4297 || GET_CODE (x
) == ZERO_EXTRACT
)
4298 && GET_MODE (x
) == DImode
4299 && GET_CODE (XEXP (x
, 0)) == ASHIFT
4300 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
4301 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
4303 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
4304 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
4305 index
= XEXP (XEXP (x
, 0), 0);
4306 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
4307 if (INTVAL (XEXP (x
, 1)) != 32 + shift
4308 || INTVAL (XEXP (x
, 2)) != 0)
4311 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4312 (const_int 0xffffffff<<shift)) */
4313 else if (GET_CODE (x
) == AND
4314 && GET_MODE (x
) == DImode
4315 && GET_CODE (XEXP (x
, 0)) == ASHIFT
4316 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
4317 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
4318 && CONST_INT_P (XEXP (x
, 1)))
4320 type
= ADDRESS_REG_UXTW
;
4321 index
= XEXP (XEXP (x
, 0), 0);
4322 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
4323 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
4326 /* (mult:P (reg:P) (const_int scale)) */
4327 else if (GET_CODE (x
) == MULT
4328 && GET_MODE (x
) == Pmode
4329 && GET_MODE (XEXP (x
, 0)) == Pmode
4330 && CONST_INT_P (XEXP (x
, 1)))
4332 type
= ADDRESS_REG_REG
;
4333 index
= XEXP (x
, 0);
4334 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
4336 /* (ashift:P (reg:P) (const_int shift)) */
4337 else if (GET_CODE (x
) == ASHIFT
4338 && GET_MODE (x
) == Pmode
4339 && GET_MODE (XEXP (x
, 0)) == Pmode
4340 && CONST_INT_P (XEXP (x
, 1)))
4342 type
= ADDRESS_REG_REG
;
4343 index
= XEXP (x
, 0);
4344 shift
= INTVAL (XEXP (x
, 1));
4349 if (GET_CODE (index
) == SUBREG
)
4350 index
= SUBREG_REG (index
);
4353 (shift
> 0 && shift
<= 3
4354 && (1 << shift
) == GET_MODE_SIZE (mode
)))
4356 && aarch64_regno_ok_for_index_p (REGNO (index
), strict_p
))
4359 info
->offset
= index
;
4360 info
->shift
= shift
;
4367 /* Return true if MODE is one of the modes for which we
4368 support LDP/STP operations. */
4371 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode
)
4373 return mode
== SImode
|| mode
== DImode
4374 || mode
== SFmode
|| mode
== DFmode
4375 || (aarch64_vector_mode_supported_p (mode
)
4376 && GET_MODE_SIZE (mode
) == 8);
4379 /* Return true if REGNO is a virtual pointer register, or an eliminable
4380 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
4381 include stack_pointer or hard_frame_pointer. */
4383 virt_or_elim_regno_p (unsigned regno
)
4385 return ((regno
>= FIRST_VIRTUAL_REGISTER
4386 && regno
<= LAST_VIRTUAL_POINTER_REGISTER
)
4387 || regno
== FRAME_POINTER_REGNUM
4388 || regno
== ARG_POINTER_REGNUM
);
4391 /* Return true if X is a valid address for machine mode MODE. If it is,
4392 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
4393 effect. OUTER_CODE is PARALLEL for a load/store pair. */
4396 aarch64_classify_address (struct aarch64_address_info
*info
,
4397 rtx x
, machine_mode mode
,
4398 RTX_CODE outer_code
, bool strict_p
)
4400 enum rtx_code code
= GET_CODE (x
);
4403 /* On BE, we use load/store pair for all large int mode load/stores.
4404 TI/TFmode may also use a load/store pair. */
4405 bool load_store_pair_p
= (outer_code
== PARALLEL
4408 || (BYTES_BIG_ENDIAN
4409 && aarch64_vect_struct_mode_p (mode
)));
4411 bool allow_reg_index_p
=
4413 && (GET_MODE_SIZE (mode
) != 16 || aarch64_vector_mode_supported_p (mode
))
4414 && !aarch64_vect_struct_mode_p (mode
);
4416 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4418 if (aarch64_vect_struct_mode_p (mode
) && !BYTES_BIG_ENDIAN
4419 && (code
!= POST_INC
&& code
!= REG
))
4426 info
->type
= ADDRESS_REG_IMM
;
4428 info
->offset
= const0_rtx
;
4429 return aarch64_base_register_rtx_p (x
, strict_p
);
4437 && virt_or_elim_regno_p (REGNO (op0
))
4438 && CONST_INT_P (op1
))
4440 info
->type
= ADDRESS_REG_IMM
;
4447 if (GET_MODE_SIZE (mode
) != 0
4448 && CONST_INT_P (op1
)
4449 && aarch64_base_register_rtx_p (op0
, strict_p
))
4451 HOST_WIDE_INT offset
= INTVAL (op1
);
4453 info
->type
= ADDRESS_REG_IMM
;
4457 /* TImode and TFmode values are allowed in both pairs of X
4458 registers and individual Q registers. The available
4460 X,X: 7-bit signed scaled offset
4461 Q: 9-bit signed offset
4462 We conservatively require an offset representable in either mode.
4463 When performing the check for pairs of X registers i.e. LDP/STP
4464 pass down DImode since that is the natural size of the LDP/STP
4465 instruction memory accesses. */
4466 if (mode
== TImode
|| mode
== TFmode
)
4467 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
4468 && (offset_9bit_signed_unscaled_p (mode
, offset
)
4469 || offset_12bit_unsigned_scaled_p (mode
, offset
)));
4471 /* A 7bit offset check because OImode will emit a ldp/stp
4472 instruction (only big endian will get here).
4473 For ldp/stp instructions, the offset is scaled for the size of a
4474 single element of the pair. */
4476 return aarch64_offset_7bit_signed_scaled_p (TImode
, offset
);
4478 /* Three 9/12 bit offsets checks because CImode will emit three
4479 ldr/str instructions (only big endian will get here). */
4481 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
4482 && (offset_9bit_signed_unscaled_p (V16QImode
, offset
+ 32)
4483 || offset_12bit_unsigned_scaled_p (V16QImode
,
4486 /* Two 7bit offsets checks because XImode will emit two ldp/stp
4487 instructions (only big endian will get here). */
4489 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
4490 && aarch64_offset_7bit_signed_scaled_p (TImode
,
4493 if (load_store_pair_p
)
4494 return ((GET_MODE_SIZE (mode
) == 4 || GET_MODE_SIZE (mode
) == 8)
4495 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
4497 return (offset_9bit_signed_unscaled_p (mode
, offset
)
4498 || offset_12bit_unsigned_scaled_p (mode
, offset
));
4501 if (allow_reg_index_p
)
4503 /* Look for base + (scaled/extended) index register. */
4504 if (aarch64_base_register_rtx_p (op0
, strict_p
)
4505 && aarch64_classify_index (info
, op1
, mode
, strict_p
))
4510 if (aarch64_base_register_rtx_p (op1
, strict_p
)
4511 && aarch64_classify_index (info
, op0
, mode
, strict_p
))
4524 info
->type
= ADDRESS_REG_WB
;
4525 info
->base
= XEXP (x
, 0);
4526 info
->offset
= NULL_RTX
;
4527 return aarch64_base_register_rtx_p (info
->base
, strict_p
);
4531 info
->type
= ADDRESS_REG_WB
;
4532 info
->base
= XEXP (x
, 0);
4533 if (GET_CODE (XEXP (x
, 1)) == PLUS
4534 && CONST_INT_P (XEXP (XEXP (x
, 1), 1))
4535 && rtx_equal_p (XEXP (XEXP (x
, 1), 0), info
->base
)
4536 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
4538 HOST_WIDE_INT offset
;
4539 info
->offset
= XEXP (XEXP (x
, 1), 1);
4540 offset
= INTVAL (info
->offset
);
4542 /* TImode and TFmode values are allowed in both pairs of X
4543 registers and individual Q registers. The available
4545 X,X: 7-bit signed scaled offset
4546 Q: 9-bit signed offset
4547 We conservatively require an offset representable in either mode.
4549 if (mode
== TImode
|| mode
== TFmode
)
4550 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
4551 && offset_9bit_signed_unscaled_p (mode
, offset
));
4553 if (load_store_pair_p
)
4554 return ((GET_MODE_SIZE (mode
) == 4 || GET_MODE_SIZE (mode
) == 8)
4555 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
4557 return offset_9bit_signed_unscaled_p (mode
, offset
);
4564 /* load literal: pc-relative constant pool entry. Only supported
4565 for SI mode or larger. */
4566 info
->type
= ADDRESS_SYMBOLIC
;
4568 if (!load_store_pair_p
&& GET_MODE_SIZE (mode
) >= 4)
4572 split_const (x
, &sym
, &addend
);
4573 return ((GET_CODE (sym
) == LABEL_REF
4574 || (GET_CODE (sym
) == SYMBOL_REF
4575 && CONSTANT_POOL_ADDRESS_P (sym
)
4576 && aarch64_pcrelative_literal_loads
)));
4581 info
->type
= ADDRESS_LO_SUM
;
4582 info
->base
= XEXP (x
, 0);
4583 info
->offset
= XEXP (x
, 1);
4584 if (allow_reg_index_p
4585 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
4588 split_const (info
->offset
, &sym
, &offs
);
4589 if (GET_CODE (sym
) == SYMBOL_REF
4590 && (aarch64_classify_symbol (sym
, offs
) == SYMBOL_SMALL_ABSOLUTE
))
4592 /* The symbol and offset must be aligned to the access size. */
4594 unsigned int ref_size
;
4596 if (CONSTANT_POOL_ADDRESS_P (sym
))
4597 align
= GET_MODE_ALIGNMENT (get_pool_mode (sym
));
4598 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym
))
4600 tree exp
= SYMBOL_REF_DECL (sym
);
4601 align
= TYPE_ALIGN (TREE_TYPE (exp
));
4602 align
= CONSTANT_ALIGNMENT (exp
, align
);
4604 else if (SYMBOL_REF_DECL (sym
))
4605 align
= DECL_ALIGN (SYMBOL_REF_DECL (sym
));
4606 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym
)
4607 && SYMBOL_REF_BLOCK (sym
) != NULL
)
4608 align
= SYMBOL_REF_BLOCK (sym
)->alignment
;
4610 align
= BITS_PER_UNIT
;
4612 ref_size
= GET_MODE_SIZE (mode
);
4614 ref_size
= GET_MODE_SIZE (DImode
);
4616 return ((INTVAL (offs
) & (ref_size
- 1)) == 0
4617 && ((align
/ BITS_PER_UNIT
) & (ref_size
- 1)) == 0);
4627 /* Return true if the address X is valid for a PRFM instruction.
4628 STRICT_P is true if we should do strict checking with
4629 aarch64_classify_address. */
4632 aarch64_address_valid_for_prefetch_p (rtx x
, bool strict_p
)
4634 struct aarch64_address_info addr
;
4636 /* PRFM accepts the same addresses as DImode... */
4637 bool res
= aarch64_classify_address (&addr
, x
, DImode
, MEM
, strict_p
);
4641 /* ... except writeback forms. */
4642 return addr
.type
!= ADDRESS_REG_WB
;
4646 aarch64_symbolic_address_p (rtx x
)
4650 split_const (x
, &x
, &offset
);
4651 return GET_CODE (x
) == SYMBOL_REF
|| GET_CODE (x
) == LABEL_REF
;
4654 /* Classify the base of symbolic expression X. */
4656 enum aarch64_symbol_type
4657 aarch64_classify_symbolic_expression (rtx x
)
4661 split_const (x
, &x
, &offset
);
4662 return aarch64_classify_symbol (x
, offset
);
4666 /* Return TRUE if X is a legitimate address for accessing memory in
4669 aarch64_legitimate_address_hook_p (machine_mode mode
, rtx x
, bool strict_p
)
4671 struct aarch64_address_info addr
;
4673 return aarch64_classify_address (&addr
, x
, mode
, MEM
, strict_p
);
4676 /* Return TRUE if X is a legitimate address for accessing memory in
4677 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4680 aarch64_legitimate_address_p (machine_mode mode
, rtx x
,
4681 RTX_CODE outer_code
, bool strict_p
)
4683 struct aarch64_address_info addr
;
4685 return aarch64_classify_address (&addr
, x
, mode
, outer_code
, strict_p
);
4688 /* Split an out-of-range address displacement into a base and offset.
4689 Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4690 to increase opportunities for sharing the base address of different sizes.
4691 For unaligned accesses and TI/TF mode use the signed 9-bit range. */
4693 aarch64_legitimize_address_displacement (rtx
*disp
, rtx
*off
, machine_mode mode
)
4695 HOST_WIDE_INT offset
= INTVAL (*disp
);
4696 HOST_WIDE_INT base
= offset
& ~(GET_MODE_SIZE (mode
) < 4 ? 0xfff : 0x3ffc);
4698 if (mode
== TImode
|| mode
== TFmode
4699 || (offset
& (GET_MODE_SIZE (mode
) - 1)) != 0)
4700 base
= (offset
+ 0x100) & ~0x1ff;
4702 *off
= GEN_INT (base
);
4703 *disp
= GEN_INT (offset
- base
);
4707 /* Return the binary representation of floating point constant VALUE in INTVAL.
4708 If the value cannot be converted, return false without setting INTVAL.
4709 The conversion is done in the given MODE. */
4711 aarch64_reinterpret_float_as_int (rtx value
, unsigned HOST_WIDE_INT
*intval
)
4714 /* We make a general exception for 0. */
4715 if (aarch64_float_const_zero_rtx_p (value
))
4721 machine_mode mode
= GET_MODE (value
);
4722 if (GET_CODE (value
) != CONST_DOUBLE
4723 || !SCALAR_FLOAT_MODE_P (mode
)
4724 || GET_MODE_BITSIZE (mode
) > HOST_BITS_PER_WIDE_INT
4725 /* Only support up to DF mode. */
4726 || GET_MODE_BITSIZE (mode
) > GET_MODE_BITSIZE (DFmode
))
4729 unsigned HOST_WIDE_INT ival
= 0;
4732 real_to_target (res
,
4733 CONST_DOUBLE_REAL_VALUE (value
),
4734 REAL_MODE_FORMAT (mode
));
4736 ival
= zext_hwi (res
[0], 32);
4737 if (GET_MODE_BITSIZE (mode
) == GET_MODE_BITSIZE (DFmode
))
4738 ival
|= (zext_hwi (res
[1], 32) << 32);
4744 /* Return TRUE if rtx X is an immediate constant that can be moved using a
4745 single MOV(+MOVK) followed by an FMOV. */
4747 aarch64_float_const_rtx_p (rtx x
)
4749 machine_mode mode
= GET_MODE (x
);
4750 if (mode
== VOIDmode
)
4753 /* Determine whether it's cheaper to write float constants as
4754 mov/movk pairs over ldr/adrp pairs. */
4755 unsigned HOST_WIDE_INT ival
;
4757 if (GET_CODE (x
) == CONST_DOUBLE
4758 && SCALAR_FLOAT_MODE_P (mode
)
4759 && aarch64_reinterpret_float_as_int (x
, &ival
))
4761 machine_mode imode
= mode
== HFmode
? SImode
: int_mode_for_mode (mode
);
4762 int num_instr
= aarch64_internal_mov_immediate
4763 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
4764 return num_instr
< 3;
4770 /* Return TRUE if rtx X is immediate constant 0.0 */
4772 aarch64_float_const_zero_rtx_p (rtx x
)
4774 if (GET_MODE (x
) == VOIDmode
)
4777 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x
)))
4778 return !HONOR_SIGNED_ZEROS (GET_MODE (x
));
4779 return real_equal (CONST_DOUBLE_REAL_VALUE (x
), &dconst0
);
4782 /* Return TRUE if rtx X is immediate constant that fits in a single
4783 MOVI immediate operation. */
4785 aarch64_can_const_movi_rtx_p (rtx x
, machine_mode mode
)
4790 machine_mode vmode
, imode
;
4791 unsigned HOST_WIDE_INT ival
;
4793 if (GET_CODE (x
) == CONST_DOUBLE
4794 && SCALAR_FLOAT_MODE_P (mode
))
4796 if (!aarch64_reinterpret_float_as_int (x
, &ival
))
4799 /* We make a general exception for 0. */
4800 if (aarch64_float_const_zero_rtx_p (x
))
4803 imode
= int_mode_for_mode (mode
);
4805 else if (GET_CODE (x
) == CONST_INT
4806 && SCALAR_INT_MODE_P (mode
))
4814 /* use a 64 bit mode for everything except for DI/DF mode, where we use
4815 a 128 bit vector mode. */
4816 int width
= GET_MODE_BITSIZE (mode
) == 64 ? 128 : 64;
4818 vmode
= aarch64_simd_container_mode (imode
, width
);
4819 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, ival
);
4821 return aarch64_simd_valid_immediate (v_op
, vmode
, false, NULL
);
4825 /* Return the fixed registers used for condition codes. */
4828 aarch64_fixed_condition_code_regs (unsigned int *p1
, unsigned int *p2
)
4831 *p2
= INVALID_REGNUM
;
4835 /* This function is used by the call expanders of the machine description.
4836 RESULT is the register in which the result is returned. It's NULL for
4837 "call" and "sibcall".
4838 MEM is the location of the function call.
4839 SIBCALL indicates whether this function call is normal call or sibling call.
4840 It will generate different pattern accordingly. */
4843 aarch64_expand_call (rtx result
, rtx mem
, bool sibcall
)
4845 rtx call
, callee
, tmp
;
4849 gcc_assert (MEM_P (mem
));
4850 callee
= XEXP (mem
, 0);
4851 mode
= GET_MODE (callee
);
4852 gcc_assert (mode
== Pmode
);
4854 /* Decide if we should generate indirect calls by loading the
4855 address of the callee into a register before performing
4856 the branch-and-link. */
4857 if (SYMBOL_REF_P (callee
)
4858 ? (aarch64_is_long_call_p (callee
)
4859 || aarch64_is_noplt_call_p (callee
))
4861 XEXP (mem
, 0) = force_reg (mode
, callee
);
4863 call
= gen_rtx_CALL (VOIDmode
, mem
, const0_rtx
);
4865 if (result
!= NULL_RTX
)
4866 call
= gen_rtx_SET (result
, call
);
4871 tmp
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (Pmode
, LR_REGNUM
));
4873 vec
= gen_rtvec (2, call
, tmp
);
4874 call
= gen_rtx_PARALLEL (VOIDmode
, vec
);
4876 aarch64_emit_call_insn (call
);
4879 /* Emit call insn with PAT and do aarch64-specific handling. */
4882 aarch64_emit_call_insn (rtx pat
)
4884 rtx insn
= emit_call_insn (pat
);
4886 rtx
*fusage
= &CALL_INSN_FUNCTION_USAGE (insn
);
4887 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP0_REGNUM
));
4888 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP1_REGNUM
));
4892 aarch64_select_cc_mode (RTX_CODE code
, rtx x
, rtx y
)
4894 /* All floating point compares return CCFP if it is an equality
4895 comparison, and CCFPE otherwise. */
4896 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_FLOAT
)
4923 /* Equality comparisons of short modes against zero can be performed
4924 using the TST instruction with the appropriate bitmask. */
4925 if (y
== const0_rtx
&& REG_P (x
)
4926 && (code
== EQ
|| code
== NE
)
4927 && (GET_MODE (x
) == HImode
|| GET_MODE (x
) == QImode
))
4930 /* Similarly, comparisons of zero_extends from shorter modes can
4931 be performed using an ANDS with an immediate mask. */
4932 if (y
== const0_rtx
&& GET_CODE (x
) == ZERO_EXTEND
4933 && (GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
4934 && (GET_MODE (XEXP (x
, 0)) == HImode
|| GET_MODE (XEXP (x
, 0)) == QImode
)
4935 && (code
== EQ
|| code
== NE
))
4938 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
4940 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
)
4941 && (GET_CODE (x
) == PLUS
|| GET_CODE (x
) == MINUS
|| GET_CODE (x
) == AND
4942 || GET_CODE (x
) == NEG
4943 || (GET_CODE (x
) == ZERO_EXTRACT
&& CONST_INT_P (XEXP (x
, 1))
4944 && CONST_INT_P (XEXP (x
, 2)))))
4947 /* A compare with a shifted operand. Because of canonicalization,
4948 the comparison will have to be swapped when we emit the assembly
4950 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
4951 && (REG_P (y
) || GET_CODE (y
) == SUBREG
|| y
== const0_rtx
)
4952 && (GET_CODE (x
) == ASHIFT
|| GET_CODE (x
) == ASHIFTRT
4953 || GET_CODE (x
) == LSHIFTRT
4954 || GET_CODE (x
) == ZERO_EXTEND
|| GET_CODE (x
) == SIGN_EXTEND
))
4957 /* Similarly for a negated operand, but we can only do this for
4959 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
4960 && (REG_P (y
) || GET_CODE (y
) == SUBREG
)
4961 && (code
== EQ
|| code
== NE
)
4962 && GET_CODE (x
) == NEG
)
4965 /* A test for unsigned overflow. */
4966 if ((GET_MODE (x
) == DImode
|| GET_MODE (x
) == TImode
)
4968 && GET_CODE (x
) == PLUS
4969 && GET_CODE (y
) == ZERO_EXTEND
)
4972 /* For everything else, return CCmode. */
4977 aarch64_get_condition_code_1 (machine_mode
, enum rtx_code
);
4980 aarch64_get_condition_code (rtx x
)
4982 machine_mode mode
= GET_MODE (XEXP (x
, 0));
4983 enum rtx_code comp_code
= GET_CODE (x
);
4985 if (GET_MODE_CLASS (mode
) != MODE_CC
)
4986 mode
= SELECT_CC_MODE (comp_code
, XEXP (x
, 0), XEXP (x
, 1));
4987 return aarch64_get_condition_code_1 (mode
, comp_code
);
4991 aarch64_get_condition_code_1 (machine_mode mode
, enum rtx_code comp_code
)
4999 case GE
: return AARCH64_GE
;
5000 case GT
: return AARCH64_GT
;
5001 case LE
: return AARCH64_LS
;
5002 case LT
: return AARCH64_MI
;
5003 case NE
: return AARCH64_NE
;
5004 case EQ
: return AARCH64_EQ
;
5005 case ORDERED
: return AARCH64_VC
;
5006 case UNORDERED
: return AARCH64_VS
;
5007 case UNLT
: return AARCH64_LT
;
5008 case UNLE
: return AARCH64_LE
;
5009 case UNGT
: return AARCH64_HI
;
5010 case UNGE
: return AARCH64_PL
;
5018 case NE
: return AARCH64_NE
;
5019 case EQ
: return AARCH64_EQ
;
5020 case GE
: return AARCH64_GE
;
5021 case GT
: return AARCH64_GT
;
5022 case LE
: return AARCH64_LE
;
5023 case LT
: return AARCH64_LT
;
5024 case GEU
: return AARCH64_CS
;
5025 case GTU
: return AARCH64_HI
;
5026 case LEU
: return AARCH64_LS
;
5027 case LTU
: return AARCH64_CC
;
5035 case NE
: return AARCH64_NE
;
5036 case EQ
: return AARCH64_EQ
;
5037 case GE
: return AARCH64_LE
;
5038 case GT
: return AARCH64_LT
;
5039 case LE
: return AARCH64_GE
;
5040 case LT
: return AARCH64_GT
;
5041 case GEU
: return AARCH64_LS
;
5042 case GTU
: return AARCH64_CC
;
5043 case LEU
: return AARCH64_CS
;
5044 case LTU
: return AARCH64_HI
;
5052 case NE
: return AARCH64_NE
;
5053 case EQ
: return AARCH64_EQ
;
5054 case GE
: return AARCH64_PL
;
5055 case LT
: return AARCH64_MI
;
5063 case NE
: return AARCH64_NE
;
5064 case EQ
: return AARCH64_EQ
;
5072 case NE
: return AARCH64_CS
;
5073 case EQ
: return AARCH64_CC
;
5086 aarch64_const_vec_all_same_in_range_p (rtx x
,
5087 HOST_WIDE_INT minval
,
5088 HOST_WIDE_INT maxval
)
5090 HOST_WIDE_INT firstval
;
5093 if (GET_CODE (x
) != CONST_VECTOR
5094 || GET_MODE_CLASS (GET_MODE (x
)) != MODE_VECTOR_INT
)
5097 firstval
= INTVAL (CONST_VECTOR_ELT (x
, 0));
5098 if (firstval
< minval
|| firstval
> maxval
)
5101 count
= CONST_VECTOR_NUNITS (x
);
5102 for (i
= 1; i
< count
; i
++)
5103 if (INTVAL (CONST_VECTOR_ELT (x
, i
)) != firstval
)
5110 aarch64_const_vec_all_same_int_p (rtx x
, HOST_WIDE_INT val
)
5112 return aarch64_const_vec_all_same_in_range_p (x
, val
, val
);
5117 #define AARCH64_CC_V 1
5118 #define AARCH64_CC_C (1 << 1)
5119 #define AARCH64_CC_Z (1 << 2)
5120 #define AARCH64_CC_N (1 << 3)
5122 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
5123 static const int aarch64_nzcv_codes
[] =
5125 0, /* EQ, Z == 1. */
5126 AARCH64_CC_Z
, /* NE, Z == 0. */
5127 0, /* CS, C == 1. */
5128 AARCH64_CC_C
, /* CC, C == 0. */
5129 0, /* MI, N == 1. */
5130 AARCH64_CC_N
, /* PL, N == 0. */
5131 0, /* VS, V == 1. */
5132 AARCH64_CC_V
, /* VC, V == 0. */
5133 0, /* HI, C ==1 && Z == 0. */
5134 AARCH64_CC_C
, /* LS, !(C == 1 && Z == 0). */
5135 AARCH64_CC_V
, /* GE, N == V. */
5136 0, /* LT, N != V. */
5137 AARCH64_CC_Z
, /* GT, Z == 0 && N == V. */
5138 0, /* LE, !(Z == 0 && N == V). */
5143 /* Print operand X to file F in a target specific manner according to CODE.
5144 The acceptable formatting commands given by CODE are:
5145 'c': An integer or symbol address without a preceding #
5147 'e': Print the sign/zero-extend size as a character 8->b,
5149 'p': Prints N such that 2^N == X (X must be power of 2 and
5151 'P': Print the number of non-zero bits in X (a const_int).
5152 'H': Print the higher numbered register of a pair (TImode)
5154 'm': Print a condition (eq, ne, etc).
5155 'M': Same as 'm', but invert condition.
5156 'b/h/s/d/q': Print a scalar FP/SIMD register name.
5157 'S/T/U/V': Print a FP/SIMD register name for a register list.
5158 The register printed is the FP/SIMD register name
5159 of X + 0/1/2/3 for S/T/U/V.
5160 'R': Print a scalar FP/SIMD register name + 1.
5161 'X': Print bottom 16 bits of integer constant in hex.
5162 'w/x': Print a general register name or the zero register
5164 '0': Print a normal operand, if it's a general register,
5165 then we assume DImode.
5166 'k': Print NZCV for conditional compare instructions.
5167 'A': Output address constant representing the first
5168 argument of X, specifying a relocation offset
5170 'L': Output constant address specified by X
5171 with a relocation offset if appropriate.
5172 'G': Prints address of X, specifying a PC relative
5173 relocation mode if appropriate. */
5176 aarch64_print_operand (FILE *f
, rtx x
, int code
)
5181 switch (GET_CODE (x
))
5184 fprintf (f
, HOST_WIDE_INT_PRINT_DEC
, INTVAL (x
));
5188 output_addr_const (f
, x
);
5192 if (GET_CODE (XEXP (x
, 0)) == PLUS
5193 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
)
5195 output_addr_const (f
, x
);
5201 output_operand_lossage ("Unsupported operand for code '%c'", code
);
5209 if (!CONST_INT_P (x
)
5210 || (n
= exact_log2 (INTVAL (x
) & ~7)) <= 0)
5212 output_operand_lossage ("invalid operand for '%%%c'", code
);
5228 output_operand_lossage ("invalid operand for '%%%c'", code
);
5238 if (!CONST_INT_P (x
) || (n
= exact_log2 (INTVAL (x
))) < 0)
5240 output_operand_lossage ("invalid operand for '%%%c'", code
);
5244 asm_fprintf (f
, "%d", n
);
5249 if (!CONST_INT_P (x
))
5251 output_operand_lossage ("invalid operand for '%%%c'", code
);
5255 asm_fprintf (f
, "%u", popcount_hwi (INTVAL (x
)));
5259 if (!REG_P (x
) || !GP_REGNUM_P (REGNO (x
) + 1))
5261 output_operand_lossage ("invalid operand for '%%%c'", code
);
5265 asm_fprintf (f
, "%s", reg_names
[REGNO (x
) + 1]);
5272 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
5273 if (x
== const_true_rtx
)
5280 if (!COMPARISON_P (x
))
5282 output_operand_lossage ("invalid operand for '%%%c'", code
);
5286 cond_code
= aarch64_get_condition_code (x
);
5287 gcc_assert (cond_code
>= 0);
5289 cond_code
= AARCH64_INVERSE_CONDITION_CODE (cond_code
);
5290 fputs (aarch64_condition_codes
[cond_code
], f
);
5299 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
5301 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
5304 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - V0_REGNUM
);
5311 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
5313 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
5316 asm_fprintf (f
, "v%d", REGNO (x
) - V0_REGNUM
+ (code
- 'S'));
5320 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
5322 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
5325 asm_fprintf (f
, "q%d", REGNO (x
) - V0_REGNUM
+ 1);
5329 if (!CONST_INT_P (x
))
5331 output_operand_lossage ("invalid operand for '%%%c'", code
);
5334 asm_fprintf (f
, "0x%wx", UINTVAL (x
) & 0xffff);
5340 || (CONST_DOUBLE_P (x
) && aarch64_float_const_zero_rtx_p (x
)))
5342 asm_fprintf (f
, "%czr", code
);
5346 if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
5348 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - R0_REGNUM
);
5352 if (REG_P (x
) && REGNO (x
) == SP_REGNUM
)
5354 asm_fprintf (f
, "%ssp", code
== 'w' ? "w" : "");
5363 output_operand_lossage ("missing operand");
5367 switch (GET_CODE (x
))
5370 asm_fprintf (f
, "%s", reg_names
[REGNO (x
)]);
5374 output_address (GET_MODE (x
), XEXP (x
, 0));
5375 /* Check all memory references are Pmode - even with ILP32. */
5376 gcc_assert (GET_MODE (XEXP (x
, 0)) == Pmode
);
5382 output_addr_const (asm_out_file
, x
);
5386 asm_fprintf (f
, "%wd", INTVAL (x
));
5390 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
5393 aarch64_const_vec_all_same_in_range_p (x
,
5395 HOST_WIDE_INT_MAX
));
5396 asm_fprintf (f
, "%wd", INTVAL (CONST_VECTOR_ELT (x
, 0)));
5398 else if (aarch64_simd_imm_zero_p (x
, GET_MODE (x
)))
5407 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5408 be getting CONST_DOUBLEs holding integers. */
5409 gcc_assert (GET_MODE (x
) != VOIDmode
);
5410 if (aarch64_float_const_zero_rtx_p (x
))
5415 else if (aarch64_float_const_representable_p (x
))
5418 char float_buf
[buf_size
] = {'\0'};
5419 real_to_decimal_for_mode (float_buf
,
5420 CONST_DOUBLE_REAL_VALUE (x
),
5423 asm_fprintf (asm_out_file
, "%s", float_buf
);
5427 output_operand_lossage ("invalid constant");
5430 output_operand_lossage ("invalid operand");
5436 if (GET_CODE (x
) == HIGH
)
5439 switch (aarch64_classify_symbolic_expression (x
))
5441 case SYMBOL_SMALL_GOT_4G
:
5442 asm_fprintf (asm_out_file
, ":got:");
5445 case SYMBOL_SMALL_TLSGD
:
5446 asm_fprintf (asm_out_file
, ":tlsgd:");
5449 case SYMBOL_SMALL_TLSDESC
:
5450 asm_fprintf (asm_out_file
, ":tlsdesc:");
5453 case SYMBOL_SMALL_TLSIE
:
5454 asm_fprintf (asm_out_file
, ":gottprel:");
5457 case SYMBOL_TLSLE24
:
5458 asm_fprintf (asm_out_file
, ":tprel:");
5461 case SYMBOL_TINY_GOT
:
5468 output_addr_const (asm_out_file
, x
);
5472 switch (aarch64_classify_symbolic_expression (x
))
5474 case SYMBOL_SMALL_GOT_4G
:
5475 asm_fprintf (asm_out_file
, ":lo12:");
5478 case SYMBOL_SMALL_TLSGD
:
5479 asm_fprintf (asm_out_file
, ":tlsgd_lo12:");
5482 case SYMBOL_SMALL_TLSDESC
:
5483 asm_fprintf (asm_out_file
, ":tlsdesc_lo12:");
5486 case SYMBOL_SMALL_TLSIE
:
5487 asm_fprintf (asm_out_file
, ":gottprel_lo12:");
5490 case SYMBOL_TLSLE12
:
5491 asm_fprintf (asm_out_file
, ":tprel_lo12:");
5494 case SYMBOL_TLSLE24
:
5495 asm_fprintf (asm_out_file
, ":tprel_lo12_nc:");
5498 case SYMBOL_TINY_GOT
:
5499 asm_fprintf (asm_out_file
, ":got:");
5502 case SYMBOL_TINY_TLSIE
:
5503 asm_fprintf (asm_out_file
, ":gottprel:");
5509 output_addr_const (asm_out_file
, x
);
5513 switch (aarch64_classify_symbolic_expression (x
))
5515 case SYMBOL_TLSLE24
:
5516 asm_fprintf (asm_out_file
, ":tprel_hi12:");
5521 output_addr_const (asm_out_file
, x
);
5526 HOST_WIDE_INT cond_code
;
5528 if (!CONST_INT_P (x
))
5530 output_operand_lossage ("invalid operand for '%%%c'", code
);
5534 cond_code
= INTVAL (x
);
5535 gcc_assert (cond_code
>= 0 && cond_code
<= AARCH64_NV
);
5536 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
]);
5541 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
5547 aarch64_print_operand_address (FILE *f
, machine_mode mode
, rtx x
)
5549 struct aarch64_address_info addr
;
5551 if (aarch64_classify_address (&addr
, x
, mode
, MEM
, true))
5554 case ADDRESS_REG_IMM
:
5555 if (addr
.offset
== const0_rtx
)
5556 asm_fprintf (f
, "[%s]", reg_names
[REGNO (addr
.base
)]);
5558 asm_fprintf (f
, "[%s, %wd]", reg_names
[REGNO (addr
.base
)],
5559 INTVAL (addr
.offset
));
5562 case ADDRESS_REG_REG
:
5563 if (addr
.shift
== 0)
5564 asm_fprintf (f
, "[%s, %s]", reg_names
[REGNO (addr
.base
)],
5565 reg_names
[REGNO (addr
.offset
)]);
5567 asm_fprintf (f
, "[%s, %s, lsl %u]", reg_names
[REGNO (addr
.base
)],
5568 reg_names
[REGNO (addr
.offset
)], addr
.shift
);
5571 case ADDRESS_REG_UXTW
:
5572 if (addr
.shift
== 0)
5573 asm_fprintf (f
, "[%s, w%d, uxtw]", reg_names
[REGNO (addr
.base
)],
5574 REGNO (addr
.offset
) - R0_REGNUM
);
5576 asm_fprintf (f
, "[%s, w%d, uxtw %u]", reg_names
[REGNO (addr
.base
)],
5577 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
5580 case ADDRESS_REG_SXTW
:
5581 if (addr
.shift
== 0)
5582 asm_fprintf (f
, "[%s, w%d, sxtw]", reg_names
[REGNO (addr
.base
)],
5583 REGNO (addr
.offset
) - R0_REGNUM
);
5585 asm_fprintf (f
, "[%s, w%d, sxtw %u]", reg_names
[REGNO (addr
.base
)],
5586 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
5589 case ADDRESS_REG_WB
:
5590 switch (GET_CODE (x
))
5593 asm_fprintf (f
, "[%s, %d]!", reg_names
[REGNO (addr
.base
)],
5594 GET_MODE_SIZE (mode
));
5597 asm_fprintf (f
, "[%s], %d", reg_names
[REGNO (addr
.base
)],
5598 GET_MODE_SIZE (mode
));
5601 asm_fprintf (f
, "[%s, -%d]!", reg_names
[REGNO (addr
.base
)],
5602 GET_MODE_SIZE (mode
));
5605 asm_fprintf (f
, "[%s], -%d", reg_names
[REGNO (addr
.base
)],
5606 GET_MODE_SIZE (mode
));
5609 asm_fprintf (f
, "[%s, %wd]!", reg_names
[REGNO (addr
.base
)],
5610 INTVAL (addr
.offset
));
5613 asm_fprintf (f
, "[%s], %wd", reg_names
[REGNO (addr
.base
)],
5614 INTVAL (addr
.offset
));
5621 case ADDRESS_LO_SUM
:
5622 asm_fprintf (f
, "[%s, #:lo12:", reg_names
[REGNO (addr
.base
)]);
5623 output_addr_const (f
, addr
.offset
);
5624 asm_fprintf (f
, "]");
5627 case ADDRESS_SYMBOLIC
:
5631 output_addr_const (f
, x
);
5635 aarch64_label_mentioned_p (rtx x
)
5640 if (GET_CODE (x
) == LABEL_REF
)
5643 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5644 referencing instruction, but they are constant offsets, not
5646 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
5649 fmt
= GET_RTX_FORMAT (GET_CODE (x
));
5650 for (i
= GET_RTX_LENGTH (GET_CODE (x
)) - 1; i
>= 0; i
--)
5656 for (j
= XVECLEN (x
, i
) - 1; j
>= 0; j
--)
5657 if (aarch64_label_mentioned_p (XVECEXP (x
, i
, j
)))
5660 else if (fmt
[i
] == 'e' && aarch64_label_mentioned_p (XEXP (x
, i
)))
5667 /* Implement REGNO_REG_CLASS. */
5670 aarch64_regno_regclass (unsigned regno
)
5672 if (GP_REGNUM_P (regno
))
5673 return GENERAL_REGS
;
5675 if (regno
== SP_REGNUM
)
5678 if (regno
== FRAME_POINTER_REGNUM
5679 || regno
== ARG_POINTER_REGNUM
)
5680 return POINTER_REGS
;
5682 if (FP_REGNUM_P (regno
))
5683 return FP_LO_REGNUM_P (regno
) ? FP_LO_REGS
: FP_REGS
;
5689 aarch64_legitimize_address (rtx x
, rtx
/* orig_x */, machine_mode mode
)
5691 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5692 where mask is selected by alignment and size of the offset.
5693 We try to pick as large a range for the offset as possible to
5694 maximize the chance of a CSE. However, for aligned addresses
5695 we limit the range to 4k so that structures with different sized
5696 elements are likely to use the same base. We need to be careful
5697 not to split a CONST for some forms of address expression, otherwise
5698 it will generate sub-optimal code. */
5700 if (GET_CODE (x
) == PLUS
&& CONST_INT_P (XEXP (x
, 1)))
5702 rtx base
= XEXP (x
, 0);
5703 rtx offset_rtx
= XEXP (x
, 1);
5704 HOST_WIDE_INT offset
= INTVAL (offset_rtx
);
5706 if (GET_CODE (base
) == PLUS
)
5708 rtx op0
= XEXP (base
, 0);
5709 rtx op1
= XEXP (base
, 1);
5711 /* Force any scaling into a temp for CSE. */
5712 op0
= force_reg (Pmode
, op0
);
5713 op1
= force_reg (Pmode
, op1
);
5715 /* Let the pointer register be in op0. */
5716 if (REG_POINTER (op1
))
5717 std::swap (op0
, op1
);
5719 /* If the pointer is virtual or frame related, then we know that
5720 virtual register instantiation or register elimination is going
5721 to apply a second constant. We want the two constants folded
5722 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5723 if (virt_or_elim_regno_p (REGNO (op0
)))
5725 base
= expand_binop (Pmode
, add_optab
, op0
, offset_rtx
,
5726 NULL_RTX
, true, OPTAB_DIRECT
);
5727 return gen_rtx_PLUS (Pmode
, base
, op1
);
5730 /* Otherwise, in order to encourage CSE (and thence loop strength
5731 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
5732 base
= expand_binop (Pmode
, add_optab
, op0
, op1
,
5733 NULL_RTX
, true, OPTAB_DIRECT
);
5734 x
= gen_rtx_PLUS (Pmode
, base
, offset_rtx
);
5737 /* Does it look like we'll need a 16-byte load/store-pair operation? */
5738 HOST_WIDE_INT base_offset
;
5739 if (GET_MODE_SIZE (mode
) > 16)
5740 base_offset
= (offset
+ 0x400) & ~0x7f0;
5741 /* For offsets aren't a multiple of the access size, the limit is
5743 else if (offset
& (GET_MODE_SIZE (mode
) - 1))
5745 base_offset
= (offset
+ 0x100) & ~0x1ff;
5747 /* BLKmode typically uses LDP of X-registers. */
5748 if (mode
== BLKmode
)
5749 base_offset
= (offset
+ 512) & ~0x3ff;
5751 /* Small negative offsets are supported. */
5752 else if (IN_RANGE (offset
, -256, 0))
5754 else if (mode
== TImode
|| mode
== TFmode
)
5755 base_offset
= (offset
+ 0x100) & ~0x1ff;
5756 /* Use 12-bit offset by access size. */
5758 base_offset
= offset
& (~0xfff * GET_MODE_SIZE (mode
));
5760 if (base_offset
!= 0)
5762 base
= plus_constant (Pmode
, base
, base_offset
);
5763 base
= force_operand (base
, NULL_RTX
);
5764 return plus_constant (Pmode
, base
, offset
- base_offset
);
5771 /* Return the reload icode required for a constant pool in mode. */
5772 static enum insn_code
5773 aarch64_constant_pool_reload_icode (machine_mode mode
)
5778 return CODE_FOR_aarch64_reload_movcpsfdi
;
5781 return CODE_FOR_aarch64_reload_movcpdfdi
;
5784 return CODE_FOR_aarch64_reload_movcptfdi
;
5787 return CODE_FOR_aarch64_reload_movcpv8qidi
;
5790 return CODE_FOR_aarch64_reload_movcpv16qidi
;
5793 return CODE_FOR_aarch64_reload_movcpv4hidi
;
5796 return CODE_FOR_aarch64_reload_movcpv8hidi
;
5799 return CODE_FOR_aarch64_reload_movcpv2sidi
;
5802 return CODE_FOR_aarch64_reload_movcpv4sidi
;
5805 return CODE_FOR_aarch64_reload_movcpv2didi
;
5808 return CODE_FOR_aarch64_reload_movcpv2dfdi
;
5817 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED
, rtx x
,
5820 secondary_reload_info
*sri
)
5823 /* If we have to disable direct literal pool loads and stores because the
5824 function is too big, then we need a scratch register. */
5825 if (MEM_P (x
) && GET_CODE (x
) == SYMBOL_REF
&& CONSTANT_POOL_ADDRESS_P (x
)
5826 && (SCALAR_FLOAT_MODE_P (GET_MODE (x
))
5827 || targetm
.vector_mode_supported_p (GET_MODE (x
)))
5828 && !aarch64_pcrelative_literal_loads
)
5830 sri
->icode
= aarch64_constant_pool_reload_icode (mode
);
5834 /* Without the TARGET_SIMD instructions we cannot move a Q register
5835 to a Q register directly. We need a scratch. */
5836 if (REG_P (x
) && (mode
== TFmode
|| mode
== TImode
) && mode
== GET_MODE (x
)
5837 && FP_REGNUM_P (REGNO (x
)) && !TARGET_SIMD
5838 && reg_class_subset_p (rclass
, FP_REGS
))
5841 sri
->icode
= CODE_FOR_aarch64_reload_movtf
;
5842 else if (mode
== TImode
)
5843 sri
->icode
= CODE_FOR_aarch64_reload_movti
;
5847 /* A TFmode or TImode memory access should be handled via an FP_REGS
5848 because AArch64 has richer addressing modes for LDR/STR instructions
5849 than LDP/STP instructions. */
5850 if (TARGET_FLOAT
&& rclass
== GENERAL_REGS
5851 && GET_MODE_SIZE (mode
) == 16 && MEM_P (x
))
5854 if (rclass
== FP_REGS
&& (mode
== TImode
|| mode
== TFmode
) && CONSTANT_P(x
))
5855 return GENERAL_REGS
;
5861 aarch64_can_eliminate (const int from
, const int to
)
5863 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5864 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5866 if (frame_pointer_needed
)
5868 if (from
== ARG_POINTER_REGNUM
&& to
== HARD_FRAME_POINTER_REGNUM
)
5870 if (from
== ARG_POINTER_REGNUM
&& to
== STACK_POINTER_REGNUM
)
5872 if (from
== FRAME_POINTER_REGNUM
&& to
== STACK_POINTER_REGNUM
5873 && !cfun
->calls_alloca
)
5875 if (from
== FRAME_POINTER_REGNUM
&& to
== HARD_FRAME_POINTER_REGNUM
)
5882 /* If we decided that we didn't need a leaf frame pointer but then used
5883 LR in the function, then we'll want a frame pointer after all, so
5884 prevent this elimination to ensure a frame pointer is used. */
5885 if (to
== STACK_POINTER_REGNUM
5886 && flag_omit_leaf_frame_pointer
5887 && df_regs_ever_live_p (LR_REGNUM
))
5895 aarch64_initial_elimination_offset (unsigned from
, unsigned to
)
5897 aarch64_layout_frame ();
5899 if (to
== HARD_FRAME_POINTER_REGNUM
)
5901 if (from
== ARG_POINTER_REGNUM
)
5902 return cfun
->machine
->frame
.hard_fp_offset
;
5904 if (from
== FRAME_POINTER_REGNUM
)
5905 return cfun
->machine
->frame
.hard_fp_offset
5906 - cfun
->machine
->frame
.locals_offset
;
5909 if (to
== STACK_POINTER_REGNUM
)
5911 if (from
== FRAME_POINTER_REGNUM
)
5912 return cfun
->machine
->frame
.frame_size
5913 - cfun
->machine
->frame
.locals_offset
;
5916 return cfun
->machine
->frame
.frame_size
;
5919 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5923 aarch64_return_addr (int count
, rtx frame ATTRIBUTE_UNUSED
)
5927 return get_hard_reg_initial_val (Pmode
, LR_REGNUM
);
5932 aarch64_asm_trampoline_template (FILE *f
)
5936 asm_fprintf (f
, "\tldr\tw%d, .+16\n", IP1_REGNUM
- R0_REGNUM
);
5937 asm_fprintf (f
, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM
- R0_REGNUM
);
5941 asm_fprintf (f
, "\tldr\t%s, .+16\n", reg_names
[IP1_REGNUM
]);
5942 asm_fprintf (f
, "\tldr\t%s, .+20\n", reg_names
[STATIC_CHAIN_REGNUM
]);
5944 asm_fprintf (f
, "\tbr\t%s\n", reg_names
[IP1_REGNUM
]);
5945 assemble_aligned_integer (4, const0_rtx
);
5946 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
5947 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
5951 aarch64_trampoline_init (rtx m_tramp
, tree fndecl
, rtx chain_value
)
5953 rtx fnaddr
, mem
, a_tramp
;
5954 const int tramp_code_sz
= 16;
5956 /* Don't need to copy the trailing D-words, we fill those in below. */
5957 emit_block_move (m_tramp
, assemble_trampoline_template (),
5958 GEN_INT (tramp_code_sz
), BLOCK_OP_NORMAL
);
5959 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
);
5960 fnaddr
= XEXP (DECL_RTL (fndecl
), 0);
5961 if (GET_MODE (fnaddr
) != ptr_mode
)
5962 fnaddr
= convert_memory_address (ptr_mode
, fnaddr
);
5963 emit_move_insn (mem
, fnaddr
);
5965 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
+ POINTER_BYTES
);
5966 emit_move_insn (mem
, chain_value
);
5968 /* XXX We should really define a "clear_cache" pattern and use
5969 gen_clear_cache(). */
5970 a_tramp
= XEXP (m_tramp
, 0);
5971 emit_library_call (gen_rtx_SYMBOL_REF (Pmode
, "__clear_cache"),
5972 LCT_NORMAL
, VOIDmode
, 2, a_tramp
, ptr_mode
,
5973 plus_constant (ptr_mode
, a_tramp
, TRAMPOLINE_SIZE
),
5977 static unsigned char
5978 aarch64_class_max_nregs (reg_class_t regclass
, machine_mode mode
)
5982 case CALLER_SAVE_REGS
:
5989 aarch64_vector_mode_p (mode
)
5990 ? (GET_MODE_SIZE (mode
) + UNITS_PER_VREG
- 1) / UNITS_PER_VREG
5991 : (GET_MODE_SIZE (mode
) + UNITS_PER_WORD
- 1) / UNITS_PER_WORD
;
6005 aarch64_preferred_reload_class (rtx x
, reg_class_t regclass
)
6007 if (regclass
== POINTER_REGS
)
6008 return GENERAL_REGS
;
6010 if (regclass
== STACK_REG
)
6013 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x
)), POINTER_REGS
))
6019 /* Register eliminiation can result in a request for
6020 SP+constant->FP_REGS. We cannot support such operations which
6021 use SP as source and an FP_REG as destination, so reject out
6023 if (! reg_class_subset_p (regclass
, GENERAL_REGS
) && GET_CODE (x
) == PLUS
)
6025 rtx lhs
= XEXP (x
, 0);
6027 /* Look through a possible SUBREG introduced by ILP32. */
6028 if (GET_CODE (lhs
) == SUBREG
)
6029 lhs
= SUBREG_REG (lhs
);
6031 gcc_assert (REG_P (lhs
));
6032 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs
)),
6041 aarch64_asm_output_labelref (FILE* f
, const char *name
)
6043 asm_fprintf (f
, "%U%s", name
);
6047 aarch64_elf_asm_constructor (rtx symbol
, int priority
)
6049 if (priority
== DEFAULT_INIT_PRIORITY
)
6050 default_ctor_section_asm_out_constructor (symbol
, priority
);
6054 /* While priority is known to be in range [0, 65535], so 18 bytes
6055 would be enough, the compiler might not know that. To avoid
6056 -Wformat-truncation false positive, use a larger size. */
6058 snprintf (buf
, sizeof (buf
), ".init_array.%.5u", priority
);
6059 s
= get_section (buf
, SECTION_WRITE
, NULL
);
6060 switch_to_section (s
);
6061 assemble_align (POINTER_SIZE
);
6062 assemble_aligned_integer (POINTER_BYTES
, symbol
);
6067 aarch64_elf_asm_destructor (rtx symbol
, int priority
)
6069 if (priority
== DEFAULT_INIT_PRIORITY
)
6070 default_dtor_section_asm_out_destructor (symbol
, priority
);
6074 /* While priority is known to be in range [0, 65535], so 18 bytes
6075 would be enough, the compiler might not know that. To avoid
6076 -Wformat-truncation false positive, use a larger size. */
6078 snprintf (buf
, sizeof (buf
), ".fini_array.%.5u", priority
);
6079 s
= get_section (buf
, SECTION_WRITE
, NULL
);
6080 switch_to_section (s
);
6081 assemble_align (POINTER_SIZE
);
6082 assemble_aligned_integer (POINTER_BYTES
, symbol
);
6087 aarch64_output_casesi (rtx
*operands
)
6091 rtx diff_vec
= PATTERN (NEXT_INSN (as_a
<rtx_insn
*> (operands
[2])));
6093 static const char *const patterns
[4][2] =
6096 "ldrb\t%w3, [%0,%w1,uxtw]",
6097 "add\t%3, %4, %w3, sxtb #2"
6100 "ldrh\t%w3, [%0,%w1,uxtw #1]",
6101 "add\t%3, %4, %w3, sxth #2"
6104 "ldr\t%w3, [%0,%w1,uxtw #2]",
6105 "add\t%3, %4, %w3, sxtw #2"
6107 /* We assume that DImode is only generated when not optimizing and
6108 that we don't really need 64-bit address offsets. That would
6109 imply an object file with 8GB of code in a single function! */
6111 "ldr\t%w3, [%0,%w1,uxtw #2]",
6112 "add\t%3, %4, %w3, sxtw #2"
6116 gcc_assert (GET_CODE (diff_vec
) == ADDR_DIFF_VEC
);
6118 index
= exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec
)));
6120 gcc_assert (index
>= 0 && index
<= 3);
6122 /* Need to implement table size reduction, by chaning the code below. */
6123 output_asm_insn (patterns
[index
][0], operands
);
6124 ASM_GENERATE_INTERNAL_LABEL (label
, "Lrtx", CODE_LABEL_NUMBER (operands
[2]));
6125 snprintf (buf
, sizeof (buf
),
6126 "adr\t%%4, %s", targetm
.strip_name_encoding (label
));
6127 output_asm_insn (buf
, operands
);
6128 output_asm_insn (patterns
[index
][1], operands
);
6129 output_asm_insn ("br\t%3", operands
);
6130 assemble_label (asm_out_file
, label
);
6135 /* Return size in bits of an arithmetic operand which is shifted/scaled and
6136 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
6140 aarch64_uxt_size (int shift
, HOST_WIDE_INT mask
)
6142 if (shift
>= 0 && shift
<= 3)
6145 for (size
= 8; size
<= 32; size
*= 2)
6147 HOST_WIDE_INT bits
= ((HOST_WIDE_INT
)1U << size
) - 1;
6148 if (mask
== bits
<< shift
)
6155 /* Constant pools are per function only when PC relative
6156 literal loads are true or we are in the large memory
6160 aarch64_can_use_per_function_literal_pools_p (void)
6162 return (aarch64_pcrelative_literal_loads
6163 || aarch64_cmodel
== AARCH64_CMODEL_LARGE
);
6167 aarch64_use_blocks_for_constant_p (machine_mode
, const_rtx
)
6169 /* Fixme:: In an ideal world this would work similar
6170 to the logic in aarch64_select_rtx_section but this
6171 breaks bootstrap in gcc go. For now we workaround
6172 this by returning false here. */
6176 /* Select appropriate section for constants depending
6177 on where we place literal pools. */
6180 aarch64_select_rtx_section (machine_mode mode
,
6182 unsigned HOST_WIDE_INT align
)
6184 if (aarch64_can_use_per_function_literal_pools_p ())
6185 return function_section (current_function_decl
);
6187 return default_elf_select_rtx_section (mode
, x
, align
);
6190 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
6192 aarch64_asm_output_pool_epilogue (FILE *f
, const char *, tree
,
6193 HOST_WIDE_INT offset
)
6195 /* When using per-function literal pools, we must ensure that any code
6196 section is aligned to the minimal instruction length, lest we get
6197 errors from the assembler re "unaligned instructions". */
6198 if ((offset
& 3) && aarch64_can_use_per_function_literal_pools_p ())
6199 ASM_OUTPUT_ALIGN (f
, 2);
6204 /* Helper function for rtx cost calculation. Strip a shift expression
6205 from X. Returns the inner operand if successful, or the original
6206 expression on failure. */
6208 aarch64_strip_shift (rtx x
)
6212 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
6213 we can convert both to ROR during final output. */
6214 if ((GET_CODE (op
) == ASHIFT
6215 || GET_CODE (op
) == ASHIFTRT
6216 || GET_CODE (op
) == LSHIFTRT
6217 || GET_CODE (op
) == ROTATERT
6218 || GET_CODE (op
) == ROTATE
)
6219 && CONST_INT_P (XEXP (op
, 1)))
6220 return XEXP (op
, 0);
6222 if (GET_CODE (op
) == MULT
6223 && CONST_INT_P (XEXP (op
, 1))
6224 && ((unsigned) exact_log2 (INTVAL (XEXP (op
, 1)))) < 64)
6225 return XEXP (op
, 0);
6230 /* Helper function for rtx cost calculation. Strip an extend
6231 expression from X. Returns the inner operand if successful, or the
6232 original expression on failure. We deal with a number of possible
6233 canonicalization variations here. If STRIP_SHIFT is true, then
6234 we can strip off a shift also. */
6236 aarch64_strip_extend (rtx x
, bool strip_shift
)
6240 /* Zero and sign extraction of a widened value. */
6241 if ((GET_CODE (op
) == ZERO_EXTRACT
|| GET_CODE (op
) == SIGN_EXTRACT
)
6242 && XEXP (op
, 2) == const0_rtx
6243 && GET_CODE (XEXP (op
, 0)) == MULT
6244 && aarch64_is_extend_from_extract (GET_MODE (op
), XEXP (XEXP (op
, 0), 1),
6246 return XEXP (XEXP (op
, 0), 0);
6248 /* It can also be represented (for zero-extend) as an AND with an
6250 if (GET_CODE (op
) == AND
6251 && GET_CODE (XEXP (op
, 0)) == MULT
6252 && CONST_INT_P (XEXP (XEXP (op
, 0), 1))
6253 && CONST_INT_P (XEXP (op
, 1))
6254 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op
, 0), 1))),
6255 INTVAL (XEXP (op
, 1))) != 0)
6256 return XEXP (XEXP (op
, 0), 0);
6258 /* Now handle extended register, as this may also have an optional
6259 left shift by 1..4. */
6261 && GET_CODE (op
) == ASHIFT
6262 && CONST_INT_P (XEXP (op
, 1))
6263 && ((unsigned HOST_WIDE_INT
) INTVAL (XEXP (op
, 1))) <= 4)
6266 if (GET_CODE (op
) == ZERO_EXTEND
6267 || GET_CODE (op
) == SIGN_EXTEND
)
6276 /* Return true iff CODE is a shift supported in combination
6277 with arithmetic instructions. */
6280 aarch64_shift_p (enum rtx_code code
)
6282 return code
== ASHIFT
|| code
== ASHIFTRT
|| code
== LSHIFTRT
;
6286 /* Return true iff X is a cheap shift without a sign extend. */
6289 aarch64_cheap_mult_shift_p (rtx x
)
6296 if (!(aarch64_tune_params
.extra_tuning_flags
6297 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
))
6300 if (GET_CODE (op0
) == SIGN_EXTEND
)
6303 if (GET_CODE (x
) == ASHIFT
&& CONST_INT_P (op1
)
6304 && UINTVAL (op1
) <= 4)
6307 if (GET_CODE (x
) != MULT
|| !CONST_INT_P (op1
))
6310 HOST_WIDE_INT l2
= exact_log2 (INTVAL (op1
));
6312 if (l2
> 0 && l2
<= 4)
6318 /* Helper function for rtx cost calculation. Calculate the cost of
6319 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6320 Return the calculated cost of the expression, recursing manually in to
6321 operands where needed. */
6324 aarch64_rtx_mult_cost (rtx x
, enum rtx_code code
, int outer
, bool speed
)
6327 const struct cpu_cost_table
*extra_cost
6328 = aarch64_tune_params
.insn_extra_cost
;
6330 bool compound_p
= (outer
== PLUS
|| outer
== MINUS
);
6331 machine_mode mode
= GET_MODE (x
);
6333 gcc_checking_assert (code
== MULT
);
6338 if (VECTOR_MODE_P (mode
))
6339 mode
= GET_MODE_INNER (mode
);
6341 /* Integer multiply/fma. */
6342 if (GET_MODE_CLASS (mode
) == MODE_INT
)
6344 /* The multiply will be canonicalized as a shift, cost it as such. */
6345 if (aarch64_shift_p (GET_CODE (x
))
6346 || (CONST_INT_P (op1
)
6347 && exact_log2 (INTVAL (op1
)) > 0))
6349 bool is_extend
= GET_CODE (op0
) == ZERO_EXTEND
6350 || GET_CODE (op0
) == SIGN_EXTEND
;
6355 /* If the shift is considered cheap,
6356 then don't add any cost. */
6357 if (aarch64_cheap_mult_shift_p (x
))
6359 else if (REG_P (op1
))
6360 /* ARITH + shift-by-register. */
6361 cost
+= extra_cost
->alu
.arith_shift_reg
;
6363 /* ARITH + extended register. We don't have a cost field
6364 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
6365 cost
+= extra_cost
->alu
.extend_arith
;
6367 /* ARITH + shift-by-immediate. */
6368 cost
+= extra_cost
->alu
.arith_shift
;
6371 /* LSL (immediate). */
6372 cost
+= extra_cost
->alu
.shift
;
6375 /* Strip extends as we will have costed them in the case above. */
6377 op0
= aarch64_strip_extend (op0
, true);
6379 cost
+= rtx_cost (op0
, VOIDmode
, code
, 0, speed
);
6384 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
6385 compound and let the below cases handle it. After all, MNEG is a
6386 special-case alias of MSUB. */
6387 if (GET_CODE (op0
) == NEG
)
6389 op0
= XEXP (op0
, 0);
6393 /* Integer multiplies or FMAs have zero/sign extending variants. */
6394 if ((GET_CODE (op0
) == ZERO_EXTEND
6395 && GET_CODE (op1
) == ZERO_EXTEND
)
6396 || (GET_CODE (op0
) == SIGN_EXTEND
6397 && GET_CODE (op1
) == SIGN_EXTEND
))
6399 cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, MULT
, 0, speed
);
6400 cost
+= rtx_cost (XEXP (op1
, 0), VOIDmode
, MULT
, 1, speed
);
6405 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
6406 cost
+= extra_cost
->mult
[0].extend_add
;
6408 /* MUL/SMULL/UMULL. */
6409 cost
+= extra_cost
->mult
[0].extend
;
6415 /* This is either an integer multiply or a MADD. In both cases
6416 we want to recurse and cost the operands. */
6417 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
6418 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
6424 cost
+= extra_cost
->mult
[mode
== DImode
].add
;
6427 cost
+= extra_cost
->mult
[mode
== DImode
].simple
;
6436 /* Floating-point FMA/FMUL can also support negations of the
6437 operands, unless the rounding mode is upward or downward in
6438 which case FNMUL is different than FMUL with operand negation. */
6439 bool neg0
= GET_CODE (op0
) == NEG
;
6440 bool neg1
= GET_CODE (op1
) == NEG
;
6441 if (compound_p
|| !flag_rounding_math
|| (neg0
&& neg1
))
6444 op0
= XEXP (op0
, 0);
6446 op1
= XEXP (op1
, 0);
6450 /* FMADD/FNMADD/FNMSUB/FMSUB. */
6451 cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
6454 cost
+= extra_cost
->fp
[mode
== DFmode
].mult
;
6457 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
6458 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
6464 aarch64_address_cost (rtx x
,
6466 addr_space_t as ATTRIBUTE_UNUSED
,
6469 enum rtx_code c
= GET_CODE (x
);
6470 const struct cpu_addrcost_table
*addr_cost
= aarch64_tune_params
.addr_cost
;
6471 struct aarch64_address_info info
;
6475 if (!aarch64_classify_address (&info
, x
, mode
, c
, false))
6477 if (GET_CODE (x
) == CONST
|| GET_CODE (x
) == SYMBOL_REF
)
6479 /* This is a CONST or SYMBOL ref which will be split
6480 in a different way depending on the code model in use.
6481 Cost it through the generic infrastructure. */
6482 int cost_symbol_ref
= rtx_cost (x
, Pmode
, MEM
, 1, speed
);
6483 /* Divide through by the cost of one instruction to
6484 bring it to the same units as the address costs. */
6485 cost_symbol_ref
/= COSTS_N_INSNS (1);
6486 /* The cost is then the cost of preparing the address,
6487 followed by an immediate (possibly 0) offset. */
6488 return cost_symbol_ref
+ addr_cost
->imm_offset
;
6492 /* This is most likely a jump table from a case
6494 return addr_cost
->register_offset
;
6500 case ADDRESS_LO_SUM
:
6501 case ADDRESS_SYMBOLIC
:
6502 case ADDRESS_REG_IMM
:
6503 cost
+= addr_cost
->imm_offset
;
6506 case ADDRESS_REG_WB
:
6507 if (c
== PRE_INC
|| c
== PRE_DEC
|| c
== PRE_MODIFY
)
6508 cost
+= addr_cost
->pre_modify
;
6509 else if (c
== POST_INC
|| c
== POST_DEC
|| c
== POST_MODIFY
)
6510 cost
+= addr_cost
->post_modify
;
6516 case ADDRESS_REG_REG
:
6517 cost
+= addr_cost
->register_offset
;
6520 case ADDRESS_REG_SXTW
:
6521 cost
+= addr_cost
->register_sextend
;
6524 case ADDRESS_REG_UXTW
:
6525 cost
+= addr_cost
->register_zextend
;
6535 /* For the sake of calculating the cost of the shifted register
6536 component, we can treat same sized modes in the same way. */
6537 switch (GET_MODE_BITSIZE (mode
))
6540 cost
+= addr_cost
->addr_scale_costs
.hi
;
6544 cost
+= addr_cost
->addr_scale_costs
.si
;
6548 cost
+= addr_cost
->addr_scale_costs
.di
;
6551 /* We can't tell, or this is a 128-bit vector. */
6553 cost
+= addr_cost
->addr_scale_costs
.ti
;
6561 /* Return the cost of a branch. If SPEED_P is true then the compiler is
6562 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
6566 aarch64_branch_cost (bool speed_p
, bool predictable_p
)
6568 /* When optimizing for speed, use the cost of unpredictable branches. */
6569 const struct cpu_branch_cost
*branch_costs
=
6570 aarch64_tune_params
.branch_costs
;
6572 if (!speed_p
|| predictable_p
)
6573 return branch_costs
->predictable
;
6575 return branch_costs
->unpredictable
;
6578 /* Return true if the RTX X in mode MODE is a zero or sign extract
6579 usable in an ADD or SUB (extended register) instruction. */
6581 aarch64_rtx_arith_op_extract_p (rtx x
, machine_mode mode
)
6583 /* Catch add with a sign extract.
6584 This is add_<optab><mode>_multp2. */
6585 if (GET_CODE (x
) == SIGN_EXTRACT
6586 || GET_CODE (x
) == ZERO_EXTRACT
)
6588 rtx op0
= XEXP (x
, 0);
6589 rtx op1
= XEXP (x
, 1);
6590 rtx op2
= XEXP (x
, 2);
6592 if (GET_CODE (op0
) == MULT
6593 && CONST_INT_P (op1
)
6594 && op2
== const0_rtx
6595 && CONST_INT_P (XEXP (op0
, 1))
6596 && aarch64_is_extend_from_extract (mode
,
6603 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6605 else if (GET_CODE (x
) == SIGN_EXTEND
6606 || GET_CODE (x
) == ZERO_EXTEND
)
6607 return REG_P (XEXP (x
, 0));
6613 aarch64_frint_unspec_p (unsigned int u
)
6631 /* Return true iff X is an rtx that will match an extr instruction
6632 i.e. as described in the *extr<mode>5_insn family of patterns.
6633 OP0 and OP1 will be set to the operands of the shifts involved
6634 on success and will be NULL_RTX otherwise. */
6637 aarch64_extr_rtx_p (rtx x
, rtx
*res_op0
, rtx
*res_op1
)
6640 machine_mode mode
= GET_MODE (x
);
6642 *res_op0
= NULL_RTX
;
6643 *res_op1
= NULL_RTX
;
6645 if (GET_CODE (x
) != IOR
)
6651 if ((GET_CODE (op0
) == ASHIFT
&& GET_CODE (op1
) == LSHIFTRT
)
6652 || (GET_CODE (op1
) == ASHIFT
&& GET_CODE (op0
) == LSHIFTRT
))
6654 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6655 if (GET_CODE (op1
) == ASHIFT
)
6656 std::swap (op0
, op1
);
6658 if (!CONST_INT_P (XEXP (op0
, 1)) || !CONST_INT_P (XEXP (op1
, 1)))
6661 unsigned HOST_WIDE_INT shft_amnt_0
= UINTVAL (XEXP (op0
, 1));
6662 unsigned HOST_WIDE_INT shft_amnt_1
= UINTVAL (XEXP (op1
, 1));
6664 if (shft_amnt_0
< GET_MODE_BITSIZE (mode
)
6665 && shft_amnt_0
+ shft_amnt_1
== GET_MODE_BITSIZE (mode
))
6667 *res_op0
= XEXP (op0
, 0);
6668 *res_op1
= XEXP (op1
, 0);
6676 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6677 storing it in *COST. Result is true if the total cost of the operation
6678 has now been calculated. */
6680 aarch64_if_then_else_costs (rtx op0
, rtx op1
, rtx op2
, int *cost
, bool speed
)
6684 enum rtx_code cmpcode
;
6686 if (COMPARISON_P (op0
))
6688 inner
= XEXP (op0
, 0);
6689 comparator
= XEXP (op0
, 1);
6690 cmpcode
= GET_CODE (op0
);
6695 comparator
= const0_rtx
;
6699 if (GET_CODE (op1
) == PC
|| GET_CODE (op2
) == PC
)
6701 /* Conditional branch. */
6702 if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
6706 if (cmpcode
== NE
|| cmpcode
== EQ
)
6708 if (comparator
== const0_rtx
)
6710 /* TBZ/TBNZ/CBZ/CBNZ. */
6711 if (GET_CODE (inner
) == ZERO_EXTRACT
)
6713 *cost
+= rtx_cost (XEXP (inner
, 0), VOIDmode
,
6714 ZERO_EXTRACT
, 0, speed
);
6717 *cost
+= rtx_cost (inner
, VOIDmode
, cmpcode
, 0, speed
);
6722 else if (cmpcode
== LT
|| cmpcode
== GE
)
6725 if (comparator
== const0_rtx
)
6730 else if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
6733 if (GET_CODE (op1
) == COMPARE
)
6735 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6736 if (XEXP (op1
, 1) == const0_rtx
)
6740 machine_mode mode
= GET_MODE (XEXP (op1
, 0));
6741 const struct cpu_cost_table
*extra_cost
6742 = aarch64_tune_params
.insn_extra_cost
;
6744 if (GET_MODE_CLASS (mode
) == MODE_INT
)
6745 *cost
+= extra_cost
->alu
.arith
;
6747 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
6752 /* It's a conditional operation based on the status flags,
6753 so it must be some flavor of CSEL. */
6755 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6756 if (GET_CODE (op1
) == NEG
6757 || GET_CODE (op1
) == NOT
6758 || (GET_CODE (op1
) == PLUS
&& XEXP (op1
, 1) == const1_rtx
))
6759 op1
= XEXP (op1
, 0);
6760 else if (GET_CODE (op1
) == ZERO_EXTEND
&& GET_CODE (op2
) == ZERO_EXTEND
)
6762 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6763 op1
= XEXP (op1
, 0);
6764 op2
= XEXP (op2
, 0);
6767 *cost
+= rtx_cost (op1
, VOIDmode
, IF_THEN_ELSE
, 1, speed
);
6768 *cost
+= rtx_cost (op2
, VOIDmode
, IF_THEN_ELSE
, 2, speed
);
6772 /* We don't know what this is, cost all operands. */
6776 /* Check whether X is a bitfield operation of the form shift + extend that
6777 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6778 operand to which the bitfield operation is applied. Otherwise return
6782 aarch64_extend_bitfield_pattern_p (rtx x
)
6784 rtx_code outer_code
= GET_CODE (x
);
6785 machine_mode outer_mode
= GET_MODE (x
);
6787 if (outer_code
!= ZERO_EXTEND
&& outer_code
!= SIGN_EXTEND
6788 && outer_mode
!= SImode
&& outer_mode
!= DImode
)
6791 rtx inner
= XEXP (x
, 0);
6792 rtx_code inner_code
= GET_CODE (inner
);
6793 machine_mode inner_mode
= GET_MODE (inner
);
6799 if (CONST_INT_P (XEXP (inner
, 1))
6800 && (inner_mode
== QImode
|| inner_mode
== HImode
))
6801 op
= XEXP (inner
, 0);
6804 if (outer_code
== ZERO_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
6805 && (inner_mode
== QImode
|| inner_mode
== HImode
))
6806 op
= XEXP (inner
, 0);
6809 if (outer_code
== SIGN_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
6810 && (inner_mode
== QImode
|| inner_mode
== HImode
))
6811 op
= XEXP (inner
, 0);
6820 /* Return true if the mask and a shift amount from an RTX of the form
6821 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6822 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
6825 aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode
, rtx mask
, rtx shft_amnt
)
6827 return CONST_INT_P (mask
) && CONST_INT_P (shft_amnt
)
6828 && INTVAL (shft_amnt
) < GET_MODE_BITSIZE (mode
)
6829 && exact_log2 ((INTVAL (mask
) >> INTVAL (shft_amnt
)) + 1) >= 0
6830 && (INTVAL (mask
) & ((1 << INTVAL (shft_amnt
)) - 1)) == 0;
6833 /* Calculate the cost of calculating X, storing it in *COST. Result
6834 is true if the total cost of the operation has now been calculated. */
6836 aarch64_rtx_costs (rtx x
, machine_mode mode
, int outer ATTRIBUTE_UNUSED
,
6837 int param ATTRIBUTE_UNUSED
, int *cost
, bool speed
)
6840 const struct cpu_cost_table
*extra_cost
6841 = aarch64_tune_params
.insn_extra_cost
;
6842 int code
= GET_CODE (x
);
6844 /* By default, assume that everything has equivalent cost to the
6845 cheapest instruction. Any additional costs are applied as a delta
6846 above this default. */
6847 *cost
= COSTS_N_INSNS (1);
6852 /* The cost depends entirely on the operands to SET. */
6857 switch (GET_CODE (op0
))
6862 rtx address
= XEXP (op0
, 0);
6863 if (VECTOR_MODE_P (mode
))
6864 *cost
+= extra_cost
->ldst
.storev
;
6865 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
6866 *cost
+= extra_cost
->ldst
.store
;
6867 else if (mode
== SFmode
)
6868 *cost
+= extra_cost
->ldst
.storef
;
6869 else if (mode
== DFmode
)
6870 *cost
+= extra_cost
->ldst
.stored
;
6873 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
6877 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
6881 if (! REG_P (SUBREG_REG (op0
)))
6882 *cost
+= rtx_cost (SUBREG_REG (op0
), VOIDmode
, SET
, 0, speed
);
6886 /* The cost is one per vector-register copied. */
6887 if (VECTOR_MODE_P (GET_MODE (op0
)) && REG_P (op1
))
6889 int n_minus_1
= (GET_MODE_SIZE (GET_MODE (op0
)) - 1)
6890 / GET_MODE_SIZE (V4SImode
);
6891 *cost
= COSTS_N_INSNS (n_minus_1
+ 1);
6893 /* const0_rtx is in general free, but we will use an
6894 instruction to set a register to 0. */
6895 else if (REG_P (op1
) || op1
== const0_rtx
)
6897 /* The cost is 1 per register copied. */
6898 int n_minus_1
= (GET_MODE_SIZE (GET_MODE (op0
)) - 1)
6900 *cost
= COSTS_N_INSNS (n_minus_1
+ 1);
6903 /* Cost is just the cost of the RHS of the set. */
6904 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
6909 /* Bit-field insertion. Strip any redundant widening of
6910 the RHS to meet the width of the target. */
6911 if (GET_CODE (op1
) == SUBREG
)
6912 op1
= SUBREG_REG (op1
);
6913 if ((GET_CODE (op1
) == ZERO_EXTEND
6914 || GET_CODE (op1
) == SIGN_EXTEND
)
6915 && CONST_INT_P (XEXP (op0
, 1))
6916 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1
, 0)))
6917 >= INTVAL (XEXP (op0
, 1))))
6918 op1
= XEXP (op1
, 0);
6920 if (CONST_INT_P (op1
))
6922 /* MOV immediate is assumed to always be cheap. */
6923 *cost
= COSTS_N_INSNS (1);
6929 *cost
+= extra_cost
->alu
.bfi
;
6930 *cost
+= rtx_cost (op1
, VOIDmode
, (enum rtx_code
) code
, 1, speed
);
6936 /* We can't make sense of this, assume default cost. */
6937 *cost
= COSTS_N_INSNS (1);
6943 /* If an instruction can incorporate a constant within the
6944 instruction, the instruction's expression avoids calling
6945 rtx_cost() on the constant. If rtx_cost() is called on a
6946 constant, then it is usually because the constant must be
6947 moved into a register by one or more instructions.
6949 The exception is constant 0, which can be expressed
6950 as XZR/WZR and is therefore free. The exception to this is
6951 if we have (set (reg) (const0_rtx)) in which case we must cost
6952 the move. However, we can catch that when we cost the SET, so
6953 we don't need to consider that here. */
6954 if (x
== const0_rtx
)
6958 /* To an approximation, building any other constant is
6959 proportionally expensive to the number of instructions
6960 required to build that constant. This is true whether we
6961 are compiling for SPEED or otherwise. */
6962 *cost
= COSTS_N_INSNS (aarch64_internal_mov_immediate
6963 (NULL_RTX
, x
, false, mode
));
6969 /* First determine number of instructions to do the move
6970 as an integer constant. */
6971 if (!aarch64_float_const_representable_p (x
)
6972 && !aarch64_can_const_movi_rtx_p (x
, mode
)
6973 && aarch64_float_const_rtx_p (x
))
6975 unsigned HOST_WIDE_INT ival
;
6976 bool succeed
= aarch64_reinterpret_float_as_int (x
, &ival
);
6977 gcc_assert (succeed
);
6979 machine_mode imode
= mode
== HFmode
? SImode
6980 : int_mode_for_mode (mode
);
6981 int ncost
= aarch64_internal_mov_immediate
6982 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
6983 *cost
+= COSTS_N_INSNS (ncost
);
6989 /* mov[df,sf]_aarch64. */
6990 if (aarch64_float_const_representable_p (x
))
6991 /* FMOV (scalar immediate). */
6992 *cost
+= extra_cost
->fp
[mode
== DFmode
].fpconst
;
6993 else if (!aarch64_float_const_zero_rtx_p (x
))
6995 /* This will be a load from memory. */
6997 *cost
+= extra_cost
->ldst
.loadd
;
6999 *cost
+= extra_cost
->ldst
.loadf
;
7002 /* Otherwise this is +0.0. We get this using MOVI d0, #0
7003 or MOV v0.s[0], wzr - neither of which are modeled by the
7004 cost tables. Just use the default cost. */
7014 /* For loads we want the base cost of a load, plus an
7015 approximation for the additional cost of the addressing
7017 rtx address
= XEXP (x
, 0);
7018 if (VECTOR_MODE_P (mode
))
7019 *cost
+= extra_cost
->ldst
.loadv
;
7020 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
7021 *cost
+= extra_cost
->ldst
.load
;
7022 else if (mode
== SFmode
)
7023 *cost
+= extra_cost
->ldst
.loadf
;
7024 else if (mode
== DFmode
)
7025 *cost
+= extra_cost
->ldst
.loadd
;
7028 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
7037 if (VECTOR_MODE_P (mode
))
7042 *cost
+= extra_cost
->vect
.alu
;
7047 if (GET_MODE_CLASS (mode
) == MODE_INT
)
7049 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
7050 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
7053 *cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, NEG
, 0, speed
);
7057 /* Cost this as SUB wzr, X. */
7058 op0
= CONST0_RTX (mode
);
7063 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
7065 /* Support (neg(fma...)) as a single instruction only if
7066 sign of zeros is unimportant. This matches the decision
7067 making in aarch64.md. */
7068 if (GET_CODE (op0
) == FMA
&& !HONOR_SIGNED_ZEROS (GET_MODE (op0
)))
7071 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
7074 if (GET_CODE (op0
) == MULT
)
7077 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
7082 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
7092 if (VECTOR_MODE_P (mode
))
7093 *cost
+= extra_cost
->vect
.alu
;
7095 *cost
+= extra_cost
->alu
.clz
;
7104 if (op1
== const0_rtx
7105 && GET_CODE (op0
) == AND
)
7108 mode
= GET_MODE (op0
);
7112 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
)
7114 /* TODO: A write to the CC flags possibly costs extra, this
7115 needs encoding in the cost tables. */
7117 mode
= GET_MODE (op0
);
7119 if (GET_CODE (op0
) == AND
)
7125 if (GET_CODE (op0
) == PLUS
)
7127 /* ADDS (and CMN alias). */
7132 if (GET_CODE (op0
) == MINUS
)
7139 if (GET_CODE (op0
) == ZERO_EXTRACT
&& op1
== const0_rtx
7140 && GET_MODE (x
) == CC_NZmode
&& CONST_INT_P (XEXP (op0
, 1))
7141 && CONST_INT_P (XEXP (op0
, 2)))
7143 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
7144 Handle it here directly rather than going to cost_logic
7145 since we know the immediate generated for the TST is valid
7146 so we can avoid creating an intermediate rtx for it only
7147 for costing purposes. */
7149 *cost
+= extra_cost
->alu
.logical
;
7151 *cost
+= rtx_cost (XEXP (op0
, 0), GET_MODE (op0
),
7152 ZERO_EXTRACT
, 0, speed
);
7156 if (GET_CODE (op1
) == NEG
)
7160 *cost
+= extra_cost
->alu
.arith
;
7162 *cost
+= rtx_cost (op0
, mode
, COMPARE
, 0, speed
);
7163 *cost
+= rtx_cost (XEXP (op1
, 0), mode
, NEG
, 1, speed
);
7169 Compare can freely swap the order of operands, and
7170 canonicalization puts the more complex operation first.
7171 But the integer MINUS logic expects the shift/extend
7172 operation in op1. */
7174 || (GET_CODE (op0
) == SUBREG
&& REG_P (SUBREG_REG (op0
)))))
7182 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_FLOAT
)
7186 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
7188 if (CONST_DOUBLE_P (op1
) && aarch64_float_const_zero_rtx_p (op1
))
7190 *cost
+= rtx_cost (op0
, VOIDmode
, COMPARE
, 0, speed
);
7191 /* FCMP supports constant 0.0 for no extra cost. */
7197 if (VECTOR_MODE_P (mode
))
7199 /* Vector compare. */
7201 *cost
+= extra_cost
->vect
.alu
;
7203 if (aarch64_float_const_zero_rtx_p (op1
))
7205 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
7219 *cost
+= rtx_cost (op0
, mode
, MINUS
, 0, speed
);
7221 /* Detect valid immediates. */
7222 if ((GET_MODE_CLASS (mode
) == MODE_INT
7223 || (GET_MODE_CLASS (mode
) == MODE_CC
7224 && GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
))
7225 && CONST_INT_P (op1
)
7226 && aarch64_uimm12_shift (INTVAL (op1
)))
7229 /* SUB(S) (immediate). */
7230 *cost
+= extra_cost
->alu
.arith
;
7234 /* Look for SUB (extended register). */
7235 if (aarch64_rtx_arith_op_extract_p (op1
, mode
))
7238 *cost
+= extra_cost
->alu
.extend_arith
;
7240 op1
= aarch64_strip_extend (op1
, true);
7241 *cost
+= rtx_cost (op1
, VOIDmode
,
7242 (enum rtx_code
) GET_CODE (op1
), 0, speed
);
7246 rtx new_op1
= aarch64_strip_extend (op1
, false);
7248 /* Cost this as an FMA-alike operation. */
7249 if ((GET_CODE (new_op1
) == MULT
7250 || aarch64_shift_p (GET_CODE (new_op1
)))
7253 *cost
+= aarch64_rtx_mult_cost (new_op1
, MULT
,
7254 (enum rtx_code
) code
,
7259 *cost
+= rtx_cost (new_op1
, VOIDmode
, MINUS
, 1, speed
);
7263 if (VECTOR_MODE_P (mode
))
7266 *cost
+= extra_cost
->vect
.alu
;
7268 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
7271 *cost
+= extra_cost
->alu
.arith
;
7273 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
7276 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
7290 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
7291 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
7294 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, PLUS
, 0, speed
);
7295 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
7299 if (GET_MODE_CLASS (mode
) == MODE_INT
7300 && CONST_INT_P (op1
)
7301 && aarch64_uimm12_shift (INTVAL (op1
)))
7303 *cost
+= rtx_cost (op0
, mode
, PLUS
, 0, speed
);
7306 /* ADD (immediate). */
7307 *cost
+= extra_cost
->alu
.arith
;
7311 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
7313 /* Look for ADD (extended register). */
7314 if (aarch64_rtx_arith_op_extract_p (op0
, mode
))
7317 *cost
+= extra_cost
->alu
.extend_arith
;
7319 op0
= aarch64_strip_extend (op0
, true);
7320 *cost
+= rtx_cost (op0
, VOIDmode
,
7321 (enum rtx_code
) GET_CODE (op0
), 0, speed
);
7325 /* Strip any extend, leave shifts behind as we will
7326 cost them through mult_cost. */
7327 new_op0
= aarch64_strip_extend (op0
, false);
7329 if (GET_CODE (new_op0
) == MULT
7330 || aarch64_shift_p (GET_CODE (new_op0
)))
7332 *cost
+= aarch64_rtx_mult_cost (new_op0
, MULT
, PLUS
,
7337 *cost
+= rtx_cost (new_op0
, VOIDmode
, PLUS
, 0, speed
);
7341 if (VECTOR_MODE_P (mode
))
7344 *cost
+= extra_cost
->vect
.alu
;
7346 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
7349 *cost
+= extra_cost
->alu
.arith
;
7351 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
7354 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
7361 *cost
= COSTS_N_INSNS (1);
7365 if (VECTOR_MODE_P (mode
))
7366 *cost
+= extra_cost
->vect
.alu
;
7368 *cost
+= extra_cost
->alu
.rev
;
7373 if (aarch_rev16_p (x
))
7375 *cost
= COSTS_N_INSNS (1);
7379 if (VECTOR_MODE_P (mode
))
7380 *cost
+= extra_cost
->vect
.alu
;
7382 *cost
+= extra_cost
->alu
.rev
;
7387 if (aarch64_extr_rtx_p (x
, &op0
, &op1
))
7389 *cost
+= rtx_cost (op0
, mode
, IOR
, 0, speed
);
7390 *cost
+= rtx_cost (op1
, mode
, IOR
, 1, speed
);
7392 *cost
+= extra_cost
->alu
.shift
;
7403 if (VECTOR_MODE_P (mode
))
7406 *cost
+= extra_cost
->vect
.alu
;
7411 && GET_CODE (op0
) == MULT
7412 && CONST_INT_P (XEXP (op0
, 1))
7413 && CONST_INT_P (op1
)
7414 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0
, 1))),
7417 /* This is a UBFM/SBFM. */
7418 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, ZERO_EXTRACT
, 0, speed
);
7420 *cost
+= extra_cost
->alu
.bfx
;
7424 if (GET_MODE_CLASS (mode
) == MODE_INT
)
7426 if (CONST_INT_P (op1
))
7428 /* We have a mask + shift version of a UBFIZ
7429 i.e. the *andim_ashift<mode>_bfiz pattern. */
7430 if (GET_CODE (op0
) == ASHIFT
7431 && aarch64_mask_and_shift_for_ubfiz_p (mode
, op1
,
7434 *cost
+= rtx_cost (XEXP (op0
, 0), mode
,
7435 (enum rtx_code
) code
, 0, speed
);
7437 *cost
+= extra_cost
->alu
.bfx
;
7441 else if (aarch64_bitmask_imm (INTVAL (op1
), mode
))
7443 /* We possibly get the immediate for free, this is not
7445 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
7447 *cost
+= extra_cost
->alu
.logical
;
7456 /* Handle ORN, EON, or BIC. */
7457 if (GET_CODE (op0
) == NOT
)
7458 op0
= XEXP (op0
, 0);
7460 new_op0
= aarch64_strip_shift (op0
);
7462 /* If we had a shift on op0 then this is a logical-shift-
7463 by-register/immediate operation. Otherwise, this is just
7464 a logical operation. */
7469 /* Shift by immediate. */
7470 if (CONST_INT_P (XEXP (op0
, 1)))
7471 *cost
+= extra_cost
->alu
.log_shift
;
7473 *cost
+= extra_cost
->alu
.log_shift_reg
;
7476 *cost
+= extra_cost
->alu
.logical
;
7479 /* In both cases we want to cost both operands. */
7480 *cost
+= rtx_cost (new_op0
, mode
, (enum rtx_code
) code
, 0, speed
);
7481 *cost
+= rtx_cost (op1
, mode
, (enum rtx_code
) code
, 1, speed
);
7490 op0
= aarch64_strip_shift (x
);
7492 if (VECTOR_MODE_P (mode
))
7495 *cost
+= extra_cost
->vect
.alu
;
7499 /* MVN-shifted-reg. */
7502 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
7505 *cost
+= extra_cost
->alu
.log_shift
;
7509 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7510 Handle the second form here taking care that 'a' in the above can
7512 else if (GET_CODE (op0
) == XOR
)
7514 rtx newop0
= XEXP (op0
, 0);
7515 rtx newop1
= XEXP (op0
, 1);
7516 rtx op0_stripped
= aarch64_strip_shift (newop0
);
7518 *cost
+= rtx_cost (newop1
, mode
, (enum rtx_code
) code
, 1, speed
);
7519 *cost
+= rtx_cost (op0_stripped
, mode
, XOR
, 0, speed
);
7523 if (op0_stripped
!= newop0
)
7524 *cost
+= extra_cost
->alu
.log_shift
;
7526 *cost
+= extra_cost
->alu
.logical
;
7533 *cost
+= extra_cost
->alu
.logical
;
7540 /* If a value is written in SI mode, then zero extended to DI
7541 mode, the operation will in general be free as a write to
7542 a 'w' register implicitly zeroes the upper bits of an 'x'
7543 register. However, if this is
7545 (set (reg) (zero_extend (reg)))
7547 we must cost the explicit register move. */
7549 && GET_MODE (op0
) == SImode
7552 int op_cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, 0, speed
);
7554 /* If OP_COST is non-zero, then the cost of the zero extend
7555 is effectively the cost of the inner operation. Otherwise
7556 we have a MOV instruction and we take the cost from the MOV
7557 itself. This is true independently of whether we are
7558 optimizing for space or time. */
7564 else if (MEM_P (op0
))
7566 /* All loads can zero extend to any size for free. */
7567 *cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, param
, speed
);
7571 op0
= aarch64_extend_bitfield_pattern_p (x
);
7574 *cost
+= rtx_cost (op0
, mode
, ZERO_EXTEND
, 0, speed
);
7576 *cost
+= extra_cost
->alu
.bfx
;
7582 if (VECTOR_MODE_P (mode
))
7585 *cost
+= extra_cost
->vect
.alu
;
7589 /* We generate an AND instead of UXTB/UXTH. */
7590 *cost
+= extra_cost
->alu
.logical
;
7596 if (MEM_P (XEXP (x
, 0)))
7601 rtx address
= XEXP (XEXP (x
, 0), 0);
7602 *cost
+= extra_cost
->ldst
.load_sign_extend
;
7605 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
7611 op0
= aarch64_extend_bitfield_pattern_p (x
);
7614 *cost
+= rtx_cost (op0
, mode
, SIGN_EXTEND
, 0, speed
);
7616 *cost
+= extra_cost
->alu
.bfx
;
7622 if (VECTOR_MODE_P (mode
))
7623 *cost
+= extra_cost
->vect
.alu
;
7625 *cost
+= extra_cost
->alu
.extend
;
7633 if (CONST_INT_P (op1
))
7637 if (VECTOR_MODE_P (mode
))
7639 /* Vector shift (immediate). */
7640 *cost
+= extra_cost
->vect
.alu
;
7644 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
7646 *cost
+= extra_cost
->alu
.shift
;
7650 /* We can incorporate zero/sign extend for free. */
7651 if (GET_CODE (op0
) == ZERO_EXTEND
7652 || GET_CODE (op0
) == SIGN_EXTEND
)
7653 op0
= XEXP (op0
, 0);
7655 *cost
+= rtx_cost (op0
, VOIDmode
, ASHIFT
, 0, speed
);
7660 if (VECTOR_MODE_P (mode
))
7663 /* Vector shift (register). */
7664 *cost
+= extra_cost
->vect
.alu
;
7670 *cost
+= extra_cost
->alu
.shift_reg
;
7672 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
7673 && CONST_INT_P (XEXP (op1
, 1))
7674 && INTVAL (XEXP (op1
, 1)) == GET_MODE_BITSIZE (mode
) - 1)
7676 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
7677 /* We already demanded XEXP (op1, 0) to be REG_P, so
7678 don't recurse into it. */
7682 return false; /* All arguments need to be in registers. */
7692 if (CONST_INT_P (op1
))
7694 /* ASR (immediate) and friends. */
7697 if (VECTOR_MODE_P (mode
))
7698 *cost
+= extra_cost
->vect
.alu
;
7700 *cost
+= extra_cost
->alu
.shift
;
7703 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
7708 if (VECTOR_MODE_P (mode
))
7711 /* Vector shift (register). */
7712 *cost
+= extra_cost
->vect
.alu
;
7717 /* ASR (register) and friends. */
7718 *cost
+= extra_cost
->alu
.shift_reg
;
7720 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
7721 && CONST_INT_P (XEXP (op1
, 1))
7722 && INTVAL (XEXP (op1
, 1)) == GET_MODE_BITSIZE (mode
) - 1)
7724 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
7725 /* We already demanded XEXP (op1, 0) to be REG_P, so
7726 don't recurse into it. */
7730 return false; /* All arguments need to be in registers. */
7735 if (aarch64_cmodel
== AARCH64_CMODEL_LARGE
7736 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
)
7740 *cost
+= extra_cost
->ldst
.load
;
7742 else if (aarch64_cmodel
== AARCH64_CMODEL_SMALL
7743 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_PIC
)
7745 /* ADRP, followed by ADD. */
7746 *cost
+= COSTS_N_INSNS (1);
7748 *cost
+= 2 * extra_cost
->alu
.arith
;
7750 else if (aarch64_cmodel
== AARCH64_CMODEL_TINY
7751 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
7755 *cost
+= extra_cost
->alu
.arith
;
7760 /* One extra load instruction, after accessing the GOT. */
7761 *cost
+= COSTS_N_INSNS (1);
7763 *cost
+= extra_cost
->ldst
.load
;
7769 /* ADRP/ADD (immediate). */
7771 *cost
+= extra_cost
->alu
.arith
;
7779 if (VECTOR_MODE_P (mode
))
7780 *cost
+= extra_cost
->vect
.alu
;
7782 *cost
+= extra_cost
->alu
.bfx
;
7785 /* We can trust that the immediates used will be correct (there
7786 are no by-register forms), so we need only cost op0. */
7787 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (enum rtx_code
) code
, 0, speed
);
7791 *cost
+= aarch64_rtx_mult_cost (x
, MULT
, 0, speed
);
7792 /* aarch64_rtx_mult_cost always handles recursion to its
7797 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7798 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
7799 an unconditional negate. This case should only ever be reached through
7800 the set_smod_pow2_cheap check in expmed.c. */
7801 if (CONST_INT_P (XEXP (x
, 1))
7802 && exact_log2 (INTVAL (XEXP (x
, 1))) > 0
7803 && (mode
== SImode
|| mode
== DImode
))
7805 /* We expand to 4 instructions. Reset the baseline. */
7806 *cost
= COSTS_N_INSNS (4);
7809 *cost
+= 2 * extra_cost
->alu
.logical
7810 + 2 * extra_cost
->alu
.arith
;
7819 /* Slightly prefer UMOD over SMOD. */
7820 if (VECTOR_MODE_P (mode
))
7821 *cost
+= extra_cost
->vect
.alu
;
7822 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
7823 *cost
+= (extra_cost
->mult
[mode
== DImode
].add
7824 + extra_cost
->mult
[mode
== DImode
].idiv
7825 + (code
== MOD
? 1 : 0));
7827 return false; /* All arguments need to be in registers. */
7834 if (VECTOR_MODE_P (mode
))
7835 *cost
+= extra_cost
->vect
.alu
;
7836 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
7837 /* There is no integer SQRT, so only DIV and UDIV can get
7839 *cost
+= (extra_cost
->mult
[mode
== DImode
].idiv
7840 /* Slightly prefer UDIV over SDIV. */
7841 + (code
== DIV
? 1 : 0));
7843 *cost
+= extra_cost
->fp
[mode
== DFmode
].div
;
7845 return false; /* All arguments need to be in registers. */
7848 return aarch64_if_then_else_costs (XEXP (x
, 0), XEXP (x
, 1),
7849 XEXP (x
, 2), cost
, speed
);
7862 return false; /* All arguments must be in registers. */
7871 if (VECTOR_MODE_P (mode
))
7872 *cost
+= extra_cost
->vect
.alu
;
7874 *cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
7877 /* FMSUB, FNMADD, and FNMSUB are free. */
7878 if (GET_CODE (op0
) == NEG
)
7879 op0
= XEXP (op0
, 0);
7881 if (GET_CODE (op2
) == NEG
)
7882 op2
= XEXP (op2
, 0);
7884 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7885 and the by-element operand as operand 0. */
7886 if (GET_CODE (op1
) == NEG
)
7887 op1
= XEXP (op1
, 0);
7889 /* Catch vector-by-element operations. The by-element operand can
7890 either be (vec_duplicate (vec_select (x))) or just
7891 (vec_select (x)), depending on whether we are multiplying by
7892 a vector or a scalar.
7894 Canonicalization is not very good in these cases, FMA4 will put the
7895 by-element operand as operand 0, FNMA4 will have it as operand 1. */
7896 if (GET_CODE (op0
) == VEC_DUPLICATE
)
7897 op0
= XEXP (op0
, 0);
7898 else if (GET_CODE (op1
) == VEC_DUPLICATE
)
7899 op1
= XEXP (op1
, 0);
7901 if (GET_CODE (op0
) == VEC_SELECT
)
7902 op0
= XEXP (op0
, 0);
7903 else if (GET_CODE (op1
) == VEC_SELECT
)
7904 op1
= XEXP (op1
, 0);
7906 /* If the remaining parameters are not registers,
7907 get the cost to put them into registers. */
7908 *cost
+= rtx_cost (op0
, mode
, FMA
, 0, speed
);
7909 *cost
+= rtx_cost (op1
, mode
, FMA
, 1, speed
);
7910 *cost
+= rtx_cost (op2
, mode
, FMA
, 2, speed
);
7914 case UNSIGNED_FLOAT
:
7916 *cost
+= extra_cost
->fp
[mode
== DFmode
].fromint
;
7922 if (VECTOR_MODE_P (mode
))
7924 /*Vector truncate. */
7925 *cost
+= extra_cost
->vect
.alu
;
7928 *cost
+= extra_cost
->fp
[mode
== DFmode
].widen
;
7932 case FLOAT_TRUNCATE
:
7935 if (VECTOR_MODE_P (mode
))
7937 /*Vector conversion. */
7938 *cost
+= extra_cost
->vect
.alu
;
7941 *cost
+= extra_cost
->fp
[mode
== DFmode
].narrow
;
7948 /* Strip the rounding part. They will all be implemented
7949 by the fcvt* family of instructions anyway. */
7950 if (GET_CODE (x
) == UNSPEC
)
7952 unsigned int uns_code
= XINT (x
, 1);
7954 if (uns_code
== UNSPEC_FRINTA
7955 || uns_code
== UNSPEC_FRINTM
7956 || uns_code
== UNSPEC_FRINTN
7957 || uns_code
== UNSPEC_FRINTP
7958 || uns_code
== UNSPEC_FRINTZ
)
7959 x
= XVECEXP (x
, 0, 0);
7964 if (VECTOR_MODE_P (mode
))
7965 *cost
+= extra_cost
->vect
.alu
;
7967 *cost
+= extra_cost
->fp
[GET_MODE (x
) == DFmode
].toint
;
7970 /* We can combine fmul by a power of 2 followed by a fcvt into a single
7971 fixed-point fcvt. */
7972 if (GET_CODE (x
) == MULT
7973 && ((VECTOR_MODE_P (mode
)
7974 && aarch64_vec_fpconst_pow_of_2 (XEXP (x
, 1)) > 0)
7975 || aarch64_fpconst_pow_of_2 (XEXP (x
, 1)) > 0))
7977 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (rtx_code
) code
,
7982 *cost
+= rtx_cost (x
, VOIDmode
, (enum rtx_code
) code
, 0, speed
);
7986 if (VECTOR_MODE_P (mode
))
7990 *cost
+= extra_cost
->vect
.alu
;
7992 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
7996 /* FABD, which is analogous to FADD. */
7997 if (GET_CODE (op0
) == MINUS
)
7999 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, MINUS
, 0, speed
);
8000 *cost
+= rtx_cost (XEXP (op0
, 1), mode
, MINUS
, 1, speed
);
8002 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
8006 /* Simple FABS is analogous to FNEG. */
8008 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
8012 /* Integer ABS will either be split to
8013 two arithmetic instructions, or will be an ABS
8014 (scalar), which we don't model. */
8015 *cost
= COSTS_N_INSNS (2);
8017 *cost
+= 2 * extra_cost
->alu
.arith
;
8025 if (VECTOR_MODE_P (mode
))
8026 *cost
+= extra_cost
->vect
.alu
;
8029 /* FMAXNM/FMINNM/FMAX/FMIN.
8030 TODO: This may not be accurate for all implementations, but
8031 we do not model this in the cost tables. */
8032 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
8038 /* The floating point round to integer frint* instructions. */
8039 if (aarch64_frint_unspec_p (XINT (x
, 1)))
8042 *cost
+= extra_cost
->fp
[mode
== DFmode
].roundint
;
8047 if (XINT (x
, 1) == UNSPEC_RBIT
)
8050 *cost
+= extra_cost
->alu
.rev
;
8058 /* Decompose <su>muldi3_highpart. */
8059 if (/* (truncate:DI */
8062 && GET_MODE (XEXP (x
, 0)) == TImode
8063 && GET_CODE (XEXP (x
, 0)) == LSHIFTRT
8065 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == MULT
8066 /* (ANY_EXTEND:TI (reg:DI))
8067 (ANY_EXTEND:TI (reg:DI))) */
8068 && ((GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == ZERO_EXTEND
8069 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == ZERO_EXTEND
)
8070 || (GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == SIGN_EXTEND
8071 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == SIGN_EXTEND
))
8072 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0)) == DImode
8073 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0)) == DImode
8074 /* (const_int 64) */
8075 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
8076 && UINTVAL (XEXP (XEXP (x
, 0), 1)) == 64)
8080 *cost
+= extra_cost
->mult
[mode
== DImode
].extend
;
8081 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0),
8082 mode
, MULT
, 0, speed
);
8083 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0),
8084 mode
, MULT
, 1, speed
);
8094 && flag_aarch64_verbose_cost
)
8096 "\nFailed to cost RTX. Assuming default cost.\n");
8101 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
8102 calculated for X. This cost is stored in *COST. Returns true
8103 if the total cost of X was calculated. */
8105 aarch64_rtx_costs_wrapper (rtx x
, machine_mode mode
, int outer
,
8106 int param
, int *cost
, bool speed
)
8108 bool result
= aarch64_rtx_costs (x
, mode
, outer
, param
, cost
, speed
);
8111 && flag_aarch64_verbose_cost
)
8113 print_rtl_single (dump_file
, x
);
8114 fprintf (dump_file
, "\n%s cost: %d (%s)\n",
8115 speed
? "Hot" : "Cold",
8116 *cost
, result
? "final" : "partial");
8123 aarch64_register_move_cost (machine_mode mode
,
8124 reg_class_t from_i
, reg_class_t to_i
)
8126 enum reg_class from
= (enum reg_class
) from_i
;
8127 enum reg_class to
= (enum reg_class
) to_i
;
8128 const struct cpu_regmove_cost
*regmove_cost
8129 = aarch64_tune_params
.regmove_cost
;
8131 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
8132 if (to
== CALLER_SAVE_REGS
|| to
== POINTER_REGS
)
8135 if (from
== CALLER_SAVE_REGS
|| from
== POINTER_REGS
)
8136 from
= GENERAL_REGS
;
8138 /* Moving between GPR and stack cost is the same as GP2GP. */
8139 if ((from
== GENERAL_REGS
&& to
== STACK_REG
)
8140 || (to
== GENERAL_REGS
&& from
== STACK_REG
))
8141 return regmove_cost
->GP2GP
;
8143 /* To/From the stack register, we move via the gprs. */
8144 if (to
== STACK_REG
|| from
== STACK_REG
)
8145 return aarch64_register_move_cost (mode
, from
, GENERAL_REGS
)
8146 + aarch64_register_move_cost (mode
, GENERAL_REGS
, to
);
8148 if (GET_MODE_SIZE (mode
) == 16)
8150 /* 128-bit operations on general registers require 2 instructions. */
8151 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
8152 return regmove_cost
->GP2GP
* 2;
8153 else if (from
== GENERAL_REGS
)
8154 return regmove_cost
->GP2FP
* 2;
8155 else if (to
== GENERAL_REGS
)
8156 return regmove_cost
->FP2GP
* 2;
8158 /* When AdvSIMD instructions are disabled it is not possible to move
8159 a 128-bit value directly between Q registers. This is handled in
8160 secondary reload. A general register is used as a scratch to move
8161 the upper DI value and the lower DI value is moved directly,
8162 hence the cost is the sum of three moves. */
8164 return regmove_cost
->GP2FP
+ regmove_cost
->FP2GP
+ regmove_cost
->FP2FP
;
8166 return regmove_cost
->FP2FP
;
8169 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
8170 return regmove_cost
->GP2GP
;
8171 else if (from
== GENERAL_REGS
)
8172 return regmove_cost
->GP2FP
;
8173 else if (to
== GENERAL_REGS
)
8174 return regmove_cost
->FP2GP
;
8176 return regmove_cost
->FP2FP
;
8180 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED
,
8181 reg_class_t rclass ATTRIBUTE_UNUSED
,
8182 bool in ATTRIBUTE_UNUSED
)
8184 return aarch64_tune_params
.memmov_cost
;
8187 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
8188 to optimize 1.0/sqrt. */
8191 use_rsqrt_p (machine_mode mode
)
8193 return (!flag_trapping_math
8194 && flag_unsafe_math_optimizations
8195 && ((aarch64_tune_params
.approx_modes
->recip_sqrt
8196 & AARCH64_APPROX_MODE (mode
))
8197 || flag_mrecip_low_precision_sqrt
));
8200 /* Function to decide when to use the approximate reciprocal square root
8204 aarch64_builtin_reciprocal (tree fndecl
)
8206 machine_mode mode
= TYPE_MODE (TREE_TYPE (fndecl
));
8208 if (!use_rsqrt_p (mode
))
8210 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl
));
8213 typedef rtx (*rsqrte_type
) (rtx
, rtx
);
8215 /* Select reciprocal square root initial estimate insn depending on machine
8219 get_rsqrte_type (machine_mode mode
)
8223 case DFmode
: return gen_aarch64_rsqrtedf
;
8224 case SFmode
: return gen_aarch64_rsqrtesf
;
8225 case V2DFmode
: return gen_aarch64_rsqrtev2df
;
8226 case V2SFmode
: return gen_aarch64_rsqrtev2sf
;
8227 case V4SFmode
: return gen_aarch64_rsqrtev4sf
;
8228 default: gcc_unreachable ();
8232 typedef rtx (*rsqrts_type
) (rtx
, rtx
, rtx
);
8234 /* Select reciprocal square root series step insn depending on machine mode. */
8237 get_rsqrts_type (machine_mode mode
)
8241 case DFmode
: return gen_aarch64_rsqrtsdf
;
8242 case SFmode
: return gen_aarch64_rsqrtssf
;
8243 case V2DFmode
: return gen_aarch64_rsqrtsv2df
;
8244 case V2SFmode
: return gen_aarch64_rsqrtsv2sf
;
8245 case V4SFmode
: return gen_aarch64_rsqrtsv4sf
;
8246 default: gcc_unreachable ();
8250 /* Emit instruction sequence to compute either the approximate square root
8251 or its approximate reciprocal, depending on the flag RECP, and return
8252 whether the sequence was emitted or not. */
8255 aarch64_emit_approx_sqrt (rtx dst
, rtx src
, bool recp
)
8257 machine_mode mode
= GET_MODE (dst
);
8259 if (GET_MODE_INNER (mode
) == HFmode
)
8266 = mode_for_vector (int_mode_for_mode (GET_MODE_INNER (mode
)),
8267 GET_MODE_NUNITS (mode
));
8270 if (!(flag_mlow_precision_sqrt
8271 || (aarch64_tune_params
.approx_modes
->sqrt
8272 & AARCH64_APPROX_MODE (mode
))))
8275 if (flag_finite_math_only
8276 || flag_trapping_math
8277 || !flag_unsafe_math_optimizations
8278 || optimize_function_for_size_p (cfun
))
8282 /* Caller assumes we cannot fail. */
8283 gcc_assert (use_rsqrt_p (mode
));
8286 rtx xmsk
= gen_reg_rtx (mmsk
);
8288 /* When calculating the approximate square root, compare the
8289 argument with 0.0 and create a mask. */
8290 emit_insn (gen_rtx_SET (xmsk
,
8292 gen_rtx_EQ (mmsk
, src
,
8293 CONST0_RTX (mode
)))));
8295 /* Estimate the approximate reciprocal square root. */
8296 rtx xdst
= gen_reg_rtx (mode
);
8297 emit_insn ((*get_rsqrte_type (mode
)) (xdst
, src
));
8299 /* Iterate over the series twice for SF and thrice for DF. */
8300 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
8302 /* Optionally iterate over the series once less for faster performance
8303 while sacrificing the accuracy. */
8304 if ((recp
&& flag_mrecip_low_precision_sqrt
)
8305 || (!recp
&& flag_mlow_precision_sqrt
))
8308 /* Iterate over the series to calculate the approximate reciprocal square
8310 rtx x1
= gen_reg_rtx (mode
);
8311 while (iterations
--)
8313 rtx x2
= gen_reg_rtx (mode
);
8314 emit_set_insn (x2
, gen_rtx_MULT (mode
, xdst
, xdst
));
8316 emit_insn ((*get_rsqrts_type (mode
)) (x1
, src
, x2
));
8319 emit_set_insn (xdst
, gen_rtx_MULT (mode
, xdst
, x1
));
8324 /* Qualify the approximate reciprocal square root when the argument is
8325 0.0 by squashing the intermediary result to 0.0. */
8326 rtx xtmp
= gen_reg_rtx (mmsk
);
8327 emit_set_insn (xtmp
, gen_rtx_AND (mmsk
, gen_rtx_NOT (mmsk
, xmsk
),
8328 gen_rtx_SUBREG (mmsk
, xdst
, 0)));
8329 emit_move_insn (xdst
, gen_rtx_SUBREG (mode
, xtmp
, 0));
8331 /* Calculate the approximate square root. */
8332 emit_set_insn (xdst
, gen_rtx_MULT (mode
, xdst
, src
));
8335 /* Finalize the approximation. */
8336 emit_set_insn (dst
, gen_rtx_MULT (mode
, xdst
, x1
));
8341 typedef rtx (*recpe_type
) (rtx
, rtx
);
8343 /* Select reciprocal initial estimate insn depending on machine mode. */
8346 get_recpe_type (machine_mode mode
)
8350 case SFmode
: return (gen_aarch64_frecpesf
);
8351 case V2SFmode
: return (gen_aarch64_frecpev2sf
);
8352 case V4SFmode
: return (gen_aarch64_frecpev4sf
);
8353 case DFmode
: return (gen_aarch64_frecpedf
);
8354 case V2DFmode
: return (gen_aarch64_frecpev2df
);
8355 default: gcc_unreachable ();
8359 typedef rtx (*recps_type
) (rtx
, rtx
, rtx
);
8361 /* Select reciprocal series step insn depending on machine mode. */
8364 get_recps_type (machine_mode mode
)
8368 case SFmode
: return (gen_aarch64_frecpssf
);
8369 case V2SFmode
: return (gen_aarch64_frecpsv2sf
);
8370 case V4SFmode
: return (gen_aarch64_frecpsv4sf
);
8371 case DFmode
: return (gen_aarch64_frecpsdf
);
8372 case V2DFmode
: return (gen_aarch64_frecpsv2df
);
8373 default: gcc_unreachable ();
8377 /* Emit the instruction sequence to compute the approximation for the division
8378 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
8381 aarch64_emit_approx_div (rtx quo
, rtx num
, rtx den
)
8383 machine_mode mode
= GET_MODE (quo
);
8385 if (GET_MODE_INNER (mode
) == HFmode
)
8388 bool use_approx_division_p
= (flag_mlow_precision_div
8389 || (aarch64_tune_params
.approx_modes
->division
8390 & AARCH64_APPROX_MODE (mode
)));
8392 if (!flag_finite_math_only
8393 || flag_trapping_math
8394 || !flag_unsafe_math_optimizations
8395 || optimize_function_for_size_p (cfun
)
8396 || !use_approx_division_p
)
8399 /* Estimate the approximate reciprocal. */
8400 rtx xrcp
= gen_reg_rtx (mode
);
8401 emit_insn ((*get_recpe_type (mode
)) (xrcp
, den
));
8403 /* Iterate over the series twice for SF and thrice for DF. */
8404 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
8406 /* Optionally iterate over the series once less for faster performance,
8407 while sacrificing the accuracy. */
8408 if (flag_mlow_precision_div
)
8411 /* Iterate over the series to calculate the approximate reciprocal. */
8412 rtx xtmp
= gen_reg_rtx (mode
);
8413 while (iterations
--)
8415 emit_insn ((*get_recps_type (mode
)) (xtmp
, xrcp
, den
));
8418 emit_set_insn (xrcp
, gen_rtx_MULT (mode
, xrcp
, xtmp
));
8421 if (num
!= CONST1_RTX (mode
))
8423 /* As the approximate reciprocal of DEN is already calculated, only
8424 calculate the approximate division when NUM is not 1.0. */
8425 rtx xnum
= force_reg (mode
, num
);
8426 emit_set_insn (xrcp
, gen_rtx_MULT (mode
, xrcp
, xnum
));
8429 /* Finalize the approximation. */
8430 emit_set_insn (quo
, gen_rtx_MULT (mode
, xrcp
, xtmp
));
8434 /* Return the number of instructions that can be issued per cycle. */
8436 aarch64_sched_issue_rate (void)
8438 return aarch64_tune_params
.issue_rate
;
8442 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8444 int issue_rate
= aarch64_sched_issue_rate ();
8446 return issue_rate
> 1 && !sched_fusion
? issue_rate
: 0;
8450 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8451 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
8452 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
8455 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn
*insn
,
8458 return autopref_multipass_dfa_lookahead_guard (insn
, ready_index
);
8462 /* Vectorizer cost model target hooks. */
8464 /* Implement targetm.vectorize.builtin_vectorization_cost. */
8466 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost
,
8468 int misalign ATTRIBUTE_UNUSED
)
8471 const cpu_vector_cost
*costs
= aarch64_tune_params
.vec_costs
;
8474 if (vectype
!= NULL
)
8475 fp
= FLOAT_TYPE_P (vectype
);
8477 switch (type_of_cost
)
8480 return fp
? costs
->scalar_fp_stmt_cost
: costs
->scalar_int_stmt_cost
;
8483 return costs
->scalar_load_cost
;
8486 return costs
->scalar_store_cost
;
8489 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
8492 return costs
->vec_align_load_cost
;
8495 return costs
->vec_store_cost
;
8498 return costs
->vec_to_scalar_cost
;
8501 return costs
->scalar_to_vec_cost
;
8503 case unaligned_load
:
8504 return costs
->vec_unalign_load_cost
;
8506 case unaligned_store
:
8507 return costs
->vec_unalign_store_cost
;
8509 case cond_branch_taken
:
8510 return costs
->cond_taken_branch_cost
;
8512 case cond_branch_not_taken
:
8513 return costs
->cond_not_taken_branch_cost
;
8516 return costs
->vec_permute_cost
;
8518 case vec_promote_demote
:
8519 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
8522 elements
= TYPE_VECTOR_SUBPARTS (vectype
);
8523 return elements
/ 2 + 1;
8530 /* Implement targetm.vectorize.add_stmt_cost. */
8532 aarch64_add_stmt_cost (void *data
, int count
, enum vect_cost_for_stmt kind
,
8533 struct _stmt_vec_info
*stmt_info
, int misalign
,
8534 enum vect_cost_model_location where
)
8536 unsigned *cost
= (unsigned *) data
;
8537 unsigned retval
= 0;
8539 if (flag_vect_cost_model
)
8541 tree vectype
= stmt_info
? stmt_vectype (stmt_info
) : NULL_TREE
;
8543 aarch64_builtin_vectorization_cost (kind
, vectype
, misalign
);
8545 /* Statements in an inner loop relative to the loop being
8546 vectorized are weighted more heavily. The value here is
8547 arbitrary and could potentially be improved with analysis. */
8548 if (where
== vect_body
&& stmt_info
&& stmt_in_inner_loop_p (stmt_info
))
8549 count
*= 50; /* FIXME */
8551 retval
= (unsigned) (count
* stmt_cost
);
8552 cost
[where
] += retval
;
/* Forward declaration; the definition appears later in this file.  */
8558 static void initialize_aarch64_code_model (struct gcc_options
*);
8560 /* Parse the TO_PARSE string and put the architecture struct that it
8561 selects into RES and the architectural features into ISA_FLAGS.
8562 Return an aarch64_parse_opt_result describing the parse result.
8563 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
8565 static enum aarch64_parse_opt_result
8566 aarch64_parse_arch (const char *to_parse
, const struct processor
**res
,
8567 unsigned long *isa_flags
)
8570 const struct processor
*arch
;
8571 char *str
= (char *) alloca (strlen (to_parse
) + 1);
8574 strcpy (str
, to_parse
);
8576 ext
= strchr (str
, '+');
8584 return AARCH64_PARSE_MISSING_ARG
;
8587 /* Loop through the list of supported ARCHes to find a match. */
8588 for (arch
= all_architectures
; arch
->name
!= NULL
; arch
++)
8590 if (strlen (arch
->name
) == len
&& strncmp (arch
->name
, str
, len
) == 0)
8592 unsigned long isa_temp
= arch
->flags
;
8596 /* TO_PARSE string contains at least one extension. */
8597 enum aarch64_parse_opt_result ext_res
8598 = aarch64_parse_extension (ext
, &isa_temp
);
8600 if (ext_res
!= AARCH64_PARSE_OK
)
8603 /* Extension parsing was successful. Confirm the result
8604 arch and ISA flags. */
8606 *isa_flags
= isa_temp
;
8607 return AARCH64_PARSE_OK
;
8611 /* ARCH name not found in list. */
8612 return AARCH64_PARSE_INVALID_ARG
;
8615 /* Parse the TO_PARSE string and put the result tuning in RES and the
8616 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
8617 describing the parse result. If there is an error parsing, RES and
8618 ISA_FLAGS are left unchanged. */
8620 static enum aarch64_parse_opt_result
8621 aarch64_parse_cpu (const char *to_parse
, const struct processor
**res
,
8622 unsigned long *isa_flags
)
8625 const struct processor
*cpu
;
8626 char *str
= (char *) alloca (strlen (to_parse
) + 1);
8629 strcpy (str
, to_parse
);
8631 ext
= strchr (str
, '+');
8639 return AARCH64_PARSE_MISSING_ARG
;
8642 /* Loop through the list of supported CPUs to find a match. */
8643 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
8645 if (strlen (cpu
->name
) == len
&& strncmp (cpu
->name
, str
, len
) == 0)
8647 unsigned long isa_temp
= cpu
->flags
;
8652 /* TO_PARSE string contains at least one extension. */
8653 enum aarch64_parse_opt_result ext_res
8654 = aarch64_parse_extension (ext
, &isa_temp
);
8656 if (ext_res
!= AARCH64_PARSE_OK
)
8659 /* Extension parsing was successfull. Confirm the result
8660 cpu and ISA flags. */
8662 *isa_flags
= isa_temp
;
8663 return AARCH64_PARSE_OK
;
8667 /* CPU name not found in list. */
8668 return AARCH64_PARSE_INVALID_ARG
;
8671 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8672 Return an aarch64_parse_opt_result describing the parse result.
8673 If the parsing fails the RES does not change. */
8675 static enum aarch64_parse_opt_result
8676 aarch64_parse_tune (const char *to_parse
, const struct processor
**res
)
8678 const struct processor
*cpu
;
8679 char *str
= (char *) alloca (strlen (to_parse
) + 1);
8681 strcpy (str
, to_parse
);
8683 /* Loop through the list of supported CPUs to find a match. */
8684 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
8686 if (strcmp (cpu
->name
, str
) == 0)
8689 return AARCH64_PARSE_OK
;
8693 /* CPU name not found in list. */
8694 return AARCH64_PARSE_INVALID_ARG
;
8697 /* Parse TOKEN, which has length LENGTH to see if it is an option
8698 described in FLAG. If it is, return the index bit for that fusion type.
8699 If not, error (printing OPTION_NAME) and return zero. */
8702 aarch64_parse_one_option_token (const char *token
,
8704 const struct aarch64_flag_desc
*flag
,
8705 const char *option_name
)
8707 for (; flag
->name
!= NULL
; flag
++)
8709 if (length
== strlen (flag
->name
)
8710 && !strncmp (flag
->name
, token
, length
))
8714 error ("unknown flag passed in -moverride=%s (%s)", option_name
, token
);
/* Parse OPTION which is a '.'-separated list of flags to enable.
   FLAGS gives the list of flags we understand, INITIAL_STATE gives any
   default state we inherit from the CPU tuning structures.  OPTION_NAME
   gives the top-level option we are parsing in the -moverride string,
   for use in error messages.  */
static unsigned int
aarch64_parse_boolean_options (const char *option,
			       const struct aarch64_flag_desc *flags,
			       unsigned int initial_state,
			       const char *option_name)
{
  const char separator = '.';
  const char* specs = option;
  const char* ntoken = option;
  unsigned int found_flags = initial_state;

  while ((ntoken = strchr (specs, separator)))
    {
      size_t token_length = ntoken - specs;
      unsigned token_ops = aarch64_parse_one_option_token (specs,
							   token_length,
							   flags,
							   option_name);
      /* If we find "none" (or, for simplicity's sake, an error) anywhere
	 in the token stream, reset the supported operations.  So:

	   adrp+add.cmp+branch.none.adrp+add

	   would have the result of turning on only adrp+add fusion.  */
      if (!token_ops)
	found_flags = 0;

      found_flags |= token_ops;
      specs = ++ntoken;
    }

  /* We ended with a separator ('.') and no trailing token — print
     something.  (Comment fixed: the separator here is '.', not a
     comma.)  */
  if (!(*specs))
    {
      error ("%s string ill-formed\n", option_name);
      return 0;
    }

  /* We still have one more token to parse.  */
  size_t token_length = strlen (specs);
  unsigned token_ops = aarch64_parse_one_option_token (specs,
						       token_length,
						       flags,
						       option_name);
  if (!token_ops)
    found_flags = 0;

  found_flags |= token_ops;
  return found_flags;
}
8775 /* Support for overriding instruction fusion. */
8778 aarch64_parse_fuse_string (const char *fuse_string
,
8779 struct tune_params
*tune
)
8781 tune
->fusible_ops
= aarch64_parse_boolean_options (fuse_string
,
8782 aarch64_fusible_pairs
,
8787 /* Support for overriding other tuning flags. */
8790 aarch64_parse_tune_string (const char *tune_string
,
8791 struct tune_params
*tune
)
8793 tune
->extra_tuning_flags
8794 = aarch64_parse_boolean_options (tune_string
,
8795 aarch64_tuning_flags
,
8796 tune
->extra_tuning_flags
,
8800 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
8801 we understand. If it is, extract the option string and handoff to
8802 the appropriate function. */
8805 aarch64_parse_one_override_token (const char* token
,
8807 struct tune_params
*tune
)
8809 const struct aarch64_tuning_override_function
*fn
8810 = aarch64_tuning_override_functions
;
8812 const char *option_part
= strchr (token
, '=');
8815 error ("tuning string missing in option (%s)", token
);
8819 /* Get the length of the option name. */
8820 length
= option_part
- token
;
8821 /* Skip the '=' to get to the option string. */
8824 for (; fn
->name
!= NULL
; fn
++)
8826 if (!strncmp (fn
->name
, token
, length
))
8828 fn
->parse_override (option_part
, tune
);
8833 error ("unknown tuning option (%s)",token
);
8837 /* A checking mechanism for the implementation of the tls size. */
8840 initialize_aarch64_tls_size (struct gcc_options
*opts
)
8842 if (aarch64_tls_size
== 0)
8843 aarch64_tls_size
= 24;
8845 switch (opts
->x_aarch64_cmodel_var
)
8847 case AARCH64_CMODEL_TINY
:
8848 /* Both the default and maximum TLS size allowed under tiny is 1M which
8849 needs two instructions to address, so we clamp the size to 24. */
8850 if (aarch64_tls_size
> 24)
8851 aarch64_tls_size
= 24;
8853 case AARCH64_CMODEL_SMALL
:
8854 /* The maximum TLS size allowed under small is 4G. */
8855 if (aarch64_tls_size
> 32)
8856 aarch64_tls_size
= 32;
8858 case AARCH64_CMODEL_LARGE
:
8859 /* The maximum TLS size allowed under large is 16E.
8860 FIXME: 16E should be 64bit, we only support 48bit offset now. */
8861 if (aarch64_tls_size
> 48)
8862 aarch64_tls_size
= 48;
/* Parse STRING looking for options in the format:
     string	:: option:string
     option	:: name=substring
   where name is one of the tuning-override names and
   substring is defined by the option.  */
static void
aarch64_parse_override_string (const char* input_string,
			       struct tune_params* tune)
{
  const char separator = ':';
  size_t string_length = strlen (input_string) + 1;
  /* Take a writable copy: tokens are NUL-terminated in place.  */
  char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
  char *string = string_root;
  strncpy (string, input_string, string_length);
  string[string_length - 1] = '\0';

  char* ntoken = string;

  while ((ntoken = strchr (string, separator)))
    {
      size_t token_length = ntoken - string;
      /* Make this substring look like a string.  */
      *ntoken = '\0';
      aarch64_parse_one_override_token (string, token_length, tune);
      string = ++ntoken;
    }

  /* One last option to parse.  */
  aarch64_parse_one_override_token (string, strlen (string), tune);
  free (string_root);
}
8906 aarch64_override_options_after_change_1 (struct gcc_options
*opts
)
8908 /* The logic here is that if we are disabling all frame pointer generation
8909 then we do not need to disable leaf frame pointer generation as a
8910 separate operation. But if we are *only* disabling leaf frame pointer
8911 generation then we set flag_omit_frame_pointer to true, but in
8912 aarch64_frame_pointer_required we return false only for leaf functions.
8914 PR 70044: We have to be careful about being called multiple times for the
8915 same function. Once we have decided to set flag_omit_frame_pointer just
8916 so that we can omit leaf frame pointers, we must then not interpret a
8917 second call as meaning that all frame pointer generation should be
8918 omitted. We do this by setting flag_omit_frame_pointer to a special,
8920 if (opts
->x_flag_omit_frame_pointer
== 2)
8921 opts
->x_flag_omit_frame_pointer
= 0;
8923 if (opts
->x_flag_omit_frame_pointer
)
8924 opts
->x_flag_omit_leaf_frame_pointer
= false;
8925 else if (opts
->x_flag_omit_leaf_frame_pointer
)
8926 opts
->x_flag_omit_frame_pointer
= 2;
8928 /* If not optimizing for size, set the default
8929 alignment to what the target wants. */
8930 if (!opts
->x_optimize_size
)
8932 if (opts
->x_align_loops
<= 0)
8933 opts
->x_align_loops
= aarch64_tune_params
.loop_align
;
8934 if (opts
->x_align_jumps
<= 0)
8935 opts
->x_align_jumps
= aarch64_tune_params
.jump_align
;
8936 if (opts
->x_align_functions
<= 0)
8937 opts
->x_align_functions
= aarch64_tune_params
.function_align
;
8940 /* We default to no pc-relative literal loads. */
8942 aarch64_pcrelative_literal_loads
= false;
8944 /* If -mpc-relative-literal-loads is set on the command line, this
8945 implies that the user asked for PC relative literal loads. */
8946 if (opts
->x_pcrelative_literal_loads
== 1)
8947 aarch64_pcrelative_literal_loads
= true;
8949 /* This is PR70113. When building the Linux kernel with
8950 CONFIG_ARM64_ERRATUM_843419, support for relocations
8951 R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8952 removed from the kernel to avoid loading objects with possibly
8953 offending sequences. Without -mpc-relative-literal-loads we would
8954 generate such relocations, preventing the kernel build from
8956 if (opts
->x_pcrelative_literal_loads
== 2
8957 && TARGET_FIX_ERR_A53_843419
)
8958 aarch64_pcrelative_literal_loads
= true;
8960 /* In the tiny memory model it makes no sense to disallow PC relative
8961 literal pool loads. */
8962 if (aarch64_cmodel
== AARCH64_CMODEL_TINY
8963 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
8964 aarch64_pcrelative_literal_loads
= true;
8966 /* When enabling the lower precision Newton series for the square root, also
8967 enable it for the reciprocal square root, since the latter is an
8968 intermediary step for the former. */
8969 if (flag_mlow_precision_sqrt
)
8970 flag_mrecip_low_precision_sqrt
= true;
8973 /* 'Unpack' up the internal tuning structs and update the options
8974 in OPTS. The caller must have set up selected_tune and selected_arch
8975 as all the other target-specific codegen decisions are
8976 derived from them. */
8979 aarch64_override_options_internal (struct gcc_options
*opts
)
8981 aarch64_tune_flags
= selected_tune
->flags
;
8982 aarch64_tune
= selected_tune
->sched_core
;
8983 /* Make a copy of the tuning parameters attached to the core, which
8984 we may later overwrite. */
8985 aarch64_tune_params
= *(selected_tune
->tune
);
8986 aarch64_architecture_version
= selected_arch
->architecture_version
;
8988 if (opts
->x_aarch64_override_tune_string
)
8989 aarch64_parse_override_string (opts
->x_aarch64_override_tune_string
,
8990 &aarch64_tune_params
);
8992 /* This target defaults to strict volatile bitfields. */
8993 if (opts
->x_flag_strict_volatile_bitfields
< 0 && abi_version_at_least (2))
8994 opts
->x_flag_strict_volatile_bitfields
= 1;
8996 initialize_aarch64_code_model (opts
);
8997 initialize_aarch64_tls_size (opts
);
8999 int queue_depth
= 0;
9000 switch (aarch64_tune_params
.autoprefetcher_model
)
9002 case tune_params::AUTOPREFETCHER_OFF
:
9005 case tune_params::AUTOPREFETCHER_WEAK
:
9008 case tune_params::AUTOPREFETCHER_STRONG
:
9009 queue_depth
= max_insn_queue_index
+ 1;
9015 /* We don't mind passing in global_options_set here as we don't use
9016 the *options_set structs anyway. */
9017 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH
,
9019 opts
->x_param_values
,
9020 global_options_set
.x_param_values
);
9022 /* Set up parameters to be used in prefetching algorithm. Do not
9023 override the defaults unless we are tuning for a core we have
9024 researched values for. */
9025 if (aarch64_tune_params
.prefetch
->num_slots
> 0)
9026 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES
,
9027 aarch64_tune_params
.prefetch
->num_slots
,
9028 opts
->x_param_values
,
9029 global_options_set
.x_param_values
);
9030 if (aarch64_tune_params
.prefetch
->l1_cache_size
>= 0)
9031 maybe_set_param_value (PARAM_L1_CACHE_SIZE
,
9032 aarch64_tune_params
.prefetch
->l1_cache_size
,
9033 opts
->x_param_values
,
9034 global_options_set
.x_param_values
);
9035 if (aarch64_tune_params
.prefetch
->l1_cache_line_size
>= 0)
9036 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE
,
9037 aarch64_tune_params
.prefetch
->l1_cache_line_size
,
9038 opts
->x_param_values
,
9039 global_options_set
.x_param_values
);
9040 if (aarch64_tune_params
.prefetch
->l2_cache_size
>= 0)
9041 maybe_set_param_value (PARAM_L2_CACHE_SIZE
,
9042 aarch64_tune_params
.prefetch
->l2_cache_size
,
9043 opts
->x_param_values
,
9044 global_options_set
.x_param_values
);
9046 /* Enable sw prefetching at specified optimization level for
9047 CPUS that have prefetch. Lower optimization level threshold by 1
9048 when profiling is enabled. */
9049 if (opts
->x_flag_prefetch_loop_arrays
< 0
9050 && !opts
->x_optimize_size
9051 && aarch64_tune_params
.prefetch
->default_opt_level
>= 0
9052 && opts
->x_optimize
>= aarch64_tune_params
.prefetch
->default_opt_level
)
9053 opts
->x_flag_prefetch_loop_arrays
= 1;
9055 aarch64_override_options_after_change_1 (opts
);
9058 /* Print a hint with a suggestion for a core or architecture name that
9059 most closely resembles what the user passed in STR. ARCH is true if
9060 the user is asking for an architecture name. ARCH is false if the user
9061 is asking for a core name. */
9064 aarch64_print_hint_for_core_or_arch (const char *str
, bool arch
)
9066 auto_vec
<const char *> candidates
;
9067 const struct processor
*entry
= arch
? all_architectures
: all_cores
;
9068 for (; entry
->name
!= NULL
; entry
++)
9069 candidates
.safe_push (entry
->name
);
9071 const char *hint
= candidates_list_and_hint (str
, s
, candidates
);
9073 inform (input_location
, "valid arguments are: %s;"
9074 " did you mean %qs?", s
, hint
);
9078 /* Print a hint with a suggestion for a core name that most closely resembles
9079 what the user passed in STR. */
9082 aarch64_print_hint_for_core (const char *str
)
9084 aarch64_print_hint_for_core_or_arch (str
, false);
9087 /* Print a hint with a suggestion for an architecture name that most closely
9088 resembles what the user passed in STR. */
9091 aarch64_print_hint_for_arch (const char *str
)
9093 aarch64_print_hint_for_core_or_arch (str
, true);
9096 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
9097 specified in STR and throw errors if appropriate. Put the results if
9098 they are valid in RES and ISA_FLAGS. Return whether the option is
9102 aarch64_validate_mcpu (const char *str
, const struct processor
**res
,
9103 unsigned long *isa_flags
)
9105 enum aarch64_parse_opt_result parse_res
9106 = aarch64_parse_cpu (str
, res
, isa_flags
);
9108 if (parse_res
== AARCH64_PARSE_OK
)
9113 case AARCH64_PARSE_MISSING_ARG
:
9114 error ("missing cpu name in %<-mcpu=%s%>", str
);
9116 case AARCH64_PARSE_INVALID_ARG
:
9117 error ("unknown value %qs for -mcpu", str
);
9118 aarch64_print_hint_for_core (str
);
9120 case AARCH64_PARSE_INVALID_FEATURE
:
9121 error ("invalid feature modifier in %<-mcpu=%s%>", str
);
9130 /* Validate a command-line -march option. Parse the arch and extensions
9131 (if any) specified in STR and throw errors if appropriate. Put the
9132 results, if they are valid, in RES and ISA_FLAGS. Return whether the
9136 aarch64_validate_march (const char *str
, const struct processor
**res
,
9137 unsigned long *isa_flags
)
9139 enum aarch64_parse_opt_result parse_res
9140 = aarch64_parse_arch (str
, res
, isa_flags
);
9142 if (parse_res
== AARCH64_PARSE_OK
)
9147 case AARCH64_PARSE_MISSING_ARG
:
9148 error ("missing arch name in %<-march=%s%>", str
);
9150 case AARCH64_PARSE_INVALID_ARG
:
9151 error ("unknown value %qs for -march", str
);
9152 aarch64_print_hint_for_arch (str
);
9154 case AARCH64_PARSE_INVALID_FEATURE
:
9155 error ("invalid feature modifier in %<-march=%s%>", str
);
9164 /* Validate a command-line -mtune option. Parse the cpu
9165 specified in STR and throw errors if appropriate. Put the
9166 result, if it is valid, in RES. Return whether the option is
9170 aarch64_validate_mtune (const char *str
, const struct processor
**res
)
9172 enum aarch64_parse_opt_result parse_res
9173 = aarch64_parse_tune (str
, res
);
9175 if (parse_res
== AARCH64_PARSE_OK
)
9180 case AARCH64_PARSE_MISSING_ARG
:
9181 error ("missing cpu name in %<-mtune=%s%>", str
);
9183 case AARCH64_PARSE_INVALID_ARG
:
9184 error ("unknown value %qs for -mtune", str
);
9185 aarch64_print_hint_for_core (str
);
9193 /* Return the CPU corresponding to the enum CPU.
9194 If it doesn't specify a cpu, return the default. */
9196 static const struct processor
*
9197 aarch64_get_tune_cpu (enum aarch64_processor cpu
)
9199 if (cpu
!= aarch64_none
)
9200 return &all_cores
[cpu
];
9202 /* The & 0x3f is to extract the bottom 6 bits that encode the
9203 default cpu as selected by the --with-cpu GCC configure option
9205 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
9206 flags mechanism should be reworked to make it more sane. */
9207 return &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
9210 /* Return the architecture corresponding to the enum ARCH.
9211 If it doesn't specify a valid architecture, return the default. */
9213 static const struct processor
*
9214 aarch64_get_arch (enum aarch64_arch arch
)
9216 if (arch
!= aarch64_no_arch
)
9217 return &all_architectures
[arch
];
9219 const struct processor
*cpu
= &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
9221 return &all_architectures
[cpu
->arch
];
9224 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
9225 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
9226 tuning structs. In particular it must set selected_tune and
9227 aarch64_isa_flags that define the available ISA features and tuning
9228 decisions. It must also set selected_arch as this will be used to
9229 output the .arch asm tags for each function. */
9232 aarch64_override_options (void)
9234 unsigned long cpu_isa
= 0;
9235 unsigned long arch_isa
= 0;
9236 aarch64_isa_flags
= 0;
9238 bool valid_cpu
= true;
9239 bool valid_tune
= true;
9240 bool valid_arch
= true;
9242 selected_cpu
= NULL
;
9243 selected_arch
= NULL
;
9244 selected_tune
= NULL
;
9246 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
9247 If either of -march or -mtune is given, they override their
9248 respective component of -mcpu. */
9249 if (aarch64_cpu_string
)
9250 valid_cpu
= aarch64_validate_mcpu (aarch64_cpu_string
, &selected_cpu
,
9253 if (aarch64_arch_string
)
9254 valid_arch
= aarch64_validate_march (aarch64_arch_string
, &selected_arch
,
9257 if (aarch64_tune_string
)
9258 valid_tune
= aarch64_validate_mtune (aarch64_tune_string
, &selected_tune
);
9260 /* If the user did not specify a processor, choose the default
9261 one for them. This will be the CPU set during configuration using
9262 --with-cpu, otherwise it is "generic". */
9267 selected_cpu
= &all_cores
[selected_arch
->ident
];
9268 aarch64_isa_flags
= arch_isa
;
9269 explicit_arch
= selected_arch
->arch
;
9273 /* Get default configure-time CPU. */
9274 selected_cpu
= aarch64_get_tune_cpu (aarch64_none
);
9275 aarch64_isa_flags
= TARGET_CPU_DEFAULT
>> 6;
9279 explicit_tune_core
= selected_tune
->ident
;
9281 /* If both -mcpu and -march are specified check that they are architecturally
9282 compatible, warn if they're not and prefer the -march ISA flags. */
9283 else if (selected_arch
)
9285 if (selected_arch
->arch
!= selected_cpu
->arch
)
9287 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
9288 all_architectures
[selected_cpu
->arch
].name
,
9289 selected_arch
->name
);
9291 aarch64_isa_flags
= arch_isa
;
9292 explicit_arch
= selected_arch
->arch
;
9293 explicit_tune_core
= selected_tune
? selected_tune
->ident
9294 : selected_cpu
->ident
;
9298 /* -mcpu but no -march. */
9299 aarch64_isa_flags
= cpu_isa
;
9300 explicit_tune_core
= selected_tune
? selected_tune
->ident
9301 : selected_cpu
->ident
;
9302 gcc_assert (selected_cpu
);
9303 selected_arch
= &all_architectures
[selected_cpu
->arch
];
9304 explicit_arch
= selected_arch
->arch
;
9307 /* Set the arch as well as we will need it when outputing
9308 the .arch directive in assembly. */
9311 gcc_assert (selected_cpu
);
9312 selected_arch
= &all_architectures
[selected_cpu
->arch
];
9316 selected_tune
= selected_cpu
;
9318 #ifndef HAVE_AS_MABI_OPTION
9319 /* The compiler may have been configured with 2.23.* binutils, which does
9320 not have support for ILP32. */
9322 error ("Assembler does not support -mabi=ilp32");
9325 if (aarch64_ra_sign_scope
!= AARCH64_FUNCTION_NONE
&& TARGET_ILP32
)
9326 sorry ("Return address signing is only supported for -mabi=lp64");
9328 /* Make sure we properly set up the explicit options. */
9329 if ((aarch64_cpu_string
&& valid_cpu
)
9330 || (aarch64_tune_string
&& valid_tune
))
9331 gcc_assert (explicit_tune_core
!= aarch64_none
);
9333 if ((aarch64_cpu_string
&& valid_cpu
)
9334 || (aarch64_arch_string
&& valid_arch
))
9335 gcc_assert (explicit_arch
!= aarch64_no_arch
);
9337 aarch64_override_options_internal (&global_options
);
9339 /* Save these options as the default ones in case we push and pop them later
9340 while processing functions with potential target attributes. */
9341 target_option_default_node
= target_option_current_node
9342 = build_target_option_node (&global_options
);
9345 /* Implement targetm.override_options_after_change. */
9348 aarch64_override_options_after_change (void)
9350 aarch64_override_options_after_change_1 (&global_options
);
9353 static struct machine_function
*
9354 aarch64_init_machine_status (void)
9356 struct machine_function
*machine
;
9357 machine
= ggc_cleared_alloc
<machine_function
> ();
9362 aarch64_init_expanders (void)
9364 init_machine_status
= aarch64_init_machine_status
;
9367 /* A checking mechanism for the implementation of the various code models. */
9369 initialize_aarch64_code_model (struct gcc_options
*opts
)
9371 if (opts
->x_flag_pic
)
9373 switch (opts
->x_aarch64_cmodel_var
)
9375 case AARCH64_CMODEL_TINY
:
9376 aarch64_cmodel
= AARCH64_CMODEL_TINY_PIC
;
9378 case AARCH64_CMODEL_SMALL
:
9379 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9380 aarch64_cmodel
= (flag_pic
== 2
9381 ? AARCH64_CMODEL_SMALL_PIC
9382 : AARCH64_CMODEL_SMALL_SPIC
);
9384 aarch64_cmodel
= AARCH64_CMODEL_SMALL_PIC
;
9387 case AARCH64_CMODEL_LARGE
:
9388 sorry ("code model %qs with -f%s", "large",
9389 opts
->x_flag_pic
> 1 ? "PIC" : "pic");
9396 aarch64_cmodel
= opts
->x_aarch64_cmodel_var
;
9399 /* Implement TARGET_OPTION_SAVE. */
9402 aarch64_option_save (struct cl_target_option
*ptr
, struct gcc_options
*opts
)
9404 ptr
->x_aarch64_override_tune_string
= opts
->x_aarch64_override_tune_string
;
9407 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
9408 using the information saved in PTR. */
9411 aarch64_option_restore (struct gcc_options
*opts
, struct cl_target_option
*ptr
)
9413 opts
->x_explicit_tune_core
= ptr
->x_explicit_tune_core
;
9414 selected_tune
= aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
9415 opts
->x_explicit_arch
= ptr
->x_explicit_arch
;
9416 selected_arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
9417 opts
->x_aarch64_override_tune_string
= ptr
->x_aarch64_override_tune_string
;
9419 aarch64_override_options_internal (opts
);
9422 /* Implement TARGET_OPTION_PRINT. */
9425 aarch64_option_print (FILE *file
, int indent
, struct cl_target_option
*ptr
)
9427 const struct processor
*cpu
9428 = aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
9429 unsigned long isa_flags
= ptr
->x_aarch64_isa_flags
;
9430 const struct processor
*arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
9431 std::string extension
9432 = aarch64_get_extension_string_for_isa_flags (isa_flags
, arch
->flags
);
9434 fprintf (file
, "%*sselected tune = %s\n", indent
, "", cpu
->name
);
9435 fprintf (file
, "%*sselected arch = %s%s\n", indent
, "",
9436 arch
->name
, extension
.c_str ());
9439 static GTY(()) tree aarch64_previous_fndecl
;
9442 aarch64_reset_previous_fndecl (void)
9444 aarch64_previous_fndecl
= NULL
;
9447 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9448 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9449 make sure optab availability predicates are recomputed when necessary. */
9452 aarch64_save_restore_target_globals (tree new_tree
)
9454 if (TREE_TARGET_GLOBALS (new_tree
))
9455 restore_target_globals (TREE_TARGET_GLOBALS (new_tree
));
9456 else if (new_tree
== target_option_default_node
)
9457 restore_target_globals (&default_target_globals
);
9459 TREE_TARGET_GLOBALS (new_tree
) = save_target_globals_default_opts ();
9462 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
9463 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9464 of the function, if such exists. This function may be called multiple
9465 times on a single function so use aarch64_previous_fndecl to avoid
9466 setting up identical state. */
9469 aarch64_set_current_function (tree fndecl
)
9471 if (!fndecl
|| fndecl
== aarch64_previous_fndecl
)
9474 tree old_tree
= (aarch64_previous_fndecl
9475 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl
)
9478 tree new_tree
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
9480 /* If current function has no attributes but the previous one did,
9481 use the default node. */
9482 if (!new_tree
&& old_tree
)
9483 new_tree
= target_option_default_node
;
9485 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
9486 the default have been handled by aarch64_save_restore_target_globals from
9487 aarch64_pragma_target_parse. */
9488 if (old_tree
== new_tree
)
9491 aarch64_previous_fndecl
= fndecl
;
9493 /* First set the target options. */
9494 cl_target_option_restore (&global_options
, TREE_TARGET_OPTION (new_tree
));
9496 aarch64_save_restore_target_globals (new_tree
);
/* Enum describing the various ways we can handle attributes.
   In many cases we can reuse the generic option handling machinery.  */

enum aarch64_attr_opt_type
{
  aarch64_attr_mask,	/* Attribute should set a bit in target_flags.  */
  aarch64_attr_bool,	/* Attribute sets or unsets a boolean variable.  */
  aarch64_attr_enum,	/* Attribute sets an enum variable.  */
  aarch64_attr_custom	/* Attribute requires a custom handling function.  */
};
9510 /* All the information needed to handle a target attribute.
9511 NAME is the name of the attribute.
9512 ATTR_TYPE specifies the type of behavior of the attribute as described
9513 in the definition of enum aarch64_attr_opt_type.
9514 ALLOW_NEG is true if the attribute supports a "no-" form.
9515 HANDLER is the function that takes the attribute string and whether
9516 it is a pragma or attribute and handles the option. It is needed only
9517 when the ATTR_TYPE is aarch64_attr_custom.
9518 OPT_NUM is the enum specifying the option that the attribute modifies.
9519 This is needed for attributes that mirror the behavior of a command-line
9520 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9521 aarch64_attr_enum. */
9523 struct aarch64_attribute_info
9526 enum aarch64_attr_opt_type attr_type
;
9528 bool (*handler
) (const char *, const char *);
9529 enum opt_code opt_num
;
9532 /* Handle the ARCH_STR argument to the arch= target attribute.
9533 PRAGMA_OR_ATTR is used in potential error messages. */
9536 aarch64_handle_attr_arch (const char *str
, const char *pragma_or_attr
)
9538 const struct processor
*tmp_arch
= NULL
;
9539 enum aarch64_parse_opt_result parse_res
9540 = aarch64_parse_arch (str
, &tmp_arch
, &aarch64_isa_flags
);
9542 if (parse_res
== AARCH64_PARSE_OK
)
9544 gcc_assert (tmp_arch
);
9545 selected_arch
= tmp_arch
;
9546 explicit_arch
= selected_arch
->arch
;
9552 case AARCH64_PARSE_MISSING_ARG
:
9553 error ("missing architecture name in 'arch' target %s", pragma_or_attr
);
9555 case AARCH64_PARSE_INVALID_ARG
:
9556 error ("unknown value %qs for 'arch' target %s", str
, pragma_or_attr
);
9557 aarch64_print_hint_for_arch (str
);
9559 case AARCH64_PARSE_INVALID_FEATURE
:
9560 error ("invalid feature modifier %qs for 'arch' target %s",
9561 str
, pragma_or_attr
);
9570 /* Handle the argument CPU_STR to the cpu= target attribute.
9571 PRAGMA_OR_ATTR is used in potential error messages. */
9574 aarch64_handle_attr_cpu (const char *str
, const char *pragma_or_attr
)
9576 const struct processor
*tmp_cpu
= NULL
;
9577 enum aarch64_parse_opt_result parse_res
9578 = aarch64_parse_cpu (str
, &tmp_cpu
, &aarch64_isa_flags
);
9580 if (parse_res
== AARCH64_PARSE_OK
)
9582 gcc_assert (tmp_cpu
);
9583 selected_tune
= tmp_cpu
;
9584 explicit_tune_core
= selected_tune
->ident
;
9586 selected_arch
= &all_architectures
[tmp_cpu
->arch
];
9587 explicit_arch
= selected_arch
->arch
;
9593 case AARCH64_PARSE_MISSING_ARG
:
9594 error ("missing cpu name in 'cpu' target %s", pragma_or_attr
);
9596 case AARCH64_PARSE_INVALID_ARG
:
9597 error ("unknown value %qs for 'cpu' target %s", str
, pragma_or_attr
);
9598 aarch64_print_hint_for_core (str
);
9600 case AARCH64_PARSE_INVALID_FEATURE
:
9601 error ("invalid feature modifier %qs for 'cpu' target %s",
9602 str
, pragma_or_attr
);
9611 /* Handle the argument STR to the tune= target attribute.
9612 PRAGMA_OR_ATTR is used in potential error messages. */
9615 aarch64_handle_attr_tune (const char *str
, const char *pragma_or_attr
)
9617 const struct processor
*tmp_tune
= NULL
;
9618 enum aarch64_parse_opt_result parse_res
9619 = aarch64_parse_tune (str
, &tmp_tune
);
9621 if (parse_res
== AARCH64_PARSE_OK
)
9623 gcc_assert (tmp_tune
);
9624 selected_tune
= tmp_tune
;
9625 explicit_tune_core
= selected_tune
->ident
;
9631 case AARCH64_PARSE_INVALID_ARG
:
9632 error ("unknown value %qs for 'tune' target %s", str
, pragma_or_attr
);
9633 aarch64_print_hint_for_core (str
);
9642 /* Parse an architecture extensions target attribute string specified in STR.
9643 For example "+fp+nosimd". Show any errors if needed. Return TRUE
9644 if successful. Update aarch64_isa_flags to reflect the ISA features
9646 PRAGMA_OR_ATTR is used in potential error messages. */
9649 aarch64_handle_attr_isa_flags (char *str
, const char *pragma_or_attr
)
9651 enum aarch64_parse_opt_result parse_res
;
9652 unsigned long isa_flags
= aarch64_isa_flags
;
9654 /* We allow "+nothing" in the beginning to clear out all architectural
9655 features if the user wants to handpick specific features. */
9656 if (strncmp ("+nothing", str
, 8) == 0)
9662 parse_res
= aarch64_parse_extension (str
, &isa_flags
);
9664 if (parse_res
== AARCH64_PARSE_OK
)
9666 aarch64_isa_flags
= isa_flags
;
9672 case AARCH64_PARSE_MISSING_ARG
:
9673 error ("missing feature modifier in target %s %qs",
9674 pragma_or_attr
, str
);
9677 case AARCH64_PARSE_INVALID_FEATURE
:
9678 error ("invalid feature modifier in target %s %qs",
9679 pragma_or_attr
, str
);
9689 /* The target attributes that we support. On top of these we also support just
9690 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
9691 handled explicitly in aarch64_process_one_target_attr. */
9693 static const struct aarch64_attribute_info aarch64_attributes
[] =
9695 { "general-regs-only", aarch64_attr_mask
, false, NULL
,
9696 OPT_mgeneral_regs_only
},
9697 { "fix-cortex-a53-835769", aarch64_attr_bool
, true, NULL
,
9698 OPT_mfix_cortex_a53_835769
},
9699 { "fix-cortex-a53-843419", aarch64_attr_bool
, true, NULL
,
9700 OPT_mfix_cortex_a53_843419
},
9701 { "cmodel", aarch64_attr_enum
, false, NULL
, OPT_mcmodel_
},
9702 { "strict-align", aarch64_attr_mask
, false, NULL
, OPT_mstrict_align
},
9703 { "omit-leaf-frame-pointer", aarch64_attr_bool
, true, NULL
,
9704 OPT_momit_leaf_frame_pointer
},
9705 { "tls-dialect", aarch64_attr_enum
, false, NULL
, OPT_mtls_dialect_
},
9706 { "arch", aarch64_attr_custom
, false, aarch64_handle_attr_arch
,
9708 { "cpu", aarch64_attr_custom
, false, aarch64_handle_attr_cpu
, OPT_mcpu_
},
9709 { "tune", aarch64_attr_custom
, false, aarch64_handle_attr_tune
,
9711 { "sign-return-address", aarch64_attr_enum
, false, NULL
,
9712 OPT_msign_return_address_
},
9713 { NULL
, aarch64_attr_custom
, false, NULL
, OPT____
}
9716 /* Parse ARG_STR which contains the definition of one target attribute.
9717 Show appropriate errors if any or return true if the attribute is valid.
9718 PRAGMA_OR_ATTR holds the string to use in error messages about whether
9719 we're processing a target attribute or pragma. */
9722 aarch64_process_one_target_attr (char *arg_str
, const char* pragma_or_attr
)
9724 bool invert
= false;
9726 size_t len
= strlen (arg_str
);
9730 error ("malformed target %s", pragma_or_attr
);
9734 char *str_to_check
= (char *) alloca (len
+ 1);
9735 strcpy (str_to_check
, arg_str
);
9737 /* Skip leading whitespace. */
9738 while (*str_to_check
== ' ' || *str_to_check
== '\t')
9741 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9742 It is easier to detect and handle it explicitly here rather than going
9743 through the machinery for the rest of the target attributes in this
9745 if (*str_to_check
== '+')
9746 return aarch64_handle_attr_isa_flags (str_to_check
, pragma_or_attr
);
9748 if (len
> 3 && strncmp (str_to_check
, "no-", 3) == 0)
9753 char *arg
= strchr (str_to_check
, '=');
9755 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9756 and point ARG to "foo". */
9762 const struct aarch64_attribute_info
*p_attr
;
9764 for (p_attr
= aarch64_attributes
; p_attr
->name
; p_attr
++)
9766 /* If the names don't match up, or the user has given an argument
9767 to an attribute that doesn't accept one, or didn't give an argument
9768 to an attribute that expects one, fail to match. */
9769 if (strcmp (str_to_check
, p_attr
->name
) != 0)
9773 bool attr_need_arg_p
= p_attr
->attr_type
== aarch64_attr_custom
9774 || p_attr
->attr_type
== aarch64_attr_enum
;
9776 if (attr_need_arg_p
^ (arg
!= NULL
))
9778 error ("target %s %qs does not accept an argument",
9779 pragma_or_attr
, str_to_check
);
9783 /* If the name matches but the attribute does not allow "no-" versions
9784 then we can't match. */
9785 if (invert
&& !p_attr
->allow_neg
)
9787 error ("target %s %qs does not allow a negated form",
9788 pragma_or_attr
, str_to_check
);
9792 switch (p_attr
->attr_type
)
9794 /* Has a custom handler registered.
9795 For example, cpu=, arch=, tune=. */
9796 case aarch64_attr_custom
:
9797 gcc_assert (p_attr
->handler
);
9798 if (!p_attr
->handler (arg
, pragma_or_attr
))
9802 /* Either set or unset a boolean option. */
9803 case aarch64_attr_bool
:
9805 struct cl_decoded_option decoded
;
9807 generate_option (p_attr
->opt_num
, NULL
, !invert
,
9808 CL_TARGET
, &decoded
);
9809 aarch64_handle_option (&global_options
, &global_options_set
,
9810 &decoded
, input_location
);
9813 /* Set or unset a bit in the target_flags. aarch64_handle_option
9814 should know what mask to apply given the option number. */
9815 case aarch64_attr_mask
:
9817 struct cl_decoded_option decoded
;
9818 /* We only need to specify the option number.
9819 aarch64_handle_option will know which mask to apply. */
9820 decoded
.opt_index
= p_attr
->opt_num
;
9821 decoded
.value
= !invert
;
9822 aarch64_handle_option (&global_options
, &global_options_set
,
9823 &decoded
, input_location
);
9826 /* Use the option setting machinery to set an option to an enum. */
9827 case aarch64_attr_enum
:
9832 valid
= opt_enum_arg_to_value (p_attr
->opt_num
, arg
,
9836 set_option (&global_options
, NULL
, p_attr
->opt_num
, value
,
9837 NULL
, DK_UNSPECIFIED
, input_location
,
9842 error ("target %s %s=%s is not valid",
9843 pragma_or_attr
, str_to_check
, arg
);
9852 /* If we reached here we either have found an attribute and validated
9853 it or didn't match any. If we matched an attribute but its arguments
9854 were malformed we will have returned false already. */
/* Count how many times the character C appears in
   NULL-terminated string STR.  */

static unsigned int
num_occurences_in_str (char c, char *str)
{
  unsigned int res = 0;
  while (*str != '\0')
    {
      if (*str == c)
	res++;

      str++;
    }

  return res;
}
9876 /* Parse the tree in ARGS that contains the target attribute information
9877 and update the global target options space. PRAGMA_OR_ATTR is a string
9878 to be used in error messages, specifying whether this is processing
9879 a target attribute or a target pragma. */
9882 aarch64_process_target_attr (tree args
, const char* pragma_or_attr
)
9884 if (TREE_CODE (args
) == TREE_LIST
)
9888 tree head
= TREE_VALUE (args
);
9891 if (!aarch64_process_target_attr (head
, pragma_or_attr
))
9894 args
= TREE_CHAIN (args
);
9900 if (TREE_CODE (args
) != STRING_CST
)
9902 error ("attribute %<target%> argument not a string");
9906 size_t len
= strlen (TREE_STRING_POINTER (args
));
9907 char *str_to_check
= (char *) alloca (len
+ 1);
9908 strcpy (str_to_check
, TREE_STRING_POINTER (args
));
9912 error ("malformed target %s value", pragma_or_attr
);
9916 /* Used to catch empty spaces between commas i.e.
9917 attribute ((target ("attr1,,attr2"))). */
9918 unsigned int num_commas
= num_occurences_in_str (',', str_to_check
);
9920 /* Handle multiple target attributes separated by ','. */
9921 char *token
= strtok (str_to_check
, ",");
9923 unsigned int num_attrs
= 0;
9927 if (!aarch64_process_one_target_attr (token
, pragma_or_attr
))
9929 error ("target %s %qs is invalid", pragma_or_attr
, token
);
9933 token
= strtok (NULL
, ",");
9936 if (num_attrs
!= num_commas
+ 1)
9938 error ("malformed target %s list %qs",
9939 pragma_or_attr
, TREE_STRING_POINTER (args
));
9946 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9947 process attribute ((target ("..."))). */
9950 aarch64_option_valid_attribute_p (tree fndecl
, tree
, tree args
, int)
9952 struct cl_target_option cur_target
;
9955 tree new_target
, new_optimize
;
9956 tree existing_target
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
9958 /* If what we're processing is the current pragma string then the
9959 target option node is already stored in target_option_current_node
9960 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9961 having to re-parse the string. This is especially useful to keep
9962 arm_neon.h compile times down since that header contains a lot
9963 of intrinsics enclosed in pragmas. */
9964 if (!existing_target
&& args
== current_target_pragma
)
9966 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = target_option_current_node
;
9969 tree func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
9971 old_optimize
= build_optimization_node (&global_options
);
9972 func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
9974 /* If the function changed the optimization levels as well as setting
9975 target options, start with the optimizations specified. */
9976 if (func_optimize
&& func_optimize
!= old_optimize
)
9977 cl_optimization_restore (&global_options
,
9978 TREE_OPTIMIZATION (func_optimize
));
9980 /* Save the current target options to restore at the end. */
9981 cl_target_option_save (&cur_target
, &global_options
);
9983 /* If fndecl already has some target attributes applied to it, unpack
9984 them so that we add this attribute on top of them, rather than
9985 overwriting them. */
9986 if (existing_target
)
9988 struct cl_target_option
*existing_options
9989 = TREE_TARGET_OPTION (existing_target
);
9991 if (existing_options
)
9992 cl_target_option_restore (&global_options
, existing_options
);
9995 cl_target_option_restore (&global_options
,
9996 TREE_TARGET_OPTION (target_option_current_node
));
9999 ret
= aarch64_process_target_attr (args
, "attribute");
10001 /* Set up any additional state. */
10004 aarch64_override_options_internal (&global_options
);
10005 /* Initialize SIMD builtins if we haven't already.
10006 Set current_target_pragma to NULL for the duration so that
10007 the builtin initialization code doesn't try to tag the functions
10008 being built with the attributes specified by any current pragma, thus
10009 going into an infinite recursion. */
10012 tree saved_current_target_pragma
= current_target_pragma
;
10013 current_target_pragma
= NULL
;
10014 aarch64_init_simd_builtins ();
10015 current_target_pragma
= saved_current_target_pragma
;
10017 new_target
= build_target_option_node (&global_options
);
10022 new_optimize
= build_optimization_node (&global_options
);
10026 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = new_target
;
10028 if (old_optimize
!= new_optimize
)
10029 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
) = new_optimize
;
10032 cl_target_option_restore (&global_options
, &cur_target
);
10034 if (old_optimize
!= new_optimize
)
10035 cl_optimization_restore (&global_options
,
10036 TREE_OPTIMIZATION (old_optimize
));
/* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
   tri-bool options (yes, no, don't care) and the default value is
   DEF, determine whether to reject inlining.  */

static bool
aarch64_tribools_ok_for_inlining_p (int caller, int callee,
				    int dont_care, int def)
{
  /* Inlining is acceptable when either side does not care, when both
     sides agree outright, or when the callee is simply using the
     default value (and therefore imposes no extra requirement).  */
  return (callee == dont_care
	  || caller == dont_care
	  || callee == caller
	  || callee == def);
}
/* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
   to inline CALLEE into CALLER based on target-specific info.
   Make sure that the caller and callee have compatible architectural
   features.  Then go through the other possible target attributes
   and see if they can block inlining.  Try not to reject always_inline
   callees unless they are incompatible architecturally.  */

static bool
aarch64_can_inline_p (tree caller, tree callee)
{
  tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
  tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);

  /* If callee has no option attributes, then it is ok to inline.  */
  if (!callee_tree)
    return true;

  /* A caller without target attributes uses the global defaults.  */
  struct cl_target_option *caller_opts
	= TREE_TARGET_OPTION (caller_tree ? caller_tree
					   : target_option_default_node);

  struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);

  /* Callee's ISA flags should be a subset of the caller's.  */
  if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
       != callee_opts->x_aarch64_isa_flags)
    return false;

  /* Allow non-strict aligned functions inlining into strict
     aligned ones.  */
  if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
       != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
      && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
	   && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
    return false;

  bool always_inline = lookup_attribute ("always_inline",
					  DECL_ATTRIBUTES (callee));

  /* If the architectural features match up and the callee is always_inline
     then the other attributes don't matter.  */
  if (always_inline)
    return true;

  if (caller_opts->x_aarch64_cmodel_var
      != callee_opts->x_aarch64_cmodel_var)
    return false;

  if (caller_opts->x_aarch64_tls_dialect
      != callee_opts->x_aarch64_tls_dialect)
    return false;

  /* Honour explicit requests to workaround errata.  */
  if (!aarch64_tribools_ok_for_inlining_p (
	  caller_opts->x_aarch64_fix_a53_err835769,
	  callee_opts->x_aarch64_fix_a53_err835769,
	  2, TARGET_FIX_ERR_A53_835769_DEFAULT))
    return false;

  if (!aarch64_tribools_ok_for_inlining_p (
	  caller_opts->x_aarch64_fix_a53_err843419,
	  callee_opts->x_aarch64_fix_a53_err843419,
	  2, TARGET_FIX_ERR_A53_843419))
    return false;

  /* If the user explicitly specified -momit-leaf-frame-pointer for the
     caller and callee and they don't match up, reject inlining.  */
  if (!aarch64_tribools_ok_for_inlining_p (
	  caller_opts->x_flag_omit_leaf_frame_pointer,
	  callee_opts->x_flag_omit_leaf_frame_pointer,
	  2, 1))
    return false;

  /* If the callee has specific tuning overrides, respect them.  */
  if (callee_opts->x_aarch64_override_tune_string != NULL
      && caller_opts->x_aarch64_override_tune_string == NULL)
    return false;

  /* If the user specified tuning override strings for the
     caller and callee and they don't match up, reject inlining.
     We just do a string compare here, we don't analyze the meaning
     of the string, as it would be too costly for little gain.  */
  if (callee_opts->x_aarch64_override_tune_string
      && caller_opts->x_aarch64_override_tune_string
      && (strcmp (callee_opts->x_aarch64_override_tune_string,
		   caller_opts->x_aarch64_override_tune_string) != 0))
    return false;

  return true;
}
10153 /* Return true if SYMBOL_REF X binds locally. */
10156 aarch64_symbol_binds_local_p (const_rtx x
)
10158 return (SYMBOL_REF_DECL (x
)
10159 ? targetm
.binds_local_p (SYMBOL_REF_DECL (x
))
10160 : SYMBOL_REF_LOCAL_P (x
));
10163 /* Return true if SYMBOL_REF X is thread local */
10165 aarch64_tls_symbol_p (rtx x
)
10167 if (! TARGET_HAVE_TLS
)
10170 if (GET_CODE (x
) != SYMBOL_REF
)
10173 return SYMBOL_REF_TLS_MODEL (x
) != 0;
/* Classify a TLS symbol into one of the TLS kinds.  */
enum aarch64_symbol_type
aarch64_classify_tls_symbol (rtx x)
{
  enum tls_model tls_kind = tls_symbolic_operand_type (x);

  switch (tls_kind)
    {
    case TLS_MODEL_GLOBAL_DYNAMIC:
    case TLS_MODEL_LOCAL_DYNAMIC:
      /* Both dynamic models map to the same relocation style; TLSDESC
	 is preferred when the target supports it.  */
      return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;

    case TLS_MODEL_INITIAL_EXEC:
      /* The IE sequence depends on the code model's addressing range.  */
      switch (aarch64_cmodel)
	{
	case AARCH64_CMODEL_TINY:
	case AARCH64_CMODEL_TINY_PIC:
	  return SYMBOL_TINY_TLSIE;
	default:
	  return SYMBOL_SMALL_TLSIE;
	}

    case TLS_MODEL_LOCAL_EXEC:
      /* LE offsets are generated in 12/24/32/48-bit forms, selected by
	 -mtls-size.  */
      if (aarch64_tls_size == 12)
	return SYMBOL_TLSLE12;
      else if (aarch64_tls_size == 24)
	return SYMBOL_TLSLE24;
      else if (aarch64_tls_size == 32)
	return SYMBOL_TLSLE32;
      else if (aarch64_tls_size == 48)
	return SYMBOL_TLSLE48;
      else
	gcc_unreachable ();

    case TLS_MODEL_EMULATED:
    case TLS_MODEL_NONE:
      return SYMBOL_FORCE_TO_MEM;

    default:
      gcc_unreachable ();
    }
}
/* Return the method that should be used to access SYMBOL_REF or
   LABEL_REF X + OFFSET.  */

enum aarch64_symbol_type
aarch64_classify_symbol (rtx x, rtx offset)
{
  if (GET_CODE (x) == LABEL_REF)
    {
      switch (aarch64_cmodel)
	{
	case AARCH64_CMODEL_LARGE:
	  return SYMBOL_FORCE_TO_MEM;

	case AARCH64_CMODEL_TINY_PIC:
	case AARCH64_CMODEL_TINY:
	  return SYMBOL_TINY_ABSOLUTE;

	case AARCH64_CMODEL_SMALL_SPIC:
	case AARCH64_CMODEL_SMALL_PIC:
	case AARCH64_CMODEL_SMALL:
	  return SYMBOL_SMALL_ABSOLUTE;

	default:
	  gcc_unreachable ();
	}
    }

  if (GET_CODE (x) == SYMBOL_REF)
    {
      if (aarch64_tls_symbol_p (x))
	return aarch64_classify_tls_symbol (x);

      switch (aarch64_cmodel)
	{
	case AARCH64_CMODEL_TINY:
	  /* When we retrieve symbol + offset address, we have to make sure
	     the offset does not cause overflow of the final address.  But
	     we have no way of knowing the address of symbol at compile time
	     so we can't accurately say if the distance between the PC and
	     symbol + offset is outside the addressible range of +/-1M in the
	     TINY code model.  So we rely on images not being greater than
	     1M and cap the offset at 1M and anything beyond 1M will have to
	     be loaded using an alternative mechanism.  Furthermore if the
	     symbol is a weak reference to something that isn't known to
	     resolve to a symbol in this module, then force to memory.  */
	  if ((SYMBOL_REF_WEAK (x)
	       && !aarch64_symbol_binds_local_p (x))
	      || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
	    return SYMBOL_FORCE_TO_MEM;
	  return SYMBOL_TINY_ABSOLUTE;

	case AARCH64_CMODEL_SMALL:
	  /* Same reasoning as the tiny code model, but the offset cap here is
	     4G.  */
	  if ((SYMBOL_REF_WEAK (x)
	       && !aarch64_symbol_binds_local_p (x))
	      || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
			    HOST_WIDE_INT_C (4294967264)))
	    return SYMBOL_FORCE_TO_MEM;
	  return SYMBOL_SMALL_ABSOLUTE;

	case AARCH64_CMODEL_TINY_PIC:
	  /* Non-local symbols go through the (tiny) GOT.  */
	  if (!aarch64_symbol_binds_local_p (x))
	    return SYMBOL_TINY_GOT;
	  return SYMBOL_TINY_ABSOLUTE;

	case AARCH64_CMODEL_SMALL_SPIC:
	case AARCH64_CMODEL_SMALL_PIC:
	  /* Non-local symbols go through the GOT; the 28K form is the
	     small-GOT variant used by -fpic on some targets.  */
	  if (!aarch64_symbol_binds_local_p (x))
	    return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
		    ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
	  return SYMBOL_SMALL_ABSOLUTE;

	case AARCH64_CMODEL_LARGE:
	  /* This is alright even in PIC code as the constant
	     pool reference is always PC relative and within
	     the same translation unit.  */
	  if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
	    return SYMBOL_SMALL_ABSOLUTE;
	  else
	    return SYMBOL_FORCE_TO_MEM;

	default:
	  gcc_unreachable ();
	}
    }

  /* By default push everything into the constant pool.  */
  return SYMBOL_FORCE_TO_MEM;
}
10311 aarch64_constant_address_p (rtx x
)
10313 return (CONSTANT_P (x
) && memory_address_p (DImode
, x
));
10317 aarch64_legitimate_pic_operand_p (rtx x
)
10319 if (GET_CODE (x
) == SYMBOL_REF
10320 || (GET_CODE (x
) == CONST
10321 && GET_CODE (XEXP (x
, 0)) == PLUS
10322 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
))
10328 /* Return true if X holds either a quarter-precision or
10329 floating-point +0.0 constant. */
10331 aarch64_valid_floating_const (rtx x
)
10333 if (!CONST_DOUBLE_P (x
))
10336 /* This call determines which constants can be used in mov<mode>
10337 as integer moves instead of constant loads. */
10338 if (aarch64_float_const_rtx_p (x
))
10341 return aarch64_float_const_representable_p (x
);
/* Implement TARGET_LEGITIMATE_CONSTANT_P: return true if X is a valid
   constant operand of MODE.  */
static bool
aarch64_legitimate_constant_p (machine_mode mode, rtx x)
{
  /* Do not allow vector struct mode constants.  We could support
     0 and -1 easily, but they need support in aarch64-simd.md.  */
  if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
    return false;

  /* For these cases we never want to use a literal load.
     As such we have to prevent the compiler from forcing these
     to memory.  */
  if ((GET_CODE (x) == CONST_VECTOR
       && aarch64_simd_valid_immediate (x, mode, false, NULL))
      || CONST_INT_P (x)
      || aarch64_valid_floating_const (x)
      || aarch64_can_const_movi_rtx_p (x, mode)
      || aarch64_float_const_rtx_p (x))
    return !targetm.cannot_force_const_mem (mode, x);

  /* HIGH of a valid symbol reference is fine (the low part is applied
     separately by the relocation machinery).  */
  if (GET_CODE (x) == HIGH
      && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
    return true;

  /* Treat symbols as constants.  Avoid TLS symbols as they are complex,
     so spilling them is better than rematerialization.  */
  if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
    return true;

  return aarch64_constant_address_p (x);
}
/* Emit an instruction loading the thread pointer into TARGET, creating a
   fresh Pmode pseudo when TARGET is unusable, and return the register
   actually written.  */
static rtx
aarch64_load_tp (rtx target)
{
  if (!target
      || GET_MODE (target) != Pmode
      || !register_operand (target, Pmode))
    target = gen_reg_rtx (Pmode);

  /* Can return in any reg.  */
  emit_insn (gen_aarch64_load_tp_hard (target));
  return target;
}
/* On AAPCS systems, this is the "struct __va_list".  Created by
   aarch64_build_builtin_va_list below and garbage-collector rooted
   via GTY.  */
static GTY(()) tree va_list_type;
/* Implement TARGET_BUILD_BUILTIN_VA_LIST.
   Return the type to use as __builtin_va_list.

   AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:

     struct __va_list
     {
       void *__stack;
       void *__gr_top;
       void *__vr_top;
       int   __gr_offs;
       int   __vr_offs;
     };  */

static tree
aarch64_build_builtin_va_list (void)
{
  tree va_list_name;
  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;

  /* Create the type.  */
  va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
  /* Give it the required name.  */
  va_list_name = build_decl (BUILTINS_LOCATION,
			     TYPE_DECL,
			     get_identifier ("__va_list"),
			     va_list_type);
  DECL_ARTIFICIAL (va_list_name) = 1;
  TYPE_NAME (va_list_type) = va_list_name;
  TYPE_STUB_DECL (va_list_type) = va_list_name;

  /* Create the fields.  */
  f_stack = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__stack"),
			ptr_type_node);
  f_grtop = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__gr_top"),
			ptr_type_node);
  f_vrtop = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__vr_top"),
			ptr_type_node);
  f_groff = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__gr_offs"),
			integer_type_node);
  f_vroff = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__vr_offs"),
			integer_type_node);

  /* Tell tree-stdarg pass about our internal offset fields.
     NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
     purpose to identify whether the code is updating va_list internal
     offset fields through irregular way.  */
  va_list_gpr_counter_field = f_groff;
  va_list_fpr_counter_field = f_vroff;

  DECL_ARTIFICIAL (f_stack) = 1;
  DECL_ARTIFICIAL (f_grtop) = 1;
  DECL_ARTIFICIAL (f_vrtop) = 1;
  DECL_ARTIFICIAL (f_groff) = 1;
  DECL_ARTIFICIAL (f_vroff) = 1;

  DECL_FIELD_CONTEXT (f_stack) = va_list_type;
  DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
  DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
  DECL_FIELD_CONTEXT (f_groff) = va_list_type;
  DECL_FIELD_CONTEXT (f_vroff) = va_list_type;

  /* Chain the fields in declaration order.  */
  TYPE_FIELDS (va_list_type) = f_stack;
  DECL_CHAIN (f_stack) = f_grtop;
  DECL_CHAIN (f_grtop) = f_vrtop;
  DECL_CHAIN (f_vrtop) = f_groff;
  DECL_CHAIN (f_groff) = f_vroff;

  /* Compute its layout.  */
  layout_type (va_list_type);

  return va_list_type;
}
/* Implement TARGET_EXPAND_BUILTIN_VA_START.  Fill in the five __va_list
   fields for VALIST at function entry.  */
static void
aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
{
  const CUMULATIVE_ARGS *cum;
  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
  tree stack, grtop, vrtop, groff, vroff;
  tree t;
  int gr_save_area_size = cfun->va_list_gpr_size;
  int vr_save_area_size = cfun->va_list_fpr_size;
  int vr_offset;

  cum = &crtl->args.info;
  if (cfun->va_list_gpr_size)
    gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
			     cfun->va_list_gpr_size);
  if (cfun->va_list_fpr_size)
    vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
			     * UNITS_PER_VREG, cfun->va_list_fpr_size);

  if (!TARGET_FLOAT)
    {
      gcc_assert (cum->aapcs_nvrn == 0);
      vr_save_area_size = 0;
    }

  f_stack = TYPE_FIELDS (va_list_type_node);
  f_grtop = DECL_CHAIN (f_stack);
  f_vrtop = DECL_CHAIN (f_grtop);
  f_groff = DECL_CHAIN (f_vrtop);
  f_vroff = DECL_CHAIN (f_groff);

  stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
		  NULL_TREE);
  grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
		  NULL_TREE);
  vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
		  NULL_TREE);
  groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
		  NULL_TREE);
  vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
		  NULL_TREE);

  /* Emit code to initialize STACK, which points to the next varargs stack
     argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
     by named arguments.  STACK is 8-byte aligned.  */
  t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
  if (cum->aapcs_stack_size > 0)
    t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
  t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Emit code to initialize GRTOP, the top of the GR save area.
     virtual_incoming_args_rtx should have been 16 byte aligned.  */
  t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
  t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Emit code to initialize VRTOP, the top of the VR save area.
     This address is gr_save_area_bytes below GRTOP, rounded
     down to the next 16-byte boundary.  */
  t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
  vr_offset = ROUND_UP (gr_save_area_size,
			STACK_BOUNDARY / BITS_PER_UNIT);

  if (vr_offset)
    t = fold_build_pointer_plus_hwi (t, -vr_offset);
  t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Emit code to initialize GROFF, the offset from GRTOP of the
     next GPR argument.  */
  t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
	      build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Likewise emit code to initialize VROFF, the offset from FTOP
     of the next VR argument.  */
  t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
	      build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
}
/* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  Build the tree for one va_arg
   fetch of TYPE from VALIST: decide between the GR save area, the VR save
   area, and the stack overflow area, then load (and for homogeneous
   aggregates, repack) the value.  */

static tree
aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
			      gimple_seq *post_p ATTRIBUTE_UNUSED)
{
  tree addr;
  bool indirect_p;
  bool is_ha;		/* is HFA or HVA.  */
  bool dw_align;	/* double-word align.  */
  machine_mode ag_mode = VOIDmode;
  int nregs;
  machine_mode mode;

  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
  tree stack, f_top, f_off, off, arg, roundup, on_stack;
  HOST_WIDE_INT size, rsize, adjust, align;
  tree t, u, cond1, cond2;

  indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
  if (indirect_p)
    type = build_pointer_type (type);

  mode = TYPE_MODE (type);

  f_stack = TYPE_FIELDS (va_list_type_node);
  f_grtop = DECL_CHAIN (f_stack);
  f_vrtop = DECL_CHAIN (f_grtop);
  f_groff = DECL_CHAIN (f_vrtop);
  f_vroff = DECL_CHAIN (f_groff);

  stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
		  f_stack, NULL_TREE);
  size = int_size_in_bytes (type);
  align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;

  dw_align = false;
  adjust = 0;
  if (aarch64_vfp_is_call_or_return_candidate (mode,
					       type,
					       &ag_mode,
					       &nregs,
					       &is_ha))
    {
      /* TYPE passed in fp/simd registers.  */
      if (!TARGET_FLOAT)
	aarch64_err_no_fpadvsimd (mode, "varargs");

      f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
		      unshare_expr (valist), f_vrtop, NULL_TREE);
      f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
		      unshare_expr (valist), f_vroff, NULL_TREE);

      rsize = nregs * UNITS_PER_VREG;

      if (is_ha)
	{
	  if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
	    adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
	}
      else if (BLOCK_REG_PADDING (mode, type, 1) == downward
	       && size < UNITS_PER_VREG)
	{
	  adjust = UNITS_PER_VREG - size;
	}
    }
  else
    {
      /* TYPE passed in general registers.  */
      f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
		      unshare_expr (valist), f_grtop, NULL_TREE);
      f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
		      unshare_expr (valist), f_groff, NULL_TREE);
      rsize = ROUND_UP (size, UNITS_PER_WORD);
      nregs = rsize / UNITS_PER_WORD;

      if (align > 8)
	dw_align = true;

      if (BLOCK_REG_PADDING (mode, type, 1) == downward
	  && size < UNITS_PER_WORD)
	{
	  adjust = UNITS_PER_WORD - size;
	}
    }

  /* Get a local temporary for the field value.  */
  off = get_initialized_tmp_var (f_off, pre_p, NULL);

  /* Emit code to branch if off >= 0.  */
  t = build2 (GE_EXPR, boolean_type_node, off,
	      build_int_cst (TREE_TYPE (off), 0));
  cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);

  if (dw_align)
    {
      /* Emit: offs = (offs + 15) & -16.  */
      t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
		  build_int_cst (TREE_TYPE (off), 15));
      t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
		  build_int_cst (TREE_TYPE (off), -16));
      roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
    }
  else
    roundup = NULL;

  /* Update ap.__[g|v]r_offs  */
  t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
	      build_int_cst (TREE_TYPE (off), rsize));
  t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);

  /* String up.  */
  if (roundup)
    t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);

  /* [cond2] if (ap.__[g|v]r_offs > 0)  */
  u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
	      build_int_cst (TREE_TYPE (f_off), 0));
  cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);

  /* String up: make sure the assignment happens before the use.  */
  t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
  COND_EXPR_ELSE (cond1) = t;

  /* Prepare the trees handling the argument that is passed on the stack;
     the top level node will store in ON_STACK.  */
  arg = get_initialized_tmp_var (stack, pre_p, NULL);
  if (align > 8)
    {
      /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
      t = fold_convert (intDI_type_node, arg);
      t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
		  build_int_cst (TREE_TYPE (t), 15));
      t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
		  build_int_cst (TREE_TYPE (t), -16));
      t = fold_convert (TREE_TYPE (arg), t);
      roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
    }
  else
    roundup = NULL;
  /* Advance ap.__stack  */
  t = fold_convert (intDI_type_node, arg);
  t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
	      build_int_cst (TREE_TYPE (t), size + 7));
  t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
	      build_int_cst (TREE_TYPE (t), -8));
  t = fold_convert (TREE_TYPE (arg), t);
  t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
  /* String up roundup and advance.  */
  if (roundup)
    t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
  /* String up with arg */
  on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
  /* Big-endianness related address adjustment.  */
  if (BLOCK_REG_PADDING (mode, type, 1) == downward
      && size < UNITS_PER_WORD)
    {
      t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
		  size_int (UNITS_PER_WORD - size));
      on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
    }

  COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
  COND_EXPR_THEN (cond2) = unshare_expr (on_stack);

  /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
  t = off;
  if (adjust)
    t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
		build_int_cst (TREE_TYPE (off), adjust));

  t = fold_convert (sizetype, t);
  t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);

  if (is_ha)
    {
      /* type ha; // treat as "struct {ftype field[n];}"
	 ... [computing offs]
	 for (i = 0; i <nregs; ++i, offs += 16)
	   ha.field[i] = *((ftype *)(ap.__vr_top + offs));
	 return ha;  */
      int i;
      tree tmp_ha, field_t, field_ptr_t;

      /* Declare a local variable.  */
      tmp_ha = create_tmp_var_raw (type, "ha");
      gimple_add_tmp_var (tmp_ha);

      /* Establish the base type.  */
      switch (ag_mode)
	{
	case SFmode:
	  field_t = float_type_node;
	  field_ptr_t = float_ptr_type_node;
	  break;
	case DFmode:
	  field_t = double_type_node;
	  field_ptr_t = double_ptr_type_node;
	  break;
	case TFmode:
	  field_t = long_double_type_node;
	  field_ptr_t = long_double_ptr_type_node;
	  break;
	case HFmode:
	  field_t = aarch64_fp16_type_node;
	  field_ptr_t = aarch64_fp16_ptr_type_node;
	  break;
	case V2SImode:
	case V4SImode:
	    {
	      tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
	      field_t = build_vector_type_for_mode (innertype, ag_mode);
	      field_ptr_t = build_pointer_type (field_t);
	    }
	  break;
	default:
	  gcc_assert (0);
	}

      /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area  */
      tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
      addr = t;
      t = fold_convert (field_ptr_t, addr);
      t = build2 (MODIFY_EXPR, field_t,
		  build1 (INDIRECT_REF, field_t, tmp_ha),
		  build1 (INDIRECT_REF, field_t, t));

      /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
      for (i = 1; i < nregs; ++i)
	{
	  addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
	  u = fold_convert (field_ptr_t, addr);
	  u = build2 (MODIFY_EXPR, field_t,
		      build2 (MEM_REF, field_t, tmp_ha,
			      build_int_cst (field_ptr_t,
					     (i *
					      int_size_in_bytes (field_t)))),
		      build1 (INDIRECT_REF, field_t, u));
	  t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
	}

      u = fold_convert (TREE_TYPE (f_top), tmp_ha);
      t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
    }

  COND_EXPR_ELSE (cond2) = t;
  addr = fold_convert (build_pointer_type (type), cond1);
  addr = build_va_arg_indirect_ref (addr);

  if (indirect_p)
    addr = build_va_arg_indirect_ref (addr);

  return addr;
}
/* Implement TARGET_SETUP_INCOMING_VARARGS.  Spill the anonymous-argument
   GP and vector registers to the register save areas just below the
   incoming-arguments pointer, and record the save-area size in the frame
   layout.  */
static void
aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
				tree type, int *pretend_size ATTRIBUTE_UNUSED,
				int no_rtl)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
  CUMULATIVE_ARGS local_cum;
  int gr_saved = cfun->va_list_gpr_size;
  int vr_saved = cfun->va_list_fpr_size;

  /* The caller has advanced CUM up to, but not beyond, the last named
     argument.  Advance a local copy of CUM past the last "real" named
     argument, to find out how many registers are left over.  */
  local_cum = *cum;
  aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);

  /* Found out how many registers we need to save.
     Honor tree-stdvar analysis results.  */
  if (cfun->va_list_gpr_size)
    gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
		    cfun->va_list_gpr_size / UNITS_PER_WORD);
  if (cfun->va_list_fpr_size)
    vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
		    cfun->va_list_fpr_size / UNITS_PER_VREG);

  if (!TARGET_FLOAT)
    {
      gcc_assert (local_cum.aapcs_nvrn == 0);
      vr_saved = 0;
    }

  if (!no_rtl)
    {
      if (gr_saved > 0)
	{
	  rtx ptr, mem;

	  /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
	  ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
			       - gr_saved * UNITS_PER_WORD);
	  mem = gen_frame_mem (BLKmode, ptr);
	  set_mem_alias_set (mem, get_varargs_alias_set ());

	  move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
			       mem, gr_saved);
	}
      if (vr_saved > 0)
	{
	  /* We can't use move_block_from_reg, because it will use
	     the wrong mode, storing D regs only.  */
	  machine_mode mode = TImode;
	  int off, i, vr_start;

	  /* Set OFF to the offset from virtual_incoming_args_rtx of
	     the first vector register.  The VR save area lies below
	     the GR one, and is aligned to 16 bytes.  */
	  off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
			   STACK_BOUNDARY / BITS_PER_UNIT);
	  off -= vr_saved * UNITS_PER_VREG;

	  vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
	  for (i = 0; i < vr_saved; ++i)
	    {
	      rtx ptr, mem;

	      ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
	      mem = gen_frame_mem (mode, ptr);
	      set_mem_alias_set (mem, get_varargs_alias_set ());
	      aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
	      off += UNITS_PER_VREG;
	    }
	}
    }

  /* We don't save the size into *PRETEND_SIZE because we want to avoid
     any complication of having crtl->args.pretend_args_size changed.  */
  cfun->machine->frame.saved_varargs_size
    = (ROUND_UP (gr_saved * UNITS_PER_WORD,
		 STACK_BOUNDARY / BITS_PER_UNIT)
       + vr_saved * UNITS_PER_VREG);
}
/* Implement TARGET_CONDITIONAL_REGISTER_USAGE.  When the FP/SIMD unit is
   unavailable, mark all vector registers as fixed and call-used so the
   register allocator never touches them.  */
static void
aarch64_conditional_register_usage (void)
{
  int i;
  if (!TARGET_FLOAT)
    {
      for (i = V0_REGNUM; i <= V31_REGNUM; i++)
	{
	  fixed_regs[i] = 1;
	  call_used_regs[i] = 1;
	}
    }
}
/* Walk down the type tree of TYPE counting consecutive base elements.
   If *MODEP is VOIDmode, then set it to the first valid floating point
   type.  If a non-floating point type is found, or if a floating point
   type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
   otherwise return the count in the sub-tree.  */

static int
aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
{
  machine_mode mode;
  HOST_WIDE_INT size;

  switch (TREE_CODE (type))
    {
    case REAL_TYPE:
      mode = TYPE_MODE (type);
      if (mode != DFmode && mode != SFmode
	  && mode != TFmode && mode != HFmode)
	return -1;

      if (*modep == VOIDmode)
	*modep = mode;

      if (*modep == mode)
	return 1;

      break;

    case COMPLEX_TYPE:
      /* A complex counts as two scalars of its component mode.  */
      mode = TYPE_MODE (TREE_TYPE (type));
      if (mode != DFmode && mode != SFmode
	  && mode != TFmode && mode != HFmode)
	return -1;

      if (*modep == VOIDmode)
	*modep = mode;

      if (*modep == mode)
	return 2;

      break;

    case VECTOR_TYPE:
      /* Use V2SImode and V4SImode as representatives of all 64-bit
	 and 128-bit vector types.  */
      size = int_size_in_bytes (type);
      switch (size)
	{
	case 8:
	  mode = V2SImode;
	  break;
	case 16:
	  mode = V4SImode;
	  break;
	default:
	  return -1;
	}

      if (*modep == VOIDmode)
	*modep = mode;

      /* Vector modes are considered to be opaque: two vectors are
	 equivalent for the purposes of being homogeneous aggregates
	 if they are the same size.  */
      if (*modep == mode)
	return 1;

      break;

    case ARRAY_TYPE:
      {
	int count;
	tree index = TYPE_DOMAIN (type);

	/* Can't handle incomplete types nor sizes that are not
	   fixed.  */
	if (!COMPLETE_TYPE_P (type)
	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
	  return -1;

	count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
	if (count == -1
	    || !index
	    || !TYPE_MAX_VALUE (index)
	    || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
	    || !TYPE_MIN_VALUE (index)
	    || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
	    || count < 0)
	  return -1;

	count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
		      - tree_to_uhwi (TYPE_MIN_VALUE (index)));

	/* There must be no padding.  */
	if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
	  return -1;

	return count;
      }

    case RECORD_TYPE:
      {
	int count = 0;
	int sub_count;
	tree field;

	/* Can't handle incomplete types nor sizes that are not
	   fixed.  */
	if (!COMPLETE_TYPE_P (type)
	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
	  return -1;

	for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
	  {
	    if (TREE_CODE (field) != FIELD_DECL)
	      continue;

	    sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
	    if (sub_count < 0)
	      return -1;
	    count += sub_count;
	  }

	/* There must be no padding.  */
	if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
	  return -1;

	return count;
      }

    case UNION_TYPE:
    case QUAL_UNION_TYPE:
      {
	/* These aren't very interesting except in a degenerate case.  */
	int count = 0;
	int sub_count;
	tree field;

	/* Can't handle incomplete types nor sizes that are not
	   fixed.  */
	if (!COMPLETE_TYPE_P (type)
	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
	  return -1;

	for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
	  {
	    if (TREE_CODE (field) != FIELD_DECL)
	      continue;

	    sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
	    if (sub_count < 0)
	      return -1;
	    /* Union members overlap, so take the largest member.  */
	    count = count > sub_count ? count : sub_count;
	  }

	/* There must be no padding.  */
	if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
	  return -1;

	return count;
      }

    default:
      break;
    }

  return -1;
}
11074 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
11075 type as described in AAPCS64 \S 4.1.2.
11077 See the comment above aarch64_composite_type_p for the notes on MODE. */
11080 aarch64_short_vector_p (const_tree type
,
11083 HOST_WIDE_INT size
= -1;
11085 if (type
&& TREE_CODE (type
) == VECTOR_TYPE
)
11086 size
= int_size_in_bytes (type
);
11087 else if (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
11088 || GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
)
11089 size
= GET_MODE_SIZE (mode
);
11091 return (size
== 8 || size
== 16);
/* Return TRUE if the type, as described by TYPE and MODE, is a composite
   type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
   array types.  The C99 floating-point complex types are also considered
   as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
   types, which are GCC extensions and out of the scope of AAPCS64, are
   treated as composite types here as well.

   Note that MODE itself is not sufficient in determining whether a type
   is such a composite type or not.  This is because
   stor-layout.c:compute_record_mode may have already changed the MODE
   (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
   structure with only one field may have its MODE set to the mode of the
   field.  Also an integer mode whose size matches the size of the
   RECORD_TYPE type may be used to substitute the original mode
   (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
   solely relied on.  */

static bool
aarch64_composite_type_p (const_tree type,
			  machine_mode mode)
{
  /* Short vectors are explicitly excluded from being composite.  */
  if (aarch64_short_vector_p (type, mode))
    return false;

  if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
    return true;

  /* Fall back on the mode when no type information is available.  */
  if (mode == BLKmode
      || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
      || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
    return true;

  return false;
}
11129 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
11130 shall be passed or returned in simd/fp register(s) (providing these
11131 parameter passing registers are available).
11133 Upon successful return, *COUNT returns the number of needed registers,
11134 *BASE_MODE returns the mode of the individual register and when IS_HAF
11135 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
11136 floating-point aggregate or a homogeneous short-vector aggregate. */
11139 aarch64_vfp_is_call_or_return_candidate (machine_mode mode
,
11141 machine_mode
*base_mode
,
11145 machine_mode new_mode
= VOIDmode
;
11146 bool composite_p
= aarch64_composite_type_p (type
, mode
);
11148 if (is_ha
!= NULL
) *is_ha
= false;
11150 if ((!composite_p
&& GET_MODE_CLASS (mode
) == MODE_FLOAT
)
11151 || aarch64_short_vector_p (type
, mode
))
11156 else if (GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
)
11158 if (is_ha
!= NULL
) *is_ha
= true;
11160 new_mode
= GET_MODE_INNER (mode
);
11162 else if (type
&& composite_p
)
11164 int ag_count
= aapcs_vfp_sub_candidate (type
, &new_mode
);
11166 if (ag_count
> 0 && ag_count
<= HA_MAX_NUM_FLDS
)
11168 if (is_ha
!= NULL
) *is_ha
= true;
11177 *base_mode
= new_mode
;
11181 /* Implement TARGET_STRUCT_VALUE_RTX. */
11184 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED
,
11185 int incoming ATTRIBUTE_UNUSED
)
11187 return gen_rtx_REG (Pmode
, AARCH64_STRUCT_VALUE_REGNUM
);
11190 /* Implements target hook vector_mode_supported_p. */
11192 aarch64_vector_mode_supported_p (machine_mode mode
)
11195 && (mode
== V4SImode
|| mode
== V8HImode
11196 || mode
== V16QImode
|| mode
== V2DImode
11197 || mode
== V2SImode
|| mode
== V4HImode
11198 || mode
== V8QImode
|| mode
== V2SFmode
11199 || mode
== V4SFmode
|| mode
== V2DFmode
11200 || mode
== V4HFmode
|| mode
== V8HFmode
11201 || mode
== V1DFmode
))
11207 /* Return appropriate SIMD container
11208 for MODE within a vector of WIDTH bits. */
11209 static machine_mode
11210 aarch64_simd_container_mode (machine_mode mode
, unsigned width
)
11212 gcc_assert (width
== 64 || width
== 128);
11255 /* Return 128-bit container as the preferred SIMD mode for MODE. */
11256 static machine_mode
11257 aarch64_preferred_simd_mode (machine_mode mode
)
11259 return aarch64_simd_container_mode (mode
, 128);
11262 /* Return the bitmask of possible vector sizes for the vectorizer
11263 to iterate over. */
11264 static unsigned int
11265 aarch64_autovectorize_vector_sizes (void)
11270 /* Implement TARGET_MANGLE_TYPE. */
11272 static const char *
11273 aarch64_mangle_type (const_tree type
)
11275 /* The AArch64 ABI documents say that "__va_list" has to be
11276 managled as if it is in the "std" namespace. */
11277 if (lang_hooks
.types_compatible_p (CONST_CAST_TREE (type
), va_list_type
))
11278 return "St9__va_list";
11280 /* Half-precision float. */
11281 if (TREE_CODE (type
) == REAL_TYPE
&& TYPE_PRECISION (type
) == 16)
11284 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
11286 if (TYPE_NAME (type
) != NULL
)
11287 return aarch64_mangle_builtin_type (type
);
11289 /* Use the default mangling. */
11293 /* Find the first rtx_insn before insn that will generate an assembly
11297 aarch64_prev_real_insn (rtx_insn
*insn
)
11304 insn
= prev_real_insn (insn
);
11306 while (insn
&& recog_memoized (insn
) < 0);
11312 is_madd_op (enum attr_type t1
)
11315 /* A number of these may be AArch32 only. */
11316 enum attr_type mlatypes
[] = {
11317 TYPE_MLA
, TYPE_MLAS
, TYPE_SMLAD
, TYPE_SMLADX
, TYPE_SMLAL
, TYPE_SMLALD
,
11318 TYPE_SMLALS
, TYPE_SMLALXY
, TYPE_SMLAWX
, TYPE_SMLAWY
, TYPE_SMLAXY
,
11319 TYPE_SMMLA
, TYPE_UMLAL
, TYPE_UMLALS
,TYPE_SMLSD
, TYPE_SMLSDX
, TYPE_SMLSLD
11322 for (i
= 0; i
< sizeof (mlatypes
) / sizeof (enum attr_type
); i
++)
11324 if (t1
== mlatypes
[i
])
11331 /* Check if there is a register dependency between a load and the insn
11332 for which we hold recog_data. */
11335 dep_between_memop_and_curr (rtx memop
)
11340 gcc_assert (GET_CODE (memop
) == SET
);
11342 if (!REG_P (SET_DEST (memop
)))
11345 load_reg
= SET_DEST (memop
);
11346 for (opno
= 1; opno
< recog_data
.n_operands
; opno
++)
11348 rtx operand
= recog_data
.operand
[opno
];
11349 if (REG_P (operand
)
11350 && reg_overlap_mentioned_p (load_reg
, operand
))
11358 /* When working around the Cortex-A53 erratum 835769,
11359 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
11360 instruction and has a preceding memory instruction such that a NOP
11361 should be inserted between them. */
11364 aarch64_madd_needs_nop (rtx_insn
* insn
)
11366 enum attr_type attr_type
;
11370 if (!TARGET_FIX_ERR_A53_835769
)
11373 if (!INSN_P (insn
) || recog_memoized (insn
) < 0)
11376 attr_type
= get_attr_type (insn
);
11377 if (!is_madd_op (attr_type
))
11380 prev
= aarch64_prev_real_insn (insn
);
11381 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
11382 Restore recog state to INSN to avoid state corruption. */
11383 extract_constrain_insn_cached (insn
);
11385 if (!prev
|| !contains_mem_rtx_p (PATTERN (prev
)))
11388 body
= single_set (prev
);
11390 /* If the previous insn is a memory op and there is no dependency between
11391 it and the DImode madd, emit a NOP between them. If body is NULL then we
11392 have a complex memory operation, probably a load/store pair.
11393 Be conservative for now and emit a NOP. */
11394 if (GET_MODE (recog_data
.operand
[0]) == DImode
11395 && (!body
|| !dep_between_memop_and_curr (body
)))
11403 /* Implement FINAL_PRESCAN_INSN. */
11406 aarch64_final_prescan_insn (rtx_insn
*insn
)
11408 if (aarch64_madd_needs_nop (insn
))
11409 fprintf (asm_out_file
, "\tnop // between mem op and mult-accumulate\n");
11413 /* Return the equivalent letter for size. */
11415 sizetochar (int size
)
11419 case 64: return 'd';
11420 case 32: return 's';
11421 case 16: return 'h';
11422 case 8 : return 'b';
11423 default: gcc_unreachable ();
11427 /* Return true iff x is a uniform vector of floating-point
11428 constants, and the constant can be represented in
11429 quarter-precision form. Note, as aarch64_float_const_representable
11430 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
11432 aarch64_vect_float_const_representable_p (rtx x
)
11435 return (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
11436 && const_vec_duplicate_p (x
, &elt
)
11437 && aarch64_float_const_representable_p (elt
));
11440 /* Return true for valid and false for invalid. */
11442 aarch64_simd_valid_immediate (rtx op
, machine_mode mode
, bool inverse
,
11443 struct simd_immediate_info
*info
)
11445 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
11447 for (i = 0; i < idx; i += (STRIDE)) \
11452 immtype = (CLASS); \
11453 elsize = (ELSIZE); \
11454 eshift = (SHIFT); \
11459 unsigned int i
, elsize
= 0, idx
= 0, n_elts
= CONST_VECTOR_NUNITS (op
);
11460 unsigned int innersize
= GET_MODE_UNIT_SIZE (mode
);
11461 unsigned char bytes
[16];
11462 int immtype
= -1, matches
;
11463 unsigned int invmask
= inverse
? 0xff : 0;
11466 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
)
11468 if (! (aarch64_simd_imm_zero_p (op
, mode
)
11469 || aarch64_vect_float_const_representable_p (op
)))
11474 info
->value
= CONST_VECTOR_ELT (op
, 0);
11475 info
->element_width
= GET_MODE_BITSIZE (GET_MODE (info
->value
));
11483 /* Splat vector constant out into a byte vector. */
11484 for (i
= 0; i
< n_elts
; i
++)
11486 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
11487 it must be laid out in the vector register in reverse order. */
11488 rtx el
= CONST_VECTOR_ELT (op
, BYTES_BIG_ENDIAN
? (n_elts
- 1 - i
) : i
);
11489 unsigned HOST_WIDE_INT elpart
;
11491 gcc_assert (CONST_INT_P (el
));
11492 elpart
= INTVAL (el
);
11494 for (unsigned int byte
= 0; byte
< innersize
; byte
++)
11496 bytes
[idx
++] = (elpart
& 0xff) ^ invmask
;
11497 elpart
>>= BITS_PER_UNIT
;
11502 /* Sanity check. */
11503 gcc_assert (idx
== GET_MODE_SIZE (mode
));
11507 CHECK (4, 32, 0, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0
11508 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == 0, 0, 0);
11510 CHECK (4, 32, 1, bytes
[i
] == 0 && bytes
[i
+ 1] == bytes
[1]
11511 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == 0, 8, 0);
11513 CHECK (4, 32, 2, bytes
[i
] == 0 && bytes
[i
+ 1] == 0
11514 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0, 16, 0);
11516 CHECK (4, 32, 3, bytes
[i
] == 0 && bytes
[i
+ 1] == 0
11517 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == bytes
[3], 24, 0);
11519 CHECK (2, 16, 4, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0, 0, 0);
11521 CHECK (2, 16, 5, bytes
[i
] == 0 && bytes
[i
+ 1] == bytes
[1], 8, 0);
11523 CHECK (4, 32, 6, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0xff
11524 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == 0xff, 0, 1);
11526 CHECK (4, 32, 7, bytes
[i
] == 0xff && bytes
[i
+ 1] == bytes
[1]
11527 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == 0xff, 8, 1);
11529 CHECK (4, 32, 8, bytes
[i
] == 0xff && bytes
[i
+ 1] == 0xff
11530 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0xff, 16, 1);
11532 CHECK (4, 32, 9, bytes
[i
] == 0xff && bytes
[i
+ 1] == 0xff
11533 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == bytes
[3], 24, 1);
11535 CHECK (2, 16, 10, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0xff, 0, 1);
11537 CHECK (2, 16, 11, bytes
[i
] == 0xff && bytes
[i
+ 1] == bytes
[1], 8, 1);
11539 CHECK (4, 32, 12, bytes
[i
] == 0xff && bytes
[i
+ 1] == bytes
[1]
11540 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == 0, 8, 0);
11542 CHECK (4, 32, 13, bytes
[i
] == 0 && bytes
[i
+ 1] == bytes
[1]
11543 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == 0xff, 8, 1);
11545 CHECK (4, 32, 14, bytes
[i
] == 0xff && bytes
[i
+ 1] == 0xff
11546 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0, 16, 0);
11548 CHECK (4, 32, 15, bytes
[i
] == 0 && bytes
[i
+ 1] == 0
11549 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0xff, 16, 1);
11551 CHECK (1, 8, 16, bytes
[i
] == bytes
[0], 0, 0);
11553 CHECK (1, 64, 17, (bytes
[i
] == 0 || bytes
[i
] == 0xff)
11554 && bytes
[i
] == bytes
[(i
+ 8) % idx
], 0, 0);
11563 info
->element_width
= elsize
;
11564 info
->mvn
= emvn
!= 0;
11565 info
->shift
= eshift
;
11567 unsigned HOST_WIDE_INT imm
= 0;
11569 if (immtype
>= 12 && immtype
<= 15)
11572 /* Un-invert bytes of recognized vector, if necessary. */
11574 for (i
= 0; i
< idx
; i
++)
11575 bytes
[i
] ^= invmask
;
11579 /* FIXME: Broken on 32-bit H_W_I hosts. */
11580 gcc_assert (sizeof (HOST_WIDE_INT
) == 8);
11582 for (i
= 0; i
< 8; i
++)
11583 imm
|= (unsigned HOST_WIDE_INT
) (bytes
[i
] ? 0xff : 0)
11584 << (i
* BITS_PER_UNIT
);
11587 info
->value
= GEN_INT (imm
);
11591 for (i
= 0; i
< elsize
/ BITS_PER_UNIT
; i
++)
11592 imm
|= (unsigned HOST_WIDE_INT
) bytes
[i
] << (i
* BITS_PER_UNIT
);
11594 /* Construct 'abcdefgh' because the assembler cannot handle
11595 generic constants. */
11598 imm
= (imm
>> info
->shift
) & 0xff;
11599 info
->value
= GEN_INT (imm
);
11607 /* Check of immediate shift constants are within range. */
11609 aarch64_simd_shift_imm_p (rtx x
, machine_mode mode
, bool left
)
11611 int bit_width
= GET_MODE_UNIT_SIZE (mode
) * BITS_PER_UNIT
;
11613 return aarch64_const_vec_all_same_in_range_p (x
, 0, bit_width
- 1);
11615 return aarch64_const_vec_all_same_in_range_p (x
, 1, bit_width
);
11618 /* Return true if X is a uniform vector where all elements
11619 are either the floating-point constant 0.0 or the
11620 integer constant 0. */
11622 aarch64_simd_imm_zero_p (rtx x
, machine_mode mode
)
11624 return x
== CONST0_RTX (mode
);
11628 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11629 operation of width WIDTH at bit position POS. */
11632 aarch64_mask_from_zextract_ops (rtx width
, rtx pos
)
11634 gcc_assert (CONST_INT_P (width
));
11635 gcc_assert (CONST_INT_P (pos
));
11637 unsigned HOST_WIDE_INT mask
11638 = ((unsigned HOST_WIDE_INT
) 1 << UINTVAL (width
)) - 1;
11639 return GEN_INT (mask
<< UINTVAL (pos
));
11643 aarch64_mov_operand_p (rtx x
, machine_mode mode
)
11645 if (GET_CODE (x
) == HIGH
11646 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
11649 if (CONST_INT_P (x
))
11652 if (GET_CODE (x
) == SYMBOL_REF
&& mode
== DImode
&& CONSTANT_ADDRESS_P (x
))
11655 return aarch64_classify_symbolic_expression (x
)
11656 == SYMBOL_TINY_ABSOLUTE
;
11659 /* Return a const_int vector of VAL. */
11661 aarch64_simd_gen_const_vector_dup (machine_mode mode
, HOST_WIDE_INT val
)
11663 int nunits
= GET_MODE_NUNITS (mode
);
11664 rtvec v
= rtvec_alloc (nunits
);
11667 rtx cache
= GEN_INT (val
);
11669 for (i
=0; i
< nunits
; i
++)
11670 RTVEC_ELT (v
, i
) = cache
;
11672 return gen_rtx_CONST_VECTOR (mode
, v
);
11675 /* Check OP is a legal scalar immediate for the MOVI instruction. */
11678 aarch64_simd_scalar_immediate_valid_for_move (rtx op
, machine_mode mode
)
11680 machine_mode vmode
;
11682 gcc_assert (!VECTOR_MODE_P (mode
));
11683 vmode
= aarch64_preferred_simd_mode (mode
);
11684 rtx op_v
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (op
));
11685 return aarch64_simd_valid_immediate (op_v
, vmode
, false, NULL
);
11688 /* Construct and return a PARALLEL RTX vector with elements numbering the
11689 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11690 the vector - from the perspective of the architecture. This does not
11691 line up with GCC's perspective on lane numbers, so we end up with
11692 different masks depending on our target endian-ness. The diagram
11693 below may help. We must draw the distinction when building masks
11694 which select one half of the vector. An instruction selecting
11695 architectural low-lanes for a big-endian target, must be described using
11696 a mask selecting GCC high-lanes.
11698 Big-Endian Little-Endian
11700 GCC 0 1 2 3 3 2 1 0
11701 | x | x | x | x | | x | x | x | x |
11702 Architecture 3 2 1 0 3 2 1 0
11704 Low Mask: { 2, 3 } { 0, 1 }
11705 High Mask: { 0, 1 } { 2, 3 }
11709 aarch64_simd_vect_par_cnst_half (machine_mode mode
, bool high
)
11711 int nunits
= GET_MODE_NUNITS (mode
);
11712 rtvec v
= rtvec_alloc (nunits
/ 2);
11713 int high_base
= nunits
/ 2;
11719 if (BYTES_BIG_ENDIAN
)
11720 base
= high
? low_base
: high_base
;
11722 base
= high
? high_base
: low_base
;
11724 for (i
= 0; i
< nunits
/ 2; i
++)
11725 RTVEC_ELT (v
, i
) = GEN_INT (base
+ i
);
11727 t1
= gen_rtx_PARALLEL (mode
, v
);
11731 /* Check OP for validity as a PARALLEL RTX vector with elements
11732 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11733 from the perspective of the architecture. See the diagram above
11734 aarch64_simd_vect_par_cnst_half for more details. */
11737 aarch64_simd_check_vect_par_cnst_half (rtx op
, machine_mode mode
,
11740 rtx ideal
= aarch64_simd_vect_par_cnst_half (mode
, high
);
11741 HOST_WIDE_INT count_op
= XVECLEN (op
, 0);
11742 HOST_WIDE_INT count_ideal
= XVECLEN (ideal
, 0);
11745 if (!VECTOR_MODE_P (mode
))
11748 if (count_op
!= count_ideal
)
11751 for (i
= 0; i
< count_ideal
; i
++)
11753 rtx elt_op
= XVECEXP (op
, 0, i
);
11754 rtx elt_ideal
= XVECEXP (ideal
, 0, i
);
11756 if (!CONST_INT_P (elt_op
)
11757 || INTVAL (elt_ideal
) != INTVAL (elt_op
))
11763 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11764 HIGH (exclusive). */
11766 aarch64_simd_lane_bounds (rtx operand
, HOST_WIDE_INT low
, HOST_WIDE_INT high
,
11769 HOST_WIDE_INT lane
;
11770 gcc_assert (CONST_INT_P (operand
));
11771 lane
= INTVAL (operand
);
11773 if (lane
< low
|| lane
>= high
)
11776 error ("%Klane %wd out of range %wd - %wd", exp
, lane
, low
, high
- 1);
11778 error ("lane %wd out of range %wd - %wd", lane
, low
, high
- 1);
11782 /* Return TRUE if OP is a valid vector addressing mode. */
11784 aarch64_simd_mem_operand_p (rtx op
)
11786 return MEM_P (op
) && (GET_CODE (XEXP (op
, 0)) == POST_INC
11787 || REG_P (XEXP (op
, 0)));
11790 /* Emit a register copy from operand to operand, taking care not to
11791 early-clobber source registers in the process.
11793 COUNT is the number of components into which the copy needs to be
11796 aarch64_simd_emit_reg_reg_move (rtx
*operands
, machine_mode mode
,
11797 unsigned int count
)
11800 int rdest
= REGNO (operands
[0]);
11801 int rsrc
= REGNO (operands
[1]);
11803 if (!reg_overlap_mentioned_p (operands
[0], operands
[1])
11805 for (i
= 0; i
< count
; i
++)
11806 emit_move_insn (gen_rtx_REG (mode
, rdest
+ i
),
11807 gen_rtx_REG (mode
, rsrc
+ i
));
11809 for (i
= 0; i
< count
; i
++)
11810 emit_move_insn (gen_rtx_REG (mode
, rdest
+ count
- i
- 1),
11811 gen_rtx_REG (mode
, rsrc
+ count
- i
- 1));
11814 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11815 one of VSTRUCT modes: OI, CI, or XI. */
11817 aarch64_simd_attr_length_rglist (machine_mode mode
)
11819 return (GET_MODE_SIZE (mode
) / UNITS_PER_VREG
) * 4;
11822 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
11823 alignment of a vector to 128 bits. */
11824 static HOST_WIDE_INT
11825 aarch64_simd_vector_alignment (const_tree type
)
11827 HOST_WIDE_INT align
= tree_to_shwi (TYPE_SIZE (type
));
11828 return MIN (align
, 128);
11831 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11833 aarch64_simd_vector_alignment_reachable (const_tree type
, bool is_packed
)
11838 /* We guarantee alignment for vectors up to 128-bits. */
11839 if (tree_int_cst_compare (TYPE_SIZE (type
),
11840 bitsize_int (BIGGEST_ALIGNMENT
)) > 0)
11843 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11847 /* Return true if the vector misalignment factor is supported by the
11850 aarch64_builtin_support_vector_misalignment (machine_mode mode
,
11851 const_tree type
, int misalignment
,
11854 if (TARGET_SIMD
&& STRICT_ALIGNMENT
)
11856 /* Return if movmisalign pattern is not supported for this mode. */
11857 if (optab_handler (movmisalign_optab
, mode
) == CODE_FOR_nothing
)
11860 if (misalignment
== -1)
11862 /* Misalignment factor is unknown at compile time but we know
11863 it's word aligned. */
11864 if (aarch64_simd_vector_alignment_reachable (type
, is_packed
))
11866 int element_size
= TREE_INT_CST_LOW (TYPE_SIZE (type
));
11868 if (element_size
!= 64)
11874 return default_builtin_support_vector_misalignment (mode
, type
, misalignment
,
11878 /* If VALS is a vector constant that can be loaded into a register
11879 using DUP, generate instructions to do so and return an RTX to
11880 assign to the register. Otherwise return NULL_RTX. */
11882 aarch64_simd_dup_constant (rtx vals
)
11884 machine_mode mode
= GET_MODE (vals
);
11885 machine_mode inner_mode
= GET_MODE_INNER (mode
);
11888 if (!const_vec_duplicate_p (vals
, &x
))
11891 /* We can load this constant by using DUP and a constant in a
11892 single ARM register. This will be cheaper than a vector
11894 x
= copy_to_mode_reg (inner_mode
, x
);
11895 return gen_rtx_VEC_DUPLICATE (mode
, x
);
11899 /* Generate code to load VALS, which is a PARALLEL containing only
11900 constants (for vec_init) or CONST_VECTOR, efficiently into a
11901 register. Returns an RTX to copy into the register, or NULL_RTX
11902 for a PARALLEL that can not be converted into a CONST_VECTOR. */
11904 aarch64_simd_make_constant (rtx vals
)
11906 machine_mode mode
= GET_MODE (vals
);
11908 rtx const_vec
= NULL_RTX
;
11909 int n_elts
= GET_MODE_NUNITS (mode
);
11913 if (GET_CODE (vals
) == CONST_VECTOR
)
11915 else if (GET_CODE (vals
) == PARALLEL
)
11917 /* A CONST_VECTOR must contain only CONST_INTs and
11918 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11919 Only store valid constants in a CONST_VECTOR. */
11920 for (i
= 0; i
< n_elts
; ++i
)
11922 rtx x
= XVECEXP (vals
, 0, i
);
11923 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
11926 if (n_const
== n_elts
)
11927 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0));
11930 gcc_unreachable ();
11932 if (const_vec
!= NULL_RTX
11933 && aarch64_simd_valid_immediate (const_vec
, mode
, false, NULL
))
11934 /* Load using MOVI/MVNI. */
11936 else if ((const_dup
= aarch64_simd_dup_constant (vals
)) != NULL_RTX
)
11937 /* Loaded using DUP. */
11939 else if (const_vec
!= NULL_RTX
)
11940 /* Load from constant pool. We can not take advantage of single-cycle
11941 LD1 because we need a PC-relative addressing mode. */
11944 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11945 We can not construct an initializer. */
11949 /* Expand a vector initialisation sequence, such that TARGET is
11950 initialised to contain VALS. */
11953 aarch64_expand_vector_init (rtx target
, rtx vals
)
11955 machine_mode mode
= GET_MODE (target
);
11956 machine_mode inner_mode
= GET_MODE_INNER (mode
);
11957 /* The number of vector elements. */
11958 int n_elts
= GET_MODE_NUNITS (mode
);
11959 /* The number of vector elements which are not constant. */
11961 rtx any_const
= NULL_RTX
;
11962 /* The first element of vals. */
11963 rtx v0
= XVECEXP (vals
, 0, 0);
11964 bool all_same
= true;
11966 /* Count the number of variable elements to initialise. */
11967 for (int i
= 0; i
< n_elts
; ++i
)
11969 rtx x
= XVECEXP (vals
, 0, i
);
11970 if (!(CONST_INT_P (x
) || CONST_DOUBLE_P (x
)))
11975 all_same
&= rtx_equal_p (x
, v0
);
11978 /* No variable elements, hand off to aarch64_simd_make_constant which knows
11979 how best to handle this. */
11982 rtx constant
= aarch64_simd_make_constant (vals
);
11983 if (constant
!= NULL_RTX
)
11985 emit_move_insn (target
, constant
);
11990 /* Splat a single non-constant element if we can. */
11993 rtx x
= copy_to_mode_reg (inner_mode
, v0
);
11994 aarch64_emit_move (target
, gen_rtx_VEC_DUPLICATE (mode
, x
));
11998 enum insn_code icode
= optab_handler (vec_set_optab
, mode
);
11999 gcc_assert (icode
!= CODE_FOR_nothing
);
12001 /* If there are only variable elements, try to optimize
12002 the insertion using dup for the most common element
12003 followed by insertions. */
12005 /* The algorithm will fill matches[*][0] with the earliest matching element,
12006 and matches[X][1] with the count of duplicate elements (if X is the
12007 earliest element which has duplicates). */
12009 if (n_var
== n_elts
&& n_elts
<= 16)
12011 int matches
[16][2] = {0};
12012 for (int i
= 0; i
< n_elts
; i
++)
12014 for (int j
= 0; j
<= i
; j
++)
12016 if (rtx_equal_p (XVECEXP (vals
, 0, i
), XVECEXP (vals
, 0, j
)))
12024 int maxelement
= 0;
12026 for (int i
= 0; i
< n_elts
; i
++)
12027 if (matches
[i
][1] > maxv
)
12030 maxv
= matches
[i
][1];
12033 /* Create a duplicate of the most common element. */
12034 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
12035 aarch64_emit_move (target
, gen_rtx_VEC_DUPLICATE (mode
, x
));
12037 /* Insert the rest. */
12038 for (int i
= 0; i
< n_elts
; i
++)
12040 rtx x
= XVECEXP (vals
, 0, i
);
12041 if (matches
[i
][0] == maxelement
)
12043 x
= copy_to_mode_reg (inner_mode
, x
);
12044 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
12049 /* Initialise a vector which is part-variable. We want to first try
12050 to build those lanes which are constant in the most efficient way we
12052 if (n_var
!= n_elts
)
12054 rtx copy
= copy_rtx (vals
);
12056 /* Load constant part of vector. We really don't care what goes into the
12057 parts we will overwrite, but we're more likely to be able to load the
12058 constant efficiently if it has fewer, larger, repeating parts
12059 (see aarch64_simd_valid_immediate). */
12060 for (int i
= 0; i
< n_elts
; i
++)
12062 rtx x
= XVECEXP (vals
, 0, i
);
12063 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
12065 rtx subst
= any_const
;
12066 for (int bit
= n_elts
/ 2; bit
> 0; bit
/= 2)
12068 /* Look in the copied vector, as more elements are const. */
12069 rtx test
= XVECEXP (copy
, 0, i
^ bit
);
12070 if (CONST_INT_P (test
) || CONST_DOUBLE_P (test
))
12076 XVECEXP (copy
, 0, i
) = subst
;
12078 aarch64_expand_vector_init (target
, copy
);
12081 /* Insert the variable lanes directly. */
12082 for (int i
= 0; i
< n_elts
; i
++)
12084 rtx x
= XVECEXP (vals
, 0, i
);
12085 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
12087 x
= copy_to_mode_reg (inner_mode
, x
);
12088 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
12092 static unsigned HOST_WIDE_INT
12093 aarch64_shift_truncation_mask (machine_mode mode
)
12096 (!SHIFT_COUNT_TRUNCATED
12097 || aarch64_vector_mode_supported_p (mode
)
12098 || aarch64_vect_struct_mode_p (mode
)) ? 0 : (GET_MODE_BITSIZE (mode
) - 1);
12101 /* Select a format to encode pointers in exception handling data. */
12103 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED
, int global
)
12106 switch (aarch64_cmodel
)
12108 case AARCH64_CMODEL_TINY
:
12109 case AARCH64_CMODEL_TINY_PIC
:
12110 case AARCH64_CMODEL_SMALL
:
12111 case AARCH64_CMODEL_SMALL_PIC
:
12112 case AARCH64_CMODEL_SMALL_SPIC
:
12113 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
12115 type
= DW_EH_PE_sdata4
;
12118 /* No assumptions here. 8-byte relocs required. */
12119 type
= DW_EH_PE_sdata8
;
12122 return (global
? DW_EH_PE_indirect
: 0) | DW_EH_PE_pcrel
| type
;
/* The last .arch and .tune assembly strings that we printed.  Used to
   suppress redundant directives between consecutive functions.  */
static std::string aarch64_last_printed_arch_string;
static std::string aarch64_last_printed_tune_string;
12129 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
12130 by the function fndecl. */
12133 aarch64_declare_function_name (FILE *stream
, const char* name
,
12136 tree target_parts
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
12138 struct cl_target_option
*targ_options
;
12140 targ_options
= TREE_TARGET_OPTION (target_parts
);
12142 targ_options
= TREE_TARGET_OPTION (target_option_current_node
);
12143 gcc_assert (targ_options
);
12145 const struct processor
*this_arch
12146 = aarch64_get_arch (targ_options
->x_explicit_arch
);
12148 unsigned long isa_flags
= targ_options
->x_aarch64_isa_flags
;
12149 std::string extension
12150 = aarch64_get_extension_string_for_isa_flags (isa_flags
,
12152 /* Only update the assembler .arch string if it is distinct from the last
12153 such string we printed. */
12154 std::string to_print
= this_arch
->name
+ extension
;
12155 if (to_print
!= aarch64_last_printed_arch_string
)
12157 asm_fprintf (asm_out_file
, "\t.arch %s\n", to_print
.c_str ());
12158 aarch64_last_printed_arch_string
= to_print
;
12161 /* Print the cpu name we're tuning for in the comments, might be
12162 useful to readers of the generated asm. Do it only when it changes
12163 from function to function and verbose assembly is requested. */
12164 const struct processor
*this_tune
12165 = aarch64_get_tune_cpu (targ_options
->x_explicit_tune_core
);
12167 if (flag_debug_asm
&& aarch64_last_printed_tune_string
!= this_tune
->name
)
12169 asm_fprintf (asm_out_file
, "\t" ASM_COMMENT_START
".tune %s\n",
12171 aarch64_last_printed_tune_string
= this_tune
->name
;
12174 /* Don't forget the type directive for ELF. */
12175 ASM_OUTPUT_TYPE_DIRECTIVE (stream
, name
, "function");
12176 ASM_OUTPUT_LABEL (stream
, name
);
12179 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
12182 aarch64_start_file (void)
12184 struct cl_target_option
*default_options
12185 = TREE_TARGET_OPTION (target_option_default_node
);
12187 const struct processor
*default_arch
12188 = aarch64_get_arch (default_options
->x_explicit_arch
);
12189 unsigned long default_isa_flags
= default_options
->x_aarch64_isa_flags
;
12190 std::string extension
12191 = aarch64_get_extension_string_for_isa_flags (default_isa_flags
,
12192 default_arch
->flags
);
12194 aarch64_last_printed_arch_string
= default_arch
->name
+ extension
;
12195 aarch64_last_printed_tune_string
= "";
12196 asm_fprintf (asm_out_file
, "\t.arch %s\n",
12197 aarch64_last_printed_arch_string
.c_str ());
12199 default_file_start ();
12202 /* Emit load exclusive. */
12205 aarch64_emit_load_exclusive (machine_mode mode
, rtx rval
,
12206 rtx mem
, rtx model_rtx
)
12208 rtx (*gen
) (rtx
, rtx
, rtx
);
12212 case QImode
: gen
= gen_aarch64_load_exclusiveqi
; break;
12213 case HImode
: gen
= gen_aarch64_load_exclusivehi
; break;
12214 case SImode
: gen
= gen_aarch64_load_exclusivesi
; break;
12215 case DImode
: gen
= gen_aarch64_load_exclusivedi
; break;
12217 gcc_unreachable ();
12220 emit_insn (gen (rval
, mem
, model_rtx
));
12223 /* Emit store exclusive. */
12226 aarch64_emit_store_exclusive (machine_mode mode
, rtx bval
,
12227 rtx rval
, rtx mem
, rtx model_rtx
)
12229 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
12233 case QImode
: gen
= gen_aarch64_store_exclusiveqi
; break;
12234 case HImode
: gen
= gen_aarch64_store_exclusivehi
; break;
12235 case SImode
: gen
= gen_aarch64_store_exclusivesi
; break;
12236 case DImode
: gen
= gen_aarch64_store_exclusivedi
; break;
12238 gcc_unreachable ();
12241 emit_insn (gen (bval
, rval
, mem
, model_rtx
));
12244 /* Mark the previous jump instruction as unlikely. */
12247 aarch64_emit_unlikely_jump (rtx insn
)
12249 rtx_insn
*jump
= emit_jump_insn (insn
);
12250 add_reg_br_prob_note (jump
, profile_probability::very_unlikely ());
12253 /* Expand a compare and swap pattern. */
12256 aarch64_expand_compare_and_swap (rtx operands
[])
12258 rtx bval
, rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
, x
;
12259 machine_mode mode
, cmp_mode
;
12260 typedef rtx (*gen_cas_fn
) (rtx
, rtx
, rtx
, rtx
, rtx
, rtx
, rtx
);
12263 const gen_cas_fn split_cas
[] =
12265 gen_aarch64_compare_and_swapqi
,
12266 gen_aarch64_compare_and_swaphi
,
12267 gen_aarch64_compare_and_swapsi
,
12268 gen_aarch64_compare_and_swapdi
12270 const gen_cas_fn atomic_cas
[] =
12272 gen_aarch64_compare_and_swapqi_lse
,
12273 gen_aarch64_compare_and_swaphi_lse
,
12274 gen_aarch64_compare_and_swapsi_lse
,
12275 gen_aarch64_compare_and_swapdi_lse
12278 bval
= operands
[0];
12279 rval
= operands
[1];
12281 oldval
= operands
[3];
12282 newval
= operands
[4];
12283 is_weak
= operands
[5];
12284 mod_s
= operands
[6];
12285 mod_f
= operands
[7];
12286 mode
= GET_MODE (mem
);
12289 /* Normally the succ memory model must be stronger than fail, but in the
12290 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
12291 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
12293 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f
)))
12294 && is_mm_release (memmodel_from_int (INTVAL (mod_s
))))
12295 mod_s
= GEN_INT (MEMMODEL_ACQ_REL
);
12301 /* For short modes, we're going to perform the comparison in SImode,
12302 so do the zero-extension now. */
12304 rval
= gen_reg_rtx (SImode
);
12305 oldval
= convert_modes (SImode
, mode
, oldval
, true);
12306 /* Fall through. */
12310 /* Force the value into a register if needed. */
12311 if (!aarch64_plus_operand (oldval
, mode
))
12312 oldval
= force_reg (cmp_mode
, oldval
);
12316 gcc_unreachable ();
12321 case QImode
: idx
= 0; break;
12322 case HImode
: idx
= 1; break;
12323 case SImode
: idx
= 2; break;
12324 case DImode
: idx
= 3; break;
12326 gcc_unreachable ();
12329 gen
= atomic_cas
[idx
];
12331 gen
= split_cas
[idx
];
12333 emit_insn (gen (rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
));
12335 if (mode
== QImode
|| mode
== HImode
)
12336 emit_move_insn (operands
[1], gen_lowpart (mode
, rval
));
12338 x
= gen_rtx_REG (CCmode
, CC_REGNUM
);
12339 x
= gen_rtx_EQ (SImode
, x
, const0_rtx
);
12340 emit_insn (gen_rtx_SET (bval
, x
));
12343 /* Test whether the target supports using a atomic load-operate instruction.
12344 CODE is the operation and AFTER is TRUE if the data in memory after the
12345 operation should be returned and FALSE if the data before the operation
12346 should be returned. Returns FALSE if the operation isn't supported by the
12350 aarch64_atomic_ldop_supported_p (enum rtx_code code
)
12369 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
12370 sequence implementing an atomic operation. */
/* NOTE(review): this dump has dropped brace/blank lines; the comments below
   annotate the surviving statement stream.  */
12373 aarch64_emit_post_barrier (enum memmodel model
)
/* Strip any __sync flag bits so the base ordering can be compared directly.  */
12375 const enum memmodel base_model
= memmodel_base (model
);
/* Only the legacy __sync models whose base ordering includes acquire
   semantics need a trailing barrier; it is emitted as a full SEQ_CST
   memory fence.  */
12377 if (is_mm_sync (model
)
12378 && (base_model
== MEMMODEL_ACQUIRE
12379 || base_model
== MEMMODEL_ACQ_REL
12380 || base_model
== MEMMODEL_SEQ_CST
))
12382 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST
)));
12386 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
12387 for the data in memory. EXPECTED is the value expected to be in memory.
12388 DESIRED is the value to store to memory. MEM is the memory location. MODEL
12389 is the memory ordering to use. */
12392 aarch64_gen_atomic_cas (rtx rval
, rtx mem
,
12393 rtx expected
, rtx desired
,
/* Expander for the mode-specific atomic CAS pattern.  */
12396 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
12399 mode
= GET_MODE (mem
);
/* Select the CAS expander that matches the access width of MEM; any other
   mode is unreachable (switch head dropped by extraction).  */
12403 case QImode
: gen
= gen_aarch64_atomic_casqi
; break;
12404 case HImode
: gen
= gen_aarch64_atomic_cashi
; break;
12405 case SImode
: gen
= gen_aarch64_atomic_cassi
; break;
12406 case DImode
: gen
= gen_aarch64_atomic_casdi
; break;
12408 gcc_unreachable ();
12411 /* Move the expected value into the CAS destination register. */
12412 emit_insn (gen_rtx_SET (rval
, expected
));
12414 /* Emit the CAS. */
12415 emit_insn (gen (rval
, mem
, desired
, model
));
12417 /* Compare the expected value with the value loaded by the CAS, to establish
12418 whether the swap was made. */
/* RVAL now holds the value loaded from memory; this sets the condition
   flags (EQ iff the swap happened).  */
12419 aarch64_gen_compare_reg (EQ
, rval
, expected
);
12422 /* Split a compare and swap pattern. */
/* NOTE(review): the extraction dropped brace, 'else', and some statement
   lines from this function; comments below annotate the surviving stream.
   Operand layout (from the assignments below): operands[0] = rval,
   operands[2] = oldval, operands[3] = newval, operands[4] = is_weak flag,
   operands[5] = memory model, operands[7] = scratch.  Presumably
   operands[1] is MEM — that assignment was dropped; confirm upstream.  */
12425 aarch64_split_compare_and_swap (rtx operands
[])
12427 rtx rval
, mem
, oldval
, newval
, scratch
;
12430 rtx_code_label
*label1
, *label2
;
12432 enum memmodel model
;
12435 rval
= operands
[0];
12437 oldval
= operands
[2];
12438 newval
= operands
[3];
12439 is_weak
= (operands
[4] != const0_rtx
);
12440 model_rtx
= operands
[5];
12441 scratch
= operands
[7];
12442 mode
= GET_MODE (mem
);
12443 model
= memmodel_from_int (INTVAL (model_rtx
));
12445 /* When OLDVAL is zero and we want the strong version we can emit a tighter
12448 LD[A]XR rval, [mem]
12450 ST[L]XR scratch, newval, [mem]
12451 CBNZ scratch, .label1
12454 bool strong_zero_p
= !is_weak
&& oldval
== const0_rtx
;
/* label1 = head of the load/store-exclusive retry loop;
   label2 = exit of the sequence.  */
12459 label1
= gen_label_rtx ();
12460 emit_label (label1
);
12462 label2
= gen_label_rtx ();
12464 /* The initial load can be relaxed for a __sync operation since a final
12465 barrier will be emitted to stop code hoisting. */
12466 if (is_mm_sync (model
))
12467 aarch64_emit_load_exclusive (mode
, rval
, mem
,
12468 GEN_INT (MEMMODEL_RELAXED
));
12470 aarch64_emit_load_exclusive (mode
, rval
, mem
, model_rtx
);
/* strong_zero_p fast path: branch straight to the exit when the loaded
   value is non-zero (maps to CBNZ) — NOTE(review): the enclosing
   if (strong_zero_p) / else was dropped by extraction.  */
12474 x
= gen_rtx_NE (VOIDmode
, rval
, const0_rtx
);
12475 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
12476 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
12477 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
/* General path: compare the loaded value against OLDVAL and exit the
   loop when they differ.  */
12481 cond
= aarch64_gen_compare_reg (NE
, rval
, oldval
);
12482 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
12483 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
12484 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
12485 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
/* Try to store NEWVAL; SCRATCH is the store-exclusive status (non-zero
   means the exclusive monitor was lost and we must retry).  */
12488 aarch64_emit_store_exclusive (mode
, scratch
, mem
, newval
, model_rtx
);
/* Weak CAS: retry-branch back to label1 — NOTE(review): the is_weak
   if/else structure around these alternatives was dropped; verify
   against the full source.  */
12492 x
= gen_rtx_NE (VOIDmode
, scratch
, const0_rtx
);
12493 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
12494 gen_rtx_LABEL_REF (Pmode
, label1
), pc_rtx
);
12495 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
/* Alternative: expose the store-exclusive status in the CC register.  */
12499 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
12500 x
= gen_rtx_COMPARE (CCmode
, scratch
, const0_rtx
);
12501 emit_insn (gen_rtx_SET (cond
, x
));
12504 emit_label (label2
);
12505 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
12506 to set the condition flags. If this is not used it will be removed by
12510 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
12511 x
= gen_rtx_COMPARE (CCmode
, rval
, const0_rtx
);
12512 emit_insn (gen_rtx_SET (cond
, x
));
12514 /* Emit any final barrier needed for a __sync operation. */
12515 if (is_mm_sync (model
))
12516 aarch64_emit_post_barrier (model
);
12519 /* Emit a BIC instruction. */
/* Emits DST = S1 & ~(S2 >> SHIFT) — NOTE(review): operand roles inferred
   from the and_one_cmpl_lshr pattern name and the call below (dst, s2,
   shift, s1); confirm against aarch64.md.  */
12522 aarch64_emit_bic (machine_mode mode
, rtx dst
, rtx s1
, rtx s2
, int shift
)
12524 rtx shift_rtx
= GEN_INT (shift
);
12525 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
/* Pick the SI/DI variant of the AND-with-complement-of-shift pattern;
   other modes are unreachable (switch head dropped by extraction).  */
12529 case SImode
: gen
= gen_and_one_cmpl_lshrsi3
; break;
12530 case DImode
: gen
= gen_and_one_cmpl_lshrdi3
; break;
12532 gcc_unreachable ();
12535 emit_insn (gen (dst
, s2
, shift_rtx
, s1
));
12538 /* Emit an atomic swap. */
/* DST receives the previous memory contents; VALUE is stored to MEM with
   ordering MODEL, via the mode-specific atomic SWP pattern.  */
12541 aarch64_emit_atomic_swap (machine_mode mode
, rtx dst
, rtx value
,
12542 rtx mem
, rtx model
)
12544 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
/* Select the swap expander matching the access width; other modes are
   unreachable (switch head dropped by extraction).  */
12548 case QImode
: gen
= gen_aarch64_atomic_swpqi
; break;
12549 case HImode
: gen
= gen_aarch64_atomic_swphi
; break;
12550 case SImode
: gen
= gen_aarch64_atomic_swpsi
; break;
12551 case DImode
: gen
= gen_aarch64_atomic_swpdi
; break;
12553 gcc_unreachable ();
12556 emit_insn (gen (dst
, mem
, value
, model
));
12559 /* Operations supported by aarch64_emit_atomic_load_op. */
/* Each enumerator selects one of the expander tables in
   aarch64_emit_atomic_load_op below (loadadd / loadeor / loadset /
   loadclr respectively).  */
12561 enum aarch64_atomic_load_op_code
12563 AARCH64_LDOP_PLUS
, /* A + B */
12564 AARCH64_LDOP_XOR
, /* A ^ B */
12565 AARCH64_LDOP_OR
, /* A | B */
12566 AARCH64_LDOP_BIC
/* A & ~B */
12569 /* Emit an atomic load-operate. */
/* Emits one atomic load-operate instruction: DST gets the old memory value,
   MEM is updated with (old CODE SRC), using memory ordering MODEL.  */
12572 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code
,
12573 machine_mode mode
, rtx dst
, rtx src
,
12574 rtx mem
, rtx model
)
12576 typedef rtx (*aarch64_atomic_load_op_fn
) (rtx
, rtx
, rtx
, rtx
);
/* Expander tables, indexed 0..3 for QI/HI/SI/DI widths.  */
12577 const aarch64_atomic_load_op_fn plus
[] =
12579 gen_aarch64_atomic_loadaddqi
,
12580 gen_aarch64_atomic_loadaddhi
,
12581 gen_aarch64_atomic_loadaddsi
,
12582 gen_aarch64_atomic_loadadddi
12584 const aarch64_atomic_load_op_fn eor
[] =
12586 gen_aarch64_atomic_loadeorqi
,
12587 gen_aarch64_atomic_loadeorhi
,
12588 gen_aarch64_atomic_loadeorsi
,
12589 gen_aarch64_atomic_loadeordi
12591 const aarch64_atomic_load_op_fn ior
[] =
12593 gen_aarch64_atomic_loadsetqi
,
12594 gen_aarch64_atomic_loadsethi
,
12595 gen_aarch64_atomic_loadsetsi
,
12596 gen_aarch64_atomic_loadsetdi
12598 const aarch64_atomic_load_op_fn bic
[] =
12600 gen_aarch64_atomic_loadclrqi
,
12601 gen_aarch64_atomic_loadclrhi
,
12602 gen_aarch64_atomic_loadclrsi
,
12603 gen_aarch64_atomic_loadclrdi
12605 aarch64_atomic_load_op_fn gen
;
/* Map MEM's mode to a table index (switch head dropped by extraction).  */
12610 case QImode
: idx
= 0; break;
12611 case HImode
: idx
= 1; break;
12612 case SImode
: idx
= 2; break;
12613 case DImode
: idx
= 3; break;
12615 gcc_unreachable ();
/* Pick the expander for the requested operation.  */
12620 case AARCH64_LDOP_PLUS
: gen
= plus
[idx
]; break;
12621 case AARCH64_LDOP_XOR
: gen
= eor
[idx
]; break;
12622 case AARCH64_LDOP_OR
: gen
= ior
[idx
]; break;
12623 case AARCH64_LDOP_BIC
: gen
= bic
[idx
]; break;
12625 gcc_unreachable ();
12628 emit_insn (gen (dst
, mem
, src
, model
));
12631 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
12632 location to store the data read from memory. OUT_RESULT is the location to
12633 store the result of the operation. MEM is the memory location to read and
12634 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
12635 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
12639 aarch64_gen_atomic_ldop (enum rtx_code code
, rtx out_data
, rtx out_result
,
12640 rtx mem
, rtx value
, rtx model_rtx
)
12642 machine_mode mode
= GET_MODE (mem
);
12643 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
12644 const bool short_mode
= (mode
< SImode
);
12645 aarch64_atomic_load_op_code ldop_code
;
12650 out_data
= gen_lowpart (mode
, out_data
);
12653 out_result
= gen_lowpart (mode
, out_result
);
12655 /* Make sure the value is in a register, putting it into a destination
12656 register if it needs to be manipulated. */
12657 if (!register_operand (value
, mode
)
12658 || code
== AND
|| code
== MINUS
)
12660 src
= out_result
? out_result
: out_data
;
12661 emit_move_insn (src
, gen_lowpart (mode
, value
));
12665 gcc_assert (register_operand (src
, mode
));
12667 /* Preprocess the data for the operation as necessary. If the operation is
12668 a SET then emit a swap instruction and finish. */
12672 aarch64_emit_atomic_swap (mode
, out_data
, src
, mem
, model_rtx
);
12676 /* Negate the value and treat it as a PLUS. */
12680 /* Resize the value if necessary. */
12682 src
= gen_lowpart (wmode
, src
);
12684 neg_src
= gen_rtx_NEG (wmode
, src
);
12685 emit_insn (gen_rtx_SET (src
, neg_src
));
12688 src
= gen_lowpart (mode
, src
);
12690 /* Fall-through. */
12692 ldop_code
= AARCH64_LDOP_PLUS
;
12696 ldop_code
= AARCH64_LDOP_OR
;
12700 ldop_code
= AARCH64_LDOP_XOR
;
12707 /* Resize the value if necessary. */
12709 src
= gen_lowpart (wmode
, src
);
12711 not_src
= gen_rtx_NOT (wmode
, src
);
12712 emit_insn (gen_rtx_SET (src
, not_src
));
12715 src
= gen_lowpart (mode
, src
);
12717 ldop_code
= AARCH64_LDOP_BIC
;
12721 /* The operation can't be done with atomic instructions. */
12722 gcc_unreachable ();
12725 aarch64_emit_atomic_load_op (ldop_code
, mode
, out_data
, src
, mem
, model_rtx
);
12727 /* If necessary, calculate the data in memory after the update by redoing the
12728 operation from values in registers. */
12734 src
= gen_lowpart (wmode
, src
);
12735 out_data
= gen_lowpart (wmode
, out_data
);
12736 out_result
= gen_lowpart (wmode
, out_result
);
12745 x
= gen_rtx_PLUS (wmode
, out_data
, src
);
12748 x
= gen_rtx_IOR (wmode
, out_data
, src
);
12751 x
= gen_rtx_XOR (wmode
, out_data
, src
);
12754 aarch64_emit_bic (wmode
, out_result
, out_data
, src
, 0);
12757 gcc_unreachable ();
12760 emit_set_insn (out_result
, x
);
12765 /* Split an atomic operation. */
/* Splits an atomic read-modify-write into a load-exclusive /
   compute / store-exclusive retry loop.  OLD_OUT receives the value read
   from MEM, NEW_OUT the computed value, COND the store-exclusive status.
   NOTE(review): the extraction dropped this function's switch (code)
   skeleton (case labels and braces); comments mark the inferred cases.  */
12768 aarch64_split_atomic_op (enum rtx_code code
, rtx old_out
, rtx new_out
, rtx mem
,
12769 rtx value
, rtx model_rtx
, rtx cond
)
12771 machine_mode mode
= GET_MODE (mem
);
/* Arithmetic on sub-word values is done in SImode.  */
12772 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
12773 const enum memmodel model
= memmodel_from_int (INTVAL (model_rtx
));
12774 const bool is_sync
= is_mm_sync (model
);
12775 rtx_code_label
*label
;
12778 /* Split the atomic operation into a sequence. */
12779 label
= gen_label_rtx ();
12780 emit_label (label
);
/* Widen the in/out registers (and VALUE) to the working mode.  */
12783 new_out
= gen_lowpart (wmode
, new_out
);
12785 old_out
= gen_lowpart (wmode
, old_out
);
12788 value
= simplify_gen_subreg (wmode
, value
, mode
, 0);
12790 /* The initial load can be relaxed for a __sync operation since a final
12791 barrier will be emitted to stop code hoisting. */
12793 aarch64_emit_load_exclusive (mode
, old_out
, mem
,
12794 GEN_INT (MEMMODEL_RELAXED
));
12796 aarch64_emit_load_exclusive (mode
, old_out
, mem
, model_rtx
);
/* NAND case: new_out = ~(old_out & value), built as AND then NOT —
   NOTE(review): the enclosing 'case NOT:' label was dropped.  */
12805 x
= gen_rtx_AND (wmode
, old_out
, value
);
12806 emit_insn (gen_rtx_SET (new_out
, x
));
12807 x
= gen_rtx_NOT (wmode
, new_out
);
12808 emit_insn (gen_rtx_SET (new_out
, x
));
/* Subtraction of a constant is turned into addition of its negation —
   NOTE(review): presumably under 'case MINUS:'; label dropped.  */
12812 if (CONST_INT_P (value
))
12814 value
= GEN_INT (-INTVAL (value
));
12817 /* Fall through. */
/* Generic case: new_out = old_out CODE value.  */
12820 x
= gen_rtx_fmt_ee (code
, wmode
, old_out
, value
);
12821 emit_insn (gen_rtx_SET (new_out
, x
));
/* Try to store the result; COND holds the store-exclusive status.  */
12825 aarch64_emit_store_exclusive (mode
, cond
, mem
,
12826 gen_lowpart (mode
, new_out
), model_rtx
);
/* Retry the whole sequence while the exclusive monitor was lost.  */
12828 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
12829 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
12830 gen_rtx_LABEL_REF (Pmode
, label
), pc_rtx
);
12831 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
12833 /* Emit any final barrier needed for a __sync operation. */
12835 aarch64_emit_post_barrier (model
);
/* Target hook: register AArch64-specific soft-float library routines.  */
12839 aarch64_init_libfuncs (void)
12841 /* Half-precision float operations. The compiler handles all operations
12842 with NULL libfuncs by converting to SFmode. */
/* Conversions between HFmode and SFmode use the __gnu_* IEEE helpers.  */
12845 set_conv_libfunc (trunc_optab
, HFmode
, SFmode
, "__gnu_f2h_ieee");
12846 set_conv_libfunc (sext_optab
, SFmode
, HFmode
, "__gnu_h2f_ieee");
/* Arithmetic: NULL libfunc means "widen to SFmode and operate there".  */
12849 set_optab_libfunc (add_optab
, HFmode
, NULL
);
12850 set_optab_libfunc (sdiv_optab
, HFmode
, NULL
);
12851 set_optab_libfunc (smul_optab
, HFmode
, NULL
);
12852 set_optab_libfunc (neg_optab
, HFmode
, NULL
);
12853 set_optab_libfunc (sub_optab
, HFmode
, NULL
);
/* Comparisons: likewise handled by widening to SFmode.  */
12856 set_optab_libfunc (eq_optab
, HFmode
, NULL
);
12857 set_optab_libfunc (ne_optab
, HFmode
, NULL
);
12858 set_optab_libfunc (lt_optab
, HFmode
, NULL
);
12859 set_optab_libfunc (le_optab
, HFmode
, NULL
);
12860 set_optab_libfunc (ge_optab
, HFmode
, NULL
);
12861 set_optab_libfunc (gt_optab
, HFmode
, NULL
);
12862 set_optab_libfunc (unord_optab
, HFmode
, NULL
);
12865 /* Target hook for c_mode_for_suffix. */
12866 static machine_mode
12867 aarch64_c_mode_for_suffix (char suffix
)
12875 /* We can only represent floating point constants which will fit in
12876 "quarter-precision" values. These values are characterised by
12877 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
12880 (-1)^s * (n/16) * 2^r
12883 's' is the sign bit.
12884 'n' is an integer in the range 16 <= n <= 31.
12885 'r' is an integer in the range -3 <= r <= 4. */
12887 /* Return true iff X can be represented by a quarter-precision
12888 floating point immediate operand X. Note, we cannot represent 0.0. */
12890 aarch64_float_const_representable_p (rtx x
)
12892 /* This represents our current view of how many bits
12893 make up the mantissa. */
12894 int point_pos
= 2 * HOST_BITS_PER_WIDE_INT
- 1;
12896 unsigned HOST_WIDE_INT mantissa
, mask
;
12897 REAL_VALUE_TYPE r
, m
;
/* Only CONST_DOUBLE constants can qualify.  */
12900 if (!CONST_DOUBLE_P (x
))
12903 /* We don't support HFmode constants yet. */
12904 if (GET_MODE (x
) == VOIDmode
|| GET_MODE (x
) == HFmode
)
12907 r
= *CONST_DOUBLE_REAL_VALUE (x
);
12909 /* We cannot represent infinities, NaNs or +/-zero. We won't
12910 know if we have +zero until we analyse the mantissa, but we
12911 can reject the other invalid values. */
12912 if (REAL_VALUE_ISINF (r
) || REAL_VALUE_ISNAN (r
)
12913 || REAL_VALUE_MINUS_ZERO (r
))
12916 /* Extract exponent. */
12917 r
= real_value_abs (&r
);
12918 exponent
= REAL_EXP (&r
);
12920 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12921 highest (sign) bit, with a fixed binary point at bit point_pos.
12922 m1 holds the low part of the mantissa, m2 the high part.
12923 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12924 bits for the mantissa, this can fail (low bits will be lost). */
12925 real_ldexp (&m
, &r
, point_pos
- exponent
);
12926 wide_int w
= real_to_integer (&m
, &fail
, HOST_BITS_PER_WIDE_INT
* 2);
12928 /* If the low part of the mantissa has bits set we cannot represent
12930 if (w
.ulow () != 0)
12932 /* We have rejected the lower HOST_WIDE_INT, so update our
12933 understanding of how many bits lie in the mantissa and
12934 look only at the high HOST_WIDE_INT. */
12935 mantissa
= w
.elt (1);
12936 point_pos
-= HOST_BITS_PER_WIDE_INT
;
12938 /* We can only represent values with a mantissa of the form 1.xxxx. */
12939 mask
= ((unsigned HOST_WIDE_INT
)1 << (point_pos
- 5)) - 1;
12940 if ((mantissa
& mask
) != 0)
12943 /* Having filtered unrepresentable values, we may now remove all
12944 but the highest 5 bits. */
12945 mantissa
>>= point_pos
- 5;
12947 /* We cannot represent the value 0.0, so reject it. This is handled
12952 /* Then, as bit 4 is always set, we can mask it off, leaving
12953 the mantissa in the range [0, 15]. */
12954 mantissa
&= ~(1 << 4);
12955 gcc_assert (mantissa
<= 15);
12957 /* GCC internally does not use IEEE754-like encoding (where normalized
12958 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
12959 Our mantissa values are shifted 4 places to the left relative to
12960 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12961 by 5 places to correct for GCC's representation. */
12962 exponent
= 5 - exponent
;
/* Representable iff the corrected exponent fits the 3-bit range
   described in the quarter-precision comment above this function.  */
12964 return (exponent
>= 0 && exponent
<= 7);
12968 aarch64_output_simd_mov_immediate (rtx const_vector
,
12973 static char templ
[40];
12974 const char *mnemonic
;
12975 const char *shift_op
;
12976 unsigned int lane_count
= 0;
12979 struct simd_immediate_info info
= { NULL_RTX
, 0, 0, false, false };
12981 /* This will return true to show const_vector is legal for use as either
12982 a AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
12983 also update INFO to show how the immediate should be generated. */
12984 is_valid
= aarch64_simd_valid_immediate (const_vector
, mode
, false, &info
);
12985 gcc_assert (is_valid
);
12987 element_char
= sizetochar (info
.element_width
);
12988 lane_count
= width
/ info
.element_width
;
12990 mode
= GET_MODE_INNER (mode
);
12991 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
12993 gcc_assert (info
.shift
== 0 && ! info
.mvn
);
12994 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12995 move immediate path. */
12996 if (aarch64_float_const_zero_rtx_p (info
.value
))
12997 info
.value
= GEN_INT (0);
13000 const unsigned int buf_size
= 20;
13001 char float_buf
[buf_size
] = {'\0'};
13002 real_to_decimal_for_mode (float_buf
,
13003 CONST_DOUBLE_REAL_VALUE (info
.value
),
13004 buf_size
, buf_size
, 1, mode
);
13006 if (lane_count
== 1)
13007 snprintf (templ
, sizeof (templ
), "fmov\t%%d0, %s", float_buf
);
13009 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%d%c, %s",
13010 lane_count
, element_char
, float_buf
);
13015 mnemonic
= info
.mvn
? "mvni" : "movi";
13016 shift_op
= info
.msl
? "msl" : "lsl";
13018 gcc_assert (CONST_INT_P (info
.value
));
13019 if (lane_count
== 1)
13020 snprintf (templ
, sizeof (templ
), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX
,
13021 mnemonic
, UINTVAL (info
.value
));
13022 else if (info
.shift
)
13023 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
13024 ", %s %d", mnemonic
, lane_count
, element_char
,
13025 UINTVAL (info
.value
), shift_op
, info
.shift
);
13027 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
,
13028 mnemonic
, lane_count
, element_char
, UINTVAL (info
.value
));
/* Return the assembly template for moving IMMEDIATE into a scalar SIMD
   register of mode MODE, by duplicating it into a vector and reusing the
   vector immediate-move output routine.  */
13033 aarch64_output_scalar_simd_mov_immediate (rtx immediate
, machine_mode mode
)
13036 /* If a floating point number was passed and we desire to use it in an
13037 integer mode do the conversion to integer. */
13038 if (CONST_DOUBLE_P (immediate
) && GET_MODE_CLASS (mode
) == MODE_INT
)
13040 unsigned HOST_WIDE_INT ival
;
/* Reinterpret the FP bit pattern as an integer; failure is a bug.  */
13041 if (!aarch64_reinterpret_float_as_int (immediate
, &ival
))
13042 gcc_unreachable ();
13043 immediate
= gen_int_mode (ival
, mode
);
13046 machine_mode vmode
;
13047 /* use a 64 bit mode for everything except for DI/DF mode, where we use
13048 a 128 bit vector mode. */
13049 int width
= GET_MODE_BITSIZE (mode
) == 64 ? 128 : 64;
13051 gcc_assert (!VECTOR_MODE_P (mode
));
13052 vmode
= aarch64_simd_container_mode (mode
, width
);
/* Broadcast the scalar into a constant vector, then emit it as a
   vector immediate move.  */
13053 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (immediate
));
13054 return aarch64_output_simd_mov_immediate (v_op
, vmode
, width
);
13057 /* Split operands into moves from op[1] + op[2] into op[0]. */
/* NOTE(review): brace lines were dropped by extraction; comments below
   annotate the surviving statement stream.  */
13060 aarch64_split_combinev16qi (rtx operands
[3])
13062 unsigned int dest
= REGNO (operands
[0]);
13063 unsigned int src1
= REGNO (operands
[1]);
13064 unsigned int src2
= REGNO (operands
[2]);
13065 machine_mode halfmode
= GET_MODE (operands
[1]);
13066 unsigned int halfregs
= HARD_REGNO_NREGS (src1
, halfmode
);
13067 rtx destlo
, desthi
;
13069 gcc_assert (halfmode
== V16QImode
);
/* Sources already occupy the destination's low and high halves.  */
13071 if (src1
== dest
&& src2
== dest
+ halfregs
)
13073 /* No-op move. Can't split to nothing; emit something. */
13074 emit_note (NOTE_INSN_DELETED
);
13078 /* Preserve register attributes for variable tracking. */
13079 destlo
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
, 0);
13080 desthi
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
+ halfregs
,
13081 GET_MODE_SIZE (halfmode
));
13083 /* Special case of reversed high/low parts. */
13084 if (reg_overlap_mentioned_p (operands
[2], destlo
)
13085 && reg_overlap_mentioned_p (operands
[1], desthi
))
/* Both halves collide: swap them in place with the classic
   three-XOR exchange, avoiding a temporary register.  */
13087 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
13088 emit_insn (gen_xorv16qi3 (operands
[2], operands
[1], operands
[2]));
13089 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
13091 else if (!reg_overlap_mentioned_p (operands
[2], destlo
))
13093 /* Try to avoid unnecessary moves if part of the result
13094 is in the right place already. */
/* Safe to fill the low half first; skip moves that are no-ops.  */
13096 emit_move_insn (destlo
, operands
[1]);
13097 if (src2
!= dest
+ halfregs
)
13098 emit_move_insn (desthi
, operands
[2]);
/* Otherwise fill the high half first so the low move cannot clobber
   operands[2] before it is consumed.  */
13102 if (src2
!= dest
+ halfregs
)
13103 emit_move_insn (desthi
, operands
[2]);
13105 emit_move_insn (destlo
, operands
[1]);
13109 /* vec_perm support. */
13111 #define MAX_VECT_LEN 16
/* Describes one vector-permute request: TARGET = permute (OP0, OP1)
   according to the byte/element indices in PERM (NELT of them), with
   vector mode VMODE.  (Brace lines and any trailing flag fields were
   dropped by extraction.)  */
13113 struct expand_vec_perm_d
13115 rtx target
, op0
, op1
;
/* Element selection indices, one per result element.  */
13116 unsigned char perm
[MAX_VECT_LEN
];
13117 machine_mode vmode
;
/* Number of elements in VMODE.  */
13118 unsigned char nelt
;
13123 /* Generate a variable permutation. */
/* Emits a TBL-based permute: TARGET = table-lookup of OP0/OP1 using the
   per-byte indices in SEL.  Only the byte-vector modes are handled here.
   NOTE(review): brace/else lines were dropped by extraction.  */
13126 aarch64_expand_vec_perm_1 (rtx target
, rtx op0
, rtx op1
, rtx sel
)
13128 machine_mode vmode
= GET_MODE (target
);
13129 bool one_vector_p
= rtx_equal_p (op0
, op1
);
13131 gcc_checking_assert (vmode
== V8QImode
|| vmode
== V16QImode
);
13132 gcc_checking_assert (GET_MODE (op0
) == vmode
);
13133 gcc_checking_assert (GET_MODE (op1
) == vmode
);
13134 gcc_checking_assert (GET_MODE (sel
) == vmode
);
13135 gcc_checking_assert (TARGET_SIMD
);
/* Single-input permute: one TBL over OP0 (duplicated for V8QI).  */
13139 if (vmode
== V8QImode
)
13141 /* Expand the argument to a V16QI mode by duplicating it. */
13142 rtx pair
= gen_reg_rtx (V16QImode
);
13143 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op0
));
13144 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
13148 emit_insn (gen_aarch64_tbl1v16qi (target
, op0
, sel
));
/* Two-input permute: combine the operands, then TBL1 (64-bit vectors)
   or TBL2 over an OImode register pair (128-bit vectors).  */
13155 if (vmode
== V8QImode
)
13157 pair
= gen_reg_rtx (V16QImode
);
13158 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op1
));
13159 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
13163 pair
= gen_reg_rtx (OImode
);
13164 emit_insn (gen_aarch64_combinev16qi (pair
, op0
, op1
));
13165 emit_insn (gen_aarch64_tbl2v16qi (target
, pair
, sel
));
/* Expand a general vector permute TARGET = perm (OP0, OP1, SEL),
   normalizing SEL before handing off to aarch64_expand_vec_perm_1.  */
13171 aarch64_expand_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
)
13173 machine_mode vmode
= GET_MODE (target
);
13174 unsigned int nelt
= GET_MODE_NUNITS (vmode
);
13175 bool one_vector_p
= rtx_equal_p (op0
, op1
);
13178 /* The TBL instruction does not use a modulo index, so we must take care
13179 of that ourselves. */
/* Mask each index into range: nelt-1 for one input, 2*nelt-1 for two.  */
13180 mask
= aarch64_simd_gen_const_vector_dup (vmode
,
13181 one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
13182 sel
= expand_simple_binop (vmode
, AND
, sel
, mask
, NULL
, 0, OPTAB_LIB_WIDEN
);
13184 /* For big-endian, we also need to reverse the index within the vector
13185 (but not which vector). */
13186 if (BYTES_BIG_ENDIAN
)
13188 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
/* XOR with nelt-1 flips the element position inside each vector while
   preserving the which-vector bit.  */
13190 mask
= aarch64_simd_gen_const_vector_dup (vmode
, nelt
- 1);
13191 sel
= expand_simple_binop (vmode
, XOR
, sel
, mask
,
13192 NULL
, 0, OPTAB_LIB_WIDEN
);
13194 aarch64_expand_vec_perm_1 (target
, op0
, op1
, sel
);
13197 /* Recognize patterns suitable for the TRN instructions. */
13199 aarch64_evpc_trn (struct expand_vec_perm_d
*d
)
13201 unsigned int i
, odd
, mask
, nelt
= d
->nelt
;
13202 rtx out
, in0
, in1
, x
;
13203 rtx (*gen
) (rtx
, rtx
, rtx
);
13204 machine_mode vmode
= d
->vmode
;
13206 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
13209 /* Note that these are little-endian tests.
13210 We correct for big-endian later. */
13211 if (d
->perm
[0] == 0)
13213 else if (d
->perm
[0] == 1)
13217 mask
= (d
->one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
13219 for (i
= 0; i
< nelt
; i
+= 2)
13221 if (d
->perm
[i
] != i
+ odd
)
13223 if (d
->perm
[i
+ 1] != ((i
+ nelt
+ odd
) & mask
))
13233 if (BYTES_BIG_ENDIAN
)
13235 x
= in0
, in0
= in1
, in1
= x
;
13244 case V16QImode
: gen
= gen_aarch64_trn2v16qi
; break;
13245 case V8QImode
: gen
= gen_aarch64_trn2v8qi
; break;
13246 case V8HImode
: gen
= gen_aarch64_trn2v8hi
; break;
13247 case V4HImode
: gen
= gen_aarch64_trn2v4hi
; break;
13248 case V4SImode
: gen
= gen_aarch64_trn2v4si
; break;
13249 case V2SImode
: gen
= gen_aarch64_trn2v2si
; break;
13250 case V2DImode
: gen
= gen_aarch64_trn2v2di
; break;
13251 case V4HFmode
: gen
= gen_aarch64_trn2v4hf
; break;
13252 case V8HFmode
: gen
= gen_aarch64_trn2v8hf
; break;
13253 case V4SFmode
: gen
= gen_aarch64_trn2v4sf
; break;
13254 case V2SFmode
: gen
= gen_aarch64_trn2v2sf
; break;
13255 case V2DFmode
: gen
= gen_aarch64_trn2v2df
; break;
13264 case V16QImode
: gen
= gen_aarch64_trn1v16qi
; break;
13265 case V8QImode
: gen
= gen_aarch64_trn1v8qi
; break;
13266 case V8HImode
: gen
= gen_aarch64_trn1v8hi
; break;
13267 case V4HImode
: gen
= gen_aarch64_trn1v4hi
; break;
13268 case V4SImode
: gen
= gen_aarch64_trn1v4si
; break;
13269 case V2SImode
: gen
= gen_aarch64_trn1v2si
; break;
13270 case V2DImode
: gen
= gen_aarch64_trn1v2di
; break;
13271 case V4HFmode
: gen
= gen_aarch64_trn1v4hf
; break;
13272 case V8HFmode
: gen
= gen_aarch64_trn1v8hf
; break;
13273 case V4SFmode
: gen
= gen_aarch64_trn1v4sf
; break;
13274 case V2SFmode
: gen
= gen_aarch64_trn1v2sf
; break;
13275 case V2DFmode
: gen
= gen_aarch64_trn1v2df
; break;
13281 emit_insn (gen (out
, in0
, in1
));
13285 /* Recognize patterns suitable for the UZP instructions. */
13287 aarch64_evpc_uzp (struct expand_vec_perm_d
*d
)
13289 unsigned int i
, odd
, mask
, nelt
= d
->nelt
;
13290 rtx out
, in0
, in1
, x
;
13291 rtx (*gen
) (rtx
, rtx
, rtx
);
13292 machine_mode vmode
= d
->vmode
;
13294 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
13297 /* Note that these are little-endian tests.
13298 We correct for big-endian later. */
13299 if (d
->perm
[0] == 0)
13301 else if (d
->perm
[0] == 1)
13305 mask
= (d
->one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
13307 for (i
= 0; i
< nelt
; i
++)
13309 unsigned elt
= (i
* 2 + odd
) & mask
;
13310 if (d
->perm
[i
] != elt
)
13320 if (BYTES_BIG_ENDIAN
)
13322 x
= in0
, in0
= in1
, in1
= x
;
13331 case V16QImode
: gen
= gen_aarch64_uzp2v16qi
; break;
13332 case V8QImode
: gen
= gen_aarch64_uzp2v8qi
; break;
13333 case V8HImode
: gen
= gen_aarch64_uzp2v8hi
; break;
13334 case V4HImode
: gen
= gen_aarch64_uzp2v4hi
; break;
13335 case V4SImode
: gen
= gen_aarch64_uzp2v4si
; break;
13336 case V2SImode
: gen
= gen_aarch64_uzp2v2si
; break;
13337 case V2DImode
: gen
= gen_aarch64_uzp2v2di
; break;
13338 case V4HFmode
: gen
= gen_aarch64_uzp2v4hf
; break;
13339 case V8HFmode
: gen
= gen_aarch64_uzp2v8hf
; break;
13340 case V4SFmode
: gen
= gen_aarch64_uzp2v4sf
; break;
13341 case V2SFmode
: gen
= gen_aarch64_uzp2v2sf
; break;
13342 case V2DFmode
: gen
= gen_aarch64_uzp2v2df
; break;
13351 case V16QImode
: gen
= gen_aarch64_uzp1v16qi
; break;
13352 case V8QImode
: gen
= gen_aarch64_uzp1v8qi
; break;
13353 case V8HImode
: gen
= gen_aarch64_uzp1v8hi
; break;
13354 case V4HImode
: gen
= gen_aarch64_uzp1v4hi
; break;
13355 case V4SImode
: gen
= gen_aarch64_uzp1v4si
; break;
13356 case V2SImode
: gen
= gen_aarch64_uzp1v2si
; break;
13357 case V2DImode
: gen
= gen_aarch64_uzp1v2di
; break;
13358 case V4HFmode
: gen
= gen_aarch64_uzp1v4hf
; break;
13359 case V8HFmode
: gen
= gen_aarch64_uzp1v8hf
; break;
13360 case V4SFmode
: gen
= gen_aarch64_uzp1v4sf
; break;
13361 case V2SFmode
: gen
= gen_aarch64_uzp1v2sf
; break;
13362 case V2DFmode
: gen
= gen_aarch64_uzp1v2df
; break;
13368 emit_insn (gen (out
, in0
, in1
));
13372 /* Recognize patterns suitable for the ZIP instructions. */
13374 aarch64_evpc_zip (struct expand_vec_perm_d
*d
)
13376 unsigned int i
, high
, mask
, nelt
= d
->nelt
;
13377 rtx out
, in0
, in1
, x
;
13378 rtx (*gen
) (rtx
, rtx
, rtx
);
13379 machine_mode vmode
= d
->vmode
;
13381 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
13384 /* Note that these are little-endian tests.
13385 We correct for big-endian later. */
13387 if (d
->perm
[0] == high
)
13390 else if (d
->perm
[0] == 0)
13394 mask
= (d
->one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
13396 for (i
= 0; i
< nelt
/ 2; i
++)
13398 unsigned elt
= (i
+ high
) & mask
;
13399 if (d
->perm
[i
* 2] != elt
)
13401 elt
= (elt
+ nelt
) & mask
;
13402 if (d
->perm
[i
* 2 + 1] != elt
)
13412 if (BYTES_BIG_ENDIAN
)
13414 x
= in0
, in0
= in1
, in1
= x
;
13423 case V16QImode
: gen
= gen_aarch64_zip2v16qi
; break;
13424 case V8QImode
: gen
= gen_aarch64_zip2v8qi
; break;
13425 case V8HImode
: gen
= gen_aarch64_zip2v8hi
; break;
13426 case V4HImode
: gen
= gen_aarch64_zip2v4hi
; break;
13427 case V4SImode
: gen
= gen_aarch64_zip2v4si
; break;
13428 case V2SImode
: gen
= gen_aarch64_zip2v2si
; break;
13429 case V2DImode
: gen
= gen_aarch64_zip2v2di
; break;
13430 case V4HFmode
: gen
= gen_aarch64_zip2v4hf
; break;
13431 case V8HFmode
: gen
= gen_aarch64_zip2v8hf
; break;
13432 case V4SFmode
: gen
= gen_aarch64_zip2v4sf
; break;
13433 case V2SFmode
: gen
= gen_aarch64_zip2v2sf
; break;
13434 case V2DFmode
: gen
= gen_aarch64_zip2v2df
; break;
13443 case V16QImode
: gen
= gen_aarch64_zip1v16qi
; break;
13444 case V8QImode
: gen
= gen_aarch64_zip1v8qi
; break;
13445 case V8HImode
: gen
= gen_aarch64_zip1v8hi
; break;
13446 case V4HImode
: gen
= gen_aarch64_zip1v4hi
; break;
13447 case V4SImode
: gen
= gen_aarch64_zip1v4si
; break;
13448 case V2SImode
: gen
= gen_aarch64_zip1v2si
; break;
13449 case V2DImode
: gen
= gen_aarch64_zip1v2di
; break;
13450 case V4HFmode
: gen
= gen_aarch64_zip1v4hf
; break;
13451 case V8HFmode
: gen
= gen_aarch64_zip1v8hf
; break;
13452 case V4SFmode
: gen
= gen_aarch64_zip1v4sf
; break;
13453 case V2SFmode
: gen
= gen_aarch64_zip1v2sf
; break;
13454 case V2DFmode
: gen
= gen_aarch64_zip1v2df
; break;
13460 emit_insn (gen (out
, in0
, in1
));
13464 /* Recognize patterns for the EXT insn. */
13467 aarch64_evpc_ext (struct expand_vec_perm_d
*d
)
13469 unsigned int i
, nelt
= d
->nelt
;
13470 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
13473 unsigned int location
= d
->perm
[0]; /* Always < nelt. */
13475 /* Check if the extracted indices are increasing by one. */
13476 for (i
= 1; i
< nelt
; i
++)
13478 unsigned int required
= location
+ i
;
13479 if (d
->one_vector_p
)
13481 /* We'll pass the same vector in twice, so allow indices to wrap. */
13482 required
&= (nelt
- 1);
13484 if (d
->perm
[i
] != required
)
13490 case V16QImode
: gen
= gen_aarch64_extv16qi
; break;
13491 case V8QImode
: gen
= gen_aarch64_extv8qi
; break;
13492 case V4HImode
: gen
= gen_aarch64_extv4hi
; break;
13493 case V8HImode
: gen
= gen_aarch64_extv8hi
; break;
13494 case V2SImode
: gen
= gen_aarch64_extv2si
; break;
13495 case V4SImode
: gen
= gen_aarch64_extv4si
; break;
13496 case V4HFmode
: gen
= gen_aarch64_extv4hf
; break;
13497 case V8HFmode
: gen
= gen_aarch64_extv8hf
; break;
13498 case V2SFmode
: gen
= gen_aarch64_extv2sf
; break;
13499 case V4SFmode
: gen
= gen_aarch64_extv4sf
; break;
13500 case V2DImode
: gen
= gen_aarch64_extv2di
; break;
13501 case V2DFmode
: gen
= gen_aarch64_extv2df
; break;
13510 /* The case where (location == 0) is a no-op for both big- and little-endian,
13511 and is removed by the mid-end at optimization levels -O1 and higher. */
13513 if (BYTES_BIG_ENDIAN
&& (location
!= 0))
13515 /* After setup, we want the high elements of the first vector (stored
13516 at the LSB end of the register), and the low elements of the second
13517 vector (stored at the MSB end of the register). So swap. */
13518 std::swap (d
->op0
, d
->op1
);
13519 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
13520 location
= nelt
- location
;
13523 offset
= GEN_INT (location
);
13524 emit_insn (gen (d
->target
, d
->op0
, d
->op1
, offset
));
13528 /* Recognize patterns for the REV insns. */
13531 aarch64_evpc_rev (struct expand_vec_perm_d
*d
)
13533 unsigned int i
, j
, diff
, nelt
= d
->nelt
;
13534 rtx (*gen
) (rtx
, rtx
);
13536 if (!d
->one_vector_p
)
13545 case V16QImode
: gen
= gen_aarch64_rev64v16qi
; break;
13546 case V8QImode
: gen
= gen_aarch64_rev64v8qi
; break;
13554 case V16QImode
: gen
= gen_aarch64_rev32v16qi
; break;
13555 case V8QImode
: gen
= gen_aarch64_rev32v8qi
; break;
13556 case V8HImode
: gen
= gen_aarch64_rev64v8hi
; break;
13557 case V4HImode
: gen
= gen_aarch64_rev64v4hi
; break;
13565 case V16QImode
: gen
= gen_aarch64_rev16v16qi
; break;
13566 case V8QImode
: gen
= gen_aarch64_rev16v8qi
; break;
13567 case V8HImode
: gen
= gen_aarch64_rev32v8hi
; break;
13568 case V4HImode
: gen
= gen_aarch64_rev32v4hi
; break;
13569 case V4SImode
: gen
= gen_aarch64_rev64v4si
; break;
13570 case V2SImode
: gen
= gen_aarch64_rev64v2si
; break;
13571 case V4SFmode
: gen
= gen_aarch64_rev64v4sf
; break;
13572 case V2SFmode
: gen
= gen_aarch64_rev64v2sf
; break;
13573 case V8HFmode
: gen
= gen_aarch64_rev64v8hf
; break;
13574 case V4HFmode
: gen
= gen_aarch64_rev64v4hf
; break;
13583 for (i
= 0; i
< nelt
; i
+= diff
+ 1)
13584 for (j
= 0; j
<= diff
; j
+= 1)
13586 /* This is guaranteed to be true as the value of diff
13587 is 7, 3, 1 and we should have enough elements in the
13588 queue to generate this. Getting a vector mask with a
13589 value of diff other than these values implies that
13590 something is wrong by the time we get here. */
13591 gcc_assert (i
+ j
< nelt
);
13592 if (d
->perm
[i
+ j
] != i
+ diff
- j
)
13600 emit_insn (gen (d
->target
, d
->op0
));
13605 aarch64_evpc_dup (struct expand_vec_perm_d
*d
)
13607 rtx (*gen
) (rtx
, rtx
, rtx
);
13608 rtx out
= d
->target
;
13610 machine_mode vmode
= d
->vmode
;
13611 unsigned int i
, elt
, nelt
= d
->nelt
;
13615 for (i
= 1; i
< nelt
; i
++)
13617 if (elt
!= d
->perm
[i
])
13621 /* The generic preparation in aarch64_expand_vec_perm_const_1
13622 swaps the operand order and the permute indices if it finds
13623 d->perm[0] to be in the second operand. Thus, we can always
13624 use d->op0 and need not do any extra arithmetic to get the
13625 correct lane number. */
13627 lane
= GEN_INT (elt
); /* The pattern corrects for big-endian. */
13631 case V16QImode
: gen
= gen_aarch64_dup_lanev16qi
; break;
13632 case V8QImode
: gen
= gen_aarch64_dup_lanev8qi
; break;
13633 case V8HImode
: gen
= gen_aarch64_dup_lanev8hi
; break;
13634 case V4HImode
: gen
= gen_aarch64_dup_lanev4hi
; break;
13635 case V4SImode
: gen
= gen_aarch64_dup_lanev4si
; break;
13636 case V2SImode
: gen
= gen_aarch64_dup_lanev2si
; break;
13637 case V2DImode
: gen
= gen_aarch64_dup_lanev2di
; break;
13638 case V8HFmode
: gen
= gen_aarch64_dup_lanev8hf
; break;
13639 case V4HFmode
: gen
= gen_aarch64_dup_lanev4hf
; break;
13640 case V4SFmode
: gen
= gen_aarch64_dup_lanev4sf
; break;
13641 case V2SFmode
: gen
= gen_aarch64_dup_lanev2sf
; break;
13642 case V2DFmode
: gen
= gen_aarch64_dup_lanev2df
; break;
13647 emit_insn (gen (out
, in0
, lane
));
13652 aarch64_evpc_tbl (struct expand_vec_perm_d
*d
)
13654 rtx rperm
[MAX_VECT_LEN
], sel
;
13655 machine_mode vmode
= d
->vmode
;
13656 unsigned int i
, nelt
= d
->nelt
;
13661 /* Generic code will try constant permutation twice. Once with the
13662 original mode and again with the elements lowered to QImode.
13663 So wait and don't do the selector expansion ourselves. */
13664 if (vmode
!= V8QImode
&& vmode
!= V16QImode
)
13667 for (i
= 0; i
< nelt
; ++i
)
13669 int nunits
= GET_MODE_NUNITS (vmode
);
13671 /* If big-endian and two vectors we end up with a weird mixed-endian
13672 mode on NEON. Reverse the index within each word but not the word
13674 rperm
[i
] = GEN_INT (BYTES_BIG_ENDIAN
? d
->perm
[i
] ^ (nunits
- 1)
13677 sel
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
13678 sel
= force_reg (vmode
, sel
);
13680 aarch64_expand_vec_perm_1 (d
->target
, d
->op0
, d
->op1
, sel
);
13685 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d
*d
)
13687 /* The pattern matching functions above are written to look for a small
13688 number to begin the sequence (0, 1, N/2). If we begin with an index
13689 from the second operand, we can swap the operands. */
13690 if (d
->perm
[0] >= d
->nelt
)
13692 unsigned i
, nelt
= d
->nelt
;
13694 gcc_assert (nelt
== (nelt
& -nelt
));
13695 for (i
= 0; i
< nelt
; ++i
)
13696 d
->perm
[i
] ^= nelt
; /* Keep the same index, but in the other vector. */
13698 std::swap (d
->op0
, d
->op1
);
13703 if (aarch64_evpc_rev (d
))
13705 else if (aarch64_evpc_ext (d
))
13707 else if (aarch64_evpc_dup (d
))
13709 else if (aarch64_evpc_zip (d
))
13711 else if (aarch64_evpc_uzp (d
))
13713 else if (aarch64_evpc_trn (d
))
13715 return aarch64_evpc_tbl (d
);
13720 /* Expand a vec_perm_const pattern. */
13723 aarch64_expand_vec_perm_const (rtx target
, rtx op0
, rtx op1
, rtx sel
)
13725 struct expand_vec_perm_d d
;
13726 int i
, nelt
, which
;
13732 d
.vmode
= GET_MODE (target
);
13733 gcc_assert (VECTOR_MODE_P (d
.vmode
));
13734 d
.nelt
= nelt
= GET_MODE_NUNITS (d
.vmode
);
13735 d
.testing_p
= false;
13737 for (i
= which
= 0; i
< nelt
; ++i
)
13739 rtx e
= XVECEXP (sel
, 0, i
);
13740 int ei
= INTVAL (e
) & (2 * nelt
- 1);
13741 which
|= (ei
< nelt
? 1 : 2);
13748 gcc_unreachable ();
13751 d
.one_vector_p
= false;
13752 if (!rtx_equal_p (op0
, op1
))
13755 /* The elements of PERM do not suggest that only the first operand
13756 is used, but both operands are identical. Allow easier matching
13757 of the permutation by folding the permutation into the single
13759 /* Fall Through. */
13761 for (i
= 0; i
< nelt
; ++i
)
13762 d
.perm
[i
] &= nelt
- 1;
13764 d
.one_vector_p
= true;
13769 d
.one_vector_p
= true;
13773 return aarch64_expand_vec_perm_const_1 (&d
);
13777 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode
,
13778 const unsigned char *sel
)
13780 struct expand_vec_perm_d d
;
13781 unsigned int i
, nelt
, which
;
13785 d
.nelt
= nelt
= GET_MODE_NUNITS (d
.vmode
);
13786 d
.testing_p
= true;
13787 memcpy (d
.perm
, sel
, nelt
);
13789 /* Calculate whether all elements are in one vector. */
13790 for (i
= which
= 0; i
< nelt
; ++i
)
13792 unsigned char e
= d
.perm
[i
];
13793 gcc_assert (e
< 2 * nelt
);
13794 which
|= (e
< nelt
? 1 : 2);
13797 /* If all elements are from the second vector, reindex as if from the
13800 for (i
= 0; i
< nelt
; ++i
)
13803 /* Check whether the mask can be applied to a single vector. */
13804 d
.one_vector_p
= (which
!= 3);
13806 d
.target
= gen_raw_REG (d
.vmode
, LAST_VIRTUAL_REGISTER
+ 1);
13807 d
.op1
= d
.op0
= gen_raw_REG (d
.vmode
, LAST_VIRTUAL_REGISTER
+ 2);
13808 if (!d
.one_vector_p
)
13809 d
.op1
= gen_raw_REG (d
.vmode
, LAST_VIRTUAL_REGISTER
+ 3);
13812 ret
= aarch64_expand_vec_perm_const_1 (&d
);
13819 aarch64_reverse_mask (machine_mode mode
)
13821 /* We have to reverse each vector because we dont have
13822 a permuted load that can reverse-load according to ABI rules. */
13824 rtvec v
= rtvec_alloc (16);
13826 int nunits
= GET_MODE_NUNITS (mode
);
13827 int usize
= GET_MODE_UNIT_SIZE (mode
);
13829 gcc_assert (BYTES_BIG_ENDIAN
);
13830 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode
));
13832 for (i
= 0; i
< nunits
; i
++)
13833 for (j
= 0; j
< usize
; j
++)
13834 RTVEC_ELT (v
, i
* usize
+ j
) = GEN_INT ((i
+ 1) * usize
- 1 - j
);
13835 mask
= gen_rtx_CONST_VECTOR (V16QImode
, v
);
13836 return force_reg (V16QImode
, mask
);
13839 /* Implement MODES_TIEABLE_P. In principle we should always return true.
13840 However due to issues with register allocation it is preferable to avoid
13841 tieing integer scalar and FP scalar modes. Executing integer operations
13842 in general registers is better than treating them as scalar vector
13843 operations. This reduces latency and avoids redundant int<->FP moves.
13844 So tie modes if they are either the same class, or vector modes with
13845 other vector modes, vector structs or any scalar mode.
13849 aarch64_modes_tieable_p (machine_mode mode1
, machine_mode mode2
)
13851 if (GET_MODE_CLASS (mode1
) == GET_MODE_CLASS (mode2
))
13854 /* We specifically want to allow elements of "structure" modes to
13855 be tieable to the structure. This more general condition allows
13856 other rarer situations too. */
13857 if (aarch64_vector_mode_p (mode1
) && aarch64_vector_mode_p (mode2
))
13860 /* Also allow any scalar modes with vectors. */
13861 if (aarch64_vector_mode_supported_p (mode1
)
13862 || aarch64_vector_mode_supported_p (mode2
))
13868 /* Return a new RTX holding the result of moving POINTER forward by
13872 aarch64_move_pointer (rtx pointer
, int amount
)
13874 rtx next
= plus_constant (Pmode
, XEXP (pointer
, 0), amount
);
13876 return adjust_automodify_address (pointer
, GET_MODE (pointer
),
13880 /* Return a new RTX holding the result of moving POINTER forward by the
13881 size of the mode it points to. */
13884 aarch64_progress_pointer (rtx pointer
)
13886 HOST_WIDE_INT amount
= GET_MODE_SIZE (GET_MODE (pointer
));
13888 return aarch64_move_pointer (pointer
, amount
);
13891 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13895 aarch64_copy_one_block_and_progress_pointers (rtx
*src
, rtx
*dst
,
13898 rtx reg
= gen_reg_rtx (mode
);
13900 /* "Cast" the pointers to the correct mode. */
13901 *src
= adjust_address (*src
, mode
, 0);
13902 *dst
= adjust_address (*dst
, mode
, 0);
13903 /* Emit the memcpy. */
13904 emit_move_insn (reg
, *src
);
13905 emit_move_insn (*dst
, reg
);
13906 /* Move the pointers forward. */
13907 *src
= aarch64_progress_pointer (*src
);
13908 *dst
= aarch64_progress_pointer (*dst
);
13911 /* Expand movmem, as if from a __builtin_memcpy. Return true if
13912 we succeed, otherwise return false. */
13915 aarch64_expand_movmem (rtx
*operands
)
13918 rtx dst
= operands
[0];
13919 rtx src
= operands
[1];
13921 bool speed_p
= !optimize_function_for_size_p (cfun
);
13923 /* When optimizing for size, give a better estimate of the length of a
13924 memcpy call, but use the default otherwise. */
13925 unsigned int max_instructions
= (speed_p
? 15 : AARCH64_CALL_RATIO
) / 2;
13927 /* We can't do anything smart if the amount to copy is not constant. */
13928 if (!CONST_INT_P (operands
[2]))
13931 n
= UINTVAL (operands
[2]);
13933 /* Try to keep the number of instructions low. For cases below 16 bytes we
13934 need to make at most two moves. For cases above 16 bytes it will be one
13935 move for each 16 byte chunk, then at most two additional moves. */
13936 if (((n
/ 16) + (n
% 16 ? 2 : 0)) > max_instructions
)
13939 base
= copy_to_mode_reg (Pmode
, XEXP (dst
, 0));
13940 dst
= adjust_automodify_address (dst
, VOIDmode
, base
, 0);
13942 base
= copy_to_mode_reg (Pmode
, XEXP (src
, 0));
13943 src
= adjust_automodify_address (src
, VOIDmode
, base
, 0);
13945 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13951 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, HImode
);
13956 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, QImode
);
13961 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
13962 4-byte chunk, partially overlapping with the previously copied chunk. */
13965 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
13971 src
= aarch64_move_pointer (src
, move
);
13972 dst
= aarch64_move_pointer (dst
, move
);
13973 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
13978 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
13979 them, then (if applicable) an 8-byte chunk. */
13984 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, TImode
);
13989 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, DImode
);
13994 /* Finish the final bytes of the copy. We can always do this in one
13995 instruction. We either copy the exact amount we need, or partially
13996 overlap with the previous chunk we copied and copy 8-bytes. */
14000 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, QImode
);
14002 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, HImode
);
14004 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
14009 src
= aarch64_move_pointer (src
, -1);
14010 dst
= aarch64_move_pointer (dst
, -1);
14011 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
14017 src
= aarch64_move_pointer (src
, move
);
14018 dst
= aarch64_move_pointer (dst
, move
);
14019 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, DImode
);
14026 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
14027 SImode stores. Handle the case when the constant has identical
14028 bottom and top halves. This is beneficial when the two stores can be
14029 merged into an STP and we avoid synthesising potentially expensive
14030 immediates twice. Return true if such a split is possible. */
14033 aarch64_split_dimode_const_store (rtx dst
, rtx src
)
14035 rtx lo
= gen_lowpart (SImode
, src
);
14036 rtx hi
= gen_highpart_mode (SImode
, DImode
, src
);
14038 bool size_p
= optimize_function_for_size_p (cfun
);
14040 if (!rtx_equal_p (lo
, hi
))
14043 unsigned int orig_cost
14044 = aarch64_internal_mov_immediate (NULL_RTX
, src
, false, DImode
);
14045 unsigned int lo_cost
14046 = aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, SImode
);
14048 /* We want to transform:
14050 MOVK x1, 0x140, lsl 16
14051 MOVK x1, 0xc0da, lsl 32
14052 MOVK x1, 0x140, lsl 48
14056 MOVK w1, 0x140, lsl 16
14058 So we want to perform this only when we save two instructions
14059 or more. When optimizing for size, however, accept any code size
14061 if (size_p
&& orig_cost
<= lo_cost
)
14065 && (orig_cost
<= lo_cost
+ 1))
14068 rtx mem_lo
= adjust_address (dst
, SImode
, 0);
14069 if (!aarch64_mem_pair_operand (mem_lo
, SImode
))
14072 rtx tmp_reg
= gen_reg_rtx (SImode
);
14073 aarch64_expand_mov_immediate (tmp_reg
, lo
);
14074 rtx mem_hi
= aarch64_move_pointer (mem_lo
, GET_MODE_SIZE (SImode
));
14075 /* Don't emit an explicit store pair as this may not be always profitable.
14076 Let the sched-fusion logic decide whether to merge them. */
14077 emit_move_insn (mem_lo
, tmp_reg
);
14078 emit_move_insn (mem_hi
, tmp_reg
);
14083 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
14085 static unsigned HOST_WIDE_INT
14086 aarch64_asan_shadow_offset (void)
14088 return (HOST_WIDE_INT_1
<< 36);
14092 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size
,
14093 unsigned int align
,
14094 enum by_pieces_operation op
,
14097 /* STORE_BY_PIECES can be used when copying a constant string, but
14098 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
14099 For now we always fail this and let the move_by_pieces code copy
14100 the string from read-only memory. */
14101 if (op
== STORE_BY_PIECES
)
14104 return default_use_by_pieces_infrastructure_p (size
, align
, op
, speed_p
);
14108 aarch64_gen_ccmp_first (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
,
14109 int code
, tree treeop0
, tree treeop1
)
14111 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
14113 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
14115 struct expand_operand ops
[4];
14118 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
14120 op_mode
= GET_MODE (op0
);
14121 if (op_mode
== VOIDmode
)
14122 op_mode
= GET_MODE (op1
);
14130 icode
= CODE_FOR_cmpsi
;
14135 icode
= CODE_FOR_cmpdi
;
14140 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
14141 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpesf
: CODE_FOR_fcmpsf
;
14146 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
14147 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpedf
: CODE_FOR_fcmpdf
;
14155 op0
= prepare_operand (icode
, op0
, 0, op_mode
, cmp_mode
, unsignedp
);
14156 op1
= prepare_operand (icode
, op1
, 1, op_mode
, cmp_mode
, unsignedp
);
14162 *prep_seq
= get_insns ();
14165 create_fixed_operand (&ops
[0], op0
);
14166 create_fixed_operand (&ops
[1], op1
);
14169 if (!maybe_expand_insn (icode
, 2, ops
))
14174 *gen_seq
= get_insns ();
14177 return gen_rtx_fmt_ee ((rtx_code
) code
, cc_mode
,
14178 gen_rtx_REG (cc_mode
, CC_REGNUM
), const0_rtx
);
14182 aarch64_gen_ccmp_next (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
, rtx prev
,
14183 int cmp_code
, tree treeop0
, tree treeop1
, int bit_code
)
14185 rtx op0
, op1
, target
;
14186 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
14187 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
14189 struct expand_operand ops
[6];
14192 push_to_sequence (*prep_seq
);
14193 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
14195 op_mode
= GET_MODE (op0
);
14196 if (op_mode
== VOIDmode
)
14197 op_mode
= GET_MODE (op1
);
14205 icode
= CODE_FOR_ccmpsi
;
14210 icode
= CODE_FOR_ccmpdi
;
14215 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
14216 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpesf
: CODE_FOR_fccmpsf
;
14221 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
14222 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpedf
: CODE_FOR_fccmpdf
;
14230 op0
= prepare_operand (icode
, op0
, 2, op_mode
, cmp_mode
, unsignedp
);
14231 op1
= prepare_operand (icode
, op1
, 3, op_mode
, cmp_mode
, unsignedp
);
14237 *prep_seq
= get_insns ();
14240 target
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
14241 aarch64_cond
= aarch64_get_condition_code_1 (cc_mode
, (rtx_code
) cmp_code
);
14243 if (bit_code
!= AND
)
14245 prev
= gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev
),
14246 GET_MODE (XEXP (prev
, 0))),
14247 VOIDmode
, XEXP (prev
, 0), const0_rtx
);
14248 aarch64_cond
= AARCH64_INVERSE_CONDITION_CODE (aarch64_cond
);
14251 create_fixed_operand (&ops
[0], XEXP (prev
, 0));
14252 create_fixed_operand (&ops
[1], target
);
14253 create_fixed_operand (&ops
[2], op0
);
14254 create_fixed_operand (&ops
[3], op1
);
14255 create_fixed_operand (&ops
[4], prev
);
14256 create_fixed_operand (&ops
[5], GEN_INT (aarch64_cond
));
14258 push_to_sequence (*gen_seq
);
14259 if (!maybe_expand_insn (icode
, 6, ops
))
14265 *gen_seq
= get_insns ();
14268 return gen_rtx_fmt_ee ((rtx_code
) cmp_code
, VOIDmode
, target
, const0_rtx
);
14271 #undef TARGET_GEN_CCMP_FIRST
14272 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
14274 #undef TARGET_GEN_CCMP_NEXT
14275 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
14277 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
14278 instruction fusion of some sort. */
14281 aarch64_macro_fusion_p (void)
14283 return aarch64_tune_params
.fusible_ops
!= AARCH64_FUSE_NOTHING
;
14287 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
14288 should be kept together during scheduling. */
14291 aarch_macro_fusion_pair_p (rtx_insn
*prev
, rtx_insn
*curr
)
14294 rtx prev_set
= single_set (prev
);
14295 rtx curr_set
= single_set (curr
);
14296 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
14297 bool simple_sets_p
= prev_set
&& curr_set
&& !any_condjump_p (curr
);
14299 if (!aarch64_macro_fusion_p ())
14302 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK
))
14304 /* We are trying to match:
14305 prev (mov) == (set (reg r0) (const_int imm16))
14306 curr (movk) == (set (zero_extract (reg r0)
14309 (const_int imm16_1)) */
14311 set_dest
= SET_DEST (curr_set
);
14313 if (GET_CODE (set_dest
) == ZERO_EXTRACT
14314 && CONST_INT_P (SET_SRC (curr_set
))
14315 && CONST_INT_P (SET_SRC (prev_set
))
14316 && CONST_INT_P (XEXP (set_dest
, 2))
14317 && INTVAL (XEXP (set_dest
, 2)) == 16
14318 && REG_P (XEXP (set_dest
, 0))
14319 && REG_P (SET_DEST (prev_set
))
14320 && REGNO (XEXP (set_dest
, 0)) == REGNO (SET_DEST (prev_set
)))
14326 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD
))
14329 /* We're trying to match:
14330 prev (adrp) == (set (reg r1)
14331 (high (symbol_ref ("SYM"))))
14332 curr (add) == (set (reg r0)
14334 (symbol_ref ("SYM"))))
14335 Note that r0 need not necessarily be the same as r1, especially
14336 during pre-regalloc scheduling. */
14338 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
14339 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
14341 if (GET_CODE (SET_SRC (curr_set
)) == LO_SUM
14342 && REG_P (XEXP (SET_SRC (curr_set
), 0))
14343 && REGNO (XEXP (SET_SRC (curr_set
), 0))
14344 == REGNO (SET_DEST (prev_set
))
14345 && rtx_equal_p (XEXP (SET_SRC (prev_set
), 0),
14346 XEXP (SET_SRC (curr_set
), 1)))
14351 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK
))
14354 /* We're trying to match:
14355 prev (movk) == (set (zero_extract (reg r0)
14358 (const_int imm16_1))
14359 curr (movk) == (set (zero_extract (reg r0)
14362 (const_int imm16_2)) */
14364 if (GET_CODE (SET_DEST (prev_set
)) == ZERO_EXTRACT
14365 && GET_CODE (SET_DEST (curr_set
)) == ZERO_EXTRACT
14366 && REG_P (XEXP (SET_DEST (prev_set
), 0))
14367 && REG_P (XEXP (SET_DEST (curr_set
), 0))
14368 && REGNO (XEXP (SET_DEST (prev_set
), 0))
14369 == REGNO (XEXP (SET_DEST (curr_set
), 0))
14370 && CONST_INT_P (XEXP (SET_DEST (prev_set
), 2))
14371 && CONST_INT_P (XEXP (SET_DEST (curr_set
), 2))
14372 && INTVAL (XEXP (SET_DEST (prev_set
), 2)) == 32
14373 && INTVAL (XEXP (SET_DEST (curr_set
), 2)) == 48
14374 && CONST_INT_P (SET_SRC (prev_set
))
14375 && CONST_INT_P (SET_SRC (curr_set
)))
14379 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR
))
14381 /* We're trying to match:
14382 prev (adrp) == (set (reg r0)
14383 (high (symbol_ref ("SYM"))))
14384 curr (ldr) == (set (reg r1)
14385 (mem (lo_sum (reg r0)
14386 (symbol_ref ("SYM")))))
14388 curr (ldr) == (set (reg r1)
14391 (symbol_ref ("SYM")))))) */
14392 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
14393 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
14395 rtx curr_src
= SET_SRC (curr_set
);
14397 if (GET_CODE (curr_src
) == ZERO_EXTEND
)
14398 curr_src
= XEXP (curr_src
, 0);
14400 if (MEM_P (curr_src
) && GET_CODE (XEXP (curr_src
, 0)) == LO_SUM
14401 && REG_P (XEXP (XEXP (curr_src
, 0), 0))
14402 && REGNO (XEXP (XEXP (curr_src
, 0), 0))
14403 == REGNO (SET_DEST (prev_set
))
14404 && rtx_equal_p (XEXP (XEXP (curr_src
, 0), 1),
14405 XEXP (SET_SRC (prev_set
), 0)))
14410 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC
)
14411 && aarch_crypto_can_dual_issue (prev
, curr
))
14414 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH
)
14415 && any_condjump_p (curr
))
14417 enum attr_type prev_type
= get_attr_type (prev
);
14419 unsigned int condreg1
, condreg2
;
14421 aarch64_fixed_condition_code_regs (&condreg1
, &condreg2
);
14422 cc_reg_1
= gen_rtx_REG (CCmode
, condreg1
);
14424 if (reg_referenced_p (cc_reg_1
, PATTERN (curr
))
14426 && modified_in_p (cc_reg_1
, prev
))
14428 /* FIXME: this misses some which is considered simple arthematic
14429 instructions for ThunderX. Simple shifts are missed here. */
14430 if (prev_type
== TYPE_ALUS_SREG
14431 || prev_type
== TYPE_ALUS_IMM
14432 || prev_type
== TYPE_LOGICS_REG
14433 || prev_type
== TYPE_LOGICS_IMM
)
14440 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH
)
14441 && any_condjump_p (curr
))
14443 /* We're trying to match:
14444 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
14445 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
14447 (label_ref ("SYM"))
14449 if (SET_DEST (curr_set
) == (pc_rtx
)
14450 && GET_CODE (SET_SRC (curr_set
)) == IF_THEN_ELSE
14451 && REG_P (XEXP (XEXP (SET_SRC (curr_set
), 0), 0))
14452 && REG_P (SET_DEST (prev_set
))
14453 && REGNO (SET_DEST (prev_set
))
14454 == REGNO (XEXP (XEXP (SET_SRC (curr_set
), 0), 0)))
14456 /* Fuse ALU operations followed by conditional branch instruction. */
14457 switch (get_attr_type (prev
))
14460 case TYPE_ALU_SREG
:
14463 case TYPE_ADCS_REG
:
14464 case TYPE_ADCS_IMM
:
14465 case TYPE_LOGIC_REG
:
14466 case TYPE_LOGIC_IMM
:
14470 case TYPE_SHIFT_REG
:
14471 case TYPE_SHIFT_IMM
:
14486 /* Return true iff the instruction fusion described by OP is enabled. */
14489 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op
)
14491 return (aarch64_tune_params
.fusible_ops
& op
) != 0;
14494 /* If MEM is in the form of [base+offset], extract the two parts
14495 of address and set to BASE and OFFSET, otherwise return false
14496 after clearing BASE and OFFSET. */
14499 extract_base_offset_in_addr (rtx mem
, rtx
*base
, rtx
*offset
)
14503 gcc_assert (MEM_P (mem
));
14505 addr
= XEXP (mem
, 0);
14510 *offset
= const0_rtx
;
14514 if (GET_CODE (addr
) == PLUS
14515 && REG_P (XEXP (addr
, 0)) && CONST_INT_P (XEXP (addr
, 1)))
14517 *base
= XEXP (addr
, 0);
14518 *offset
= XEXP (addr
, 1);
14523 *offset
= NULL_RTX
;
14528 /* Types for scheduling fusion. */
14529 enum sched_fusion_type
14531 SCHED_FUSION_NONE
= 0,
14532 SCHED_FUSION_LD_SIGN_EXTEND
,
14533 SCHED_FUSION_LD_ZERO_EXTEND
,
14539 /* If INSN is a load or store of address in the form of [base+offset],
14540 extract the two parts and set to BASE and OFFSET. Return scheduling
14541 fusion type this INSN is. */
14543 static enum sched_fusion_type
14544 fusion_load_store (rtx_insn
*insn
, rtx
*base
, rtx
*offset
)
14547 enum sched_fusion_type fusion
= SCHED_FUSION_LD
;
14549 gcc_assert (INSN_P (insn
));
14550 x
= PATTERN (insn
);
14551 if (GET_CODE (x
) != SET
)
14552 return SCHED_FUSION_NONE
;
14555 dest
= SET_DEST (x
);
14557 machine_mode dest_mode
= GET_MODE (dest
);
14559 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode
))
14560 return SCHED_FUSION_NONE
;
14562 if (GET_CODE (src
) == SIGN_EXTEND
)
14564 fusion
= SCHED_FUSION_LD_SIGN_EXTEND
;
14565 src
= XEXP (src
, 0);
14566 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
14567 return SCHED_FUSION_NONE
;
14569 else if (GET_CODE (src
) == ZERO_EXTEND
)
14571 fusion
= SCHED_FUSION_LD_ZERO_EXTEND
;
14572 src
= XEXP (src
, 0);
14573 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
14574 return SCHED_FUSION_NONE
;
14577 if (GET_CODE (src
) == MEM
&& REG_P (dest
))
14578 extract_base_offset_in_addr (src
, base
, offset
);
14579 else if (GET_CODE (dest
) == MEM
&& (REG_P (src
) || src
== const0_rtx
))
14581 fusion
= SCHED_FUSION_ST
;
14582 extract_base_offset_in_addr (dest
, base
, offset
);
14585 return SCHED_FUSION_NONE
;
14587 if (*base
== NULL_RTX
|| *offset
== NULL_RTX
)
14588 fusion
= SCHED_FUSION_NONE
;
14593 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14595 Currently we only support to fuse ldr or str instructions, so FUSION_PRI
14596 and PRI are only calculated for these instructions. For other instruction,
14597 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
14598 type instruction fusion can be added by returning different priorities.
14600 It's important that irrelevant instructions get the largest FUSION_PRI. */
14603 aarch64_sched_fusion_priority (rtx_insn
*insn
, int max_pri
,
14604 int *fusion_pri
, int *pri
)
14608 enum sched_fusion_type fusion
;
14610 gcc_assert (INSN_P (insn
));
14613 fusion
= fusion_load_store (insn
, &base
, &offset
);
14614 if (fusion
== SCHED_FUSION_NONE
)
14621 /* Set FUSION_PRI according to fusion type and base register. */
14622 *fusion_pri
= tmp
- fusion
* FIRST_PSEUDO_REGISTER
- REGNO (base
);
14624 /* Calculate PRI. */
14627 /* INSN with smaller offset goes first. */
14628 off_val
= (int)(INTVAL (offset
));
14630 tmp
-= (off_val
& 0xfffff);
14632 tmp
+= ((- off_val
) & 0xfffff);
14638 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14639 Adjust priority of sha1h instructions so they are scheduled before
14640 other SHA1 instructions. */
14643 aarch64_sched_adjust_priority (rtx_insn
*insn
, int priority
)
14645 rtx x
= PATTERN (insn
);
14647 if (GET_CODE (x
) == SET
)
14651 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_SHA1H
)
14652 return priority
+ 10;
14658 /* Given OPERANDS of consecutive load/store, check if we can merge
14659 them into ldp/stp. LOAD is true if they are load instructions.
14660 MODE is the mode of memory operands. */
14663 aarch64_operands_ok_for_ldpstp (rtx
*operands
, bool load
,
14666 HOST_WIDE_INT offval_1
, offval_2
, msize
;
14667 enum reg_class rclass_1
, rclass_2
;
14668 rtx mem_1
, mem_2
, reg_1
, reg_2
, base_1
, base_2
, offset_1
, offset_2
;
14672 mem_1
= operands
[1];
14673 mem_2
= operands
[3];
14674 reg_1
= operands
[0];
14675 reg_2
= operands
[2];
14676 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
));
14677 if (REGNO (reg_1
) == REGNO (reg_2
))
14682 mem_1
= operands
[0];
14683 mem_2
= operands
[2];
14684 reg_1
= operands
[1];
14685 reg_2
= operands
[3];
14688 /* The mems cannot be volatile. */
14689 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
))
14692 /* If we have SImode and slow unaligned ldp,
14693 check the alignment to be at least 8 byte. */
14695 && (aarch64_tune_params
.extra_tuning_flags
14696 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
14698 && MEM_ALIGN (mem_1
) < 8 * BITS_PER_UNIT
)
14701 /* Check if the addresses are in the form of [base+offset]. */
14702 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
14703 if (base_1
== NULL_RTX
|| offset_1
== NULL_RTX
)
14705 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
14706 if (base_2
== NULL_RTX
|| offset_2
== NULL_RTX
)
14709 /* Check if the bases are same. */
14710 if (!rtx_equal_p (base_1
, base_2
))
14713 offval_1
= INTVAL (offset_1
);
14714 offval_2
= INTVAL (offset_2
);
14715 msize
= GET_MODE_SIZE (mode
);
14716 /* Check if the offsets are consecutive. */
14717 if (offval_1
!= (offval_2
+ msize
) && offval_2
!= (offval_1
+ msize
))
14720 /* Check if the addresses are clobbered by load. */
14723 if (reg_mentioned_p (reg_1
, mem_1
))
14726 /* In increasing order, the last load can clobber the address. */
14727 if (offval_1
> offval_2
&& reg_mentioned_p (reg_2
, mem_2
))
14731 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
14732 rclass_1
= FP_REGS
;
14734 rclass_1
= GENERAL_REGS
;
14736 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
14737 rclass_2
= FP_REGS
;
14739 rclass_2
= GENERAL_REGS
;
14741 /* Check if the registers are of same class. */
14742 if (rclass_1
!= rclass_2
)
14748 /* Given OPERANDS of consecutive load/store, check if we can merge
14749 them into ldp/stp by adjusting the offset. LOAD is true if they
14750 are load instructions. MODE is the mode of memory operands.
14752 Given below consecutive stores:
14754 str w1, [xb, 0x100]
14755 str w1, [xb, 0x104]
14756 str w1, [xb, 0x108]
14757 str w1, [xb, 0x10c]
14759 Though the offsets are out of the range supported by stp, we can
14760 still pair them after adjusting the offset, like:
14762 add scratch, xb, 0x100
14763 stp w1, w1, [scratch]
14764 stp w1, w1, [scratch, 0x8]
14766 The peephole patterns detecting this opportunity should guarantee
14767 the scratch register is avaliable. */
14770 aarch64_operands_adjust_ok_for_ldpstp (rtx
*operands
, bool load
,
14773 enum reg_class rclass_1
, rclass_2
, rclass_3
, rclass_4
;
14774 HOST_WIDE_INT offval_1
, offval_2
, offval_3
, offval_4
, msize
;
14775 rtx mem_1
, mem_2
, mem_3
, mem_4
, reg_1
, reg_2
, reg_3
, reg_4
;
14776 rtx base_1
, base_2
, base_3
, base_4
, offset_1
, offset_2
, offset_3
, offset_4
;
14780 reg_1
= operands
[0];
14781 mem_1
= operands
[1];
14782 reg_2
= operands
[2];
14783 mem_2
= operands
[3];
14784 reg_3
= operands
[4];
14785 mem_3
= operands
[5];
14786 reg_4
= operands
[6];
14787 mem_4
= operands
[7];
14788 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
)
14789 && REG_P (reg_3
) && REG_P (reg_4
));
14790 if (REGNO (reg_1
) == REGNO (reg_2
) || REGNO (reg_3
) == REGNO (reg_4
))
14795 mem_1
= operands
[0];
14796 reg_1
= operands
[1];
14797 mem_2
= operands
[2];
14798 reg_2
= operands
[3];
14799 mem_3
= operands
[4];
14800 reg_3
= operands
[5];
14801 mem_4
= operands
[6];
14802 reg_4
= operands
[7];
14804 /* Skip if memory operand is by itslef valid for ldp/stp. */
14805 if (!MEM_P (mem_1
) || aarch64_mem_pair_operand (mem_1
, mode
))
14808 /* The mems cannot be volatile. */
14809 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
)
14810 || MEM_VOLATILE_P (mem_3
) ||MEM_VOLATILE_P (mem_4
))
14813 /* Check if the addresses are in the form of [base+offset]. */
14814 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
14815 if (base_1
== NULL_RTX
|| offset_1
== NULL_RTX
)
14817 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
14818 if (base_2
== NULL_RTX
|| offset_2
== NULL_RTX
)
14820 extract_base_offset_in_addr (mem_3
, &base_3
, &offset_3
);
14821 if (base_3
== NULL_RTX
|| offset_3
== NULL_RTX
)
14823 extract_base_offset_in_addr (mem_4
, &base_4
, &offset_4
);
14824 if (base_4
== NULL_RTX
|| offset_4
== NULL_RTX
)
14827 /* Check if the bases are same. */
14828 if (!rtx_equal_p (base_1
, base_2
)
14829 || !rtx_equal_p (base_2
, base_3
)
14830 || !rtx_equal_p (base_3
, base_4
))
14833 offval_1
= INTVAL (offset_1
);
14834 offval_2
= INTVAL (offset_2
);
14835 offval_3
= INTVAL (offset_3
);
14836 offval_4
= INTVAL (offset_4
);
14837 msize
= GET_MODE_SIZE (mode
);
14838 /* Check if the offsets are consecutive. */
14839 if ((offval_1
!= (offval_2
+ msize
)
14840 || offval_1
!= (offval_3
+ msize
* 2)
14841 || offval_1
!= (offval_4
+ msize
* 3))
14842 && (offval_4
!= (offval_3
+ msize
)
14843 || offval_4
!= (offval_2
+ msize
* 2)
14844 || offval_4
!= (offval_1
+ msize
* 3)))
14847 /* Check if the addresses are clobbered by load. */
14850 if (reg_mentioned_p (reg_1
, mem_1
)
14851 || reg_mentioned_p (reg_2
, mem_2
)
14852 || reg_mentioned_p (reg_3
, mem_3
))
14855 /* In increasing order, the last load can clobber the address. */
14856 if (offval_1
> offval_2
&& reg_mentioned_p (reg_4
, mem_4
))
14860 /* If we have SImode and slow unaligned ldp,
14861 check the alignment to be at least 8 byte. */
14863 && (aarch64_tune_params
.extra_tuning_flags
14864 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
14866 && MEM_ALIGN (mem_1
) < 8 * BITS_PER_UNIT
)
14869 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
14870 rclass_1
= FP_REGS
;
14872 rclass_1
= GENERAL_REGS
;
14874 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
14875 rclass_2
= FP_REGS
;
14877 rclass_2
= GENERAL_REGS
;
14879 if (REG_P (reg_3
) && FP_REGNUM_P (REGNO (reg_3
)))
14880 rclass_3
= FP_REGS
;
14882 rclass_3
= GENERAL_REGS
;
14884 if (REG_P (reg_4
) && FP_REGNUM_P (REGNO (reg_4
)))
14885 rclass_4
= FP_REGS
;
14887 rclass_4
= GENERAL_REGS
;
14889 /* Check if the registers are of same class. */
14890 if (rclass_1
!= rclass_2
|| rclass_2
!= rclass_3
|| rclass_3
!= rclass_4
)
14896 /* Given OPERANDS of consecutive load/store, this function pairs them
14897 into ldp/stp after adjusting the offset. It depends on the fact
14898 that addresses of load/store instructions are in increasing order.
14899 MODE is the mode of memory operands. CODE is the rtl operator
14900 which should be applied to all memory operands, it's SIGN_EXTEND,
14901 ZERO_EXTEND or UNKNOWN. */
14904 aarch64_gen_adjusted_ldpstp (rtx
*operands
, bool load
,
14905 machine_mode mode
, RTX_CODE code
)
14907 rtx base
, offset
, t1
, t2
;
14908 rtx mem_1
, mem_2
, mem_3
, mem_4
;
14909 HOST_WIDE_INT off_val
, abs_off
, adj_off
, new_off
, stp_off_limit
, msize
;
14913 mem_1
= operands
[1];
14914 mem_2
= operands
[3];
14915 mem_3
= operands
[5];
14916 mem_4
= operands
[7];
14920 mem_1
= operands
[0];
14921 mem_2
= operands
[2];
14922 mem_3
= operands
[4];
14923 mem_4
= operands
[6];
14924 gcc_assert (code
== UNKNOWN
);
14927 extract_base_offset_in_addr (mem_1
, &base
, &offset
);
14928 gcc_assert (base
!= NULL_RTX
&& offset
!= NULL_RTX
);
14930 /* Adjust offset thus it can fit in ldp/stp instruction. */
14931 msize
= GET_MODE_SIZE (mode
);
14932 stp_off_limit
= msize
* 0x40;
14933 off_val
= INTVAL (offset
);
14934 abs_off
= (off_val
< 0) ? -off_val
: off_val
;
14935 new_off
= abs_off
% stp_off_limit
;
14936 adj_off
= abs_off
- new_off
;
14938 /* Further adjust to make sure all offsets are OK. */
14939 if ((new_off
+ msize
* 2) >= stp_off_limit
)
14941 adj_off
+= stp_off_limit
;
14942 new_off
-= stp_off_limit
;
14945 /* Make sure the adjustment can be done with ADD/SUB instructions. */
14946 if (adj_off
>= 0x1000)
14951 adj_off
= -adj_off
;
14952 new_off
= -new_off
;
14955 /* Create new memory references. */
14956 mem_1
= change_address (mem_1
, VOIDmode
,
14957 plus_constant (DImode
, operands
[8], new_off
));
14959 /* Check if the adjusted address is OK for ldp/stp. */
14960 if (!aarch64_mem_pair_operand (mem_1
, mode
))
14963 msize
= GET_MODE_SIZE (mode
);
14964 mem_2
= change_address (mem_2
, VOIDmode
,
14965 plus_constant (DImode
,
14968 mem_3
= change_address (mem_3
, VOIDmode
,
14969 plus_constant (DImode
,
14971 new_off
+ msize
* 2));
14972 mem_4
= change_address (mem_4
, VOIDmode
,
14973 plus_constant (DImode
,
14975 new_off
+ msize
* 3));
14977 if (code
== ZERO_EXTEND
)
14979 mem_1
= gen_rtx_ZERO_EXTEND (DImode
, mem_1
);
14980 mem_2
= gen_rtx_ZERO_EXTEND (DImode
, mem_2
);
14981 mem_3
= gen_rtx_ZERO_EXTEND (DImode
, mem_3
);
14982 mem_4
= gen_rtx_ZERO_EXTEND (DImode
, mem_4
);
14984 else if (code
== SIGN_EXTEND
)
14986 mem_1
= gen_rtx_SIGN_EXTEND (DImode
, mem_1
);
14987 mem_2
= gen_rtx_SIGN_EXTEND (DImode
, mem_2
);
14988 mem_3
= gen_rtx_SIGN_EXTEND (DImode
, mem_3
);
14989 mem_4
= gen_rtx_SIGN_EXTEND (DImode
, mem_4
);
14994 operands
[1] = mem_1
;
14995 operands
[3] = mem_2
;
14996 operands
[5] = mem_3
;
14997 operands
[7] = mem_4
;
15001 operands
[0] = mem_1
;
15002 operands
[2] = mem_2
;
15003 operands
[4] = mem_3
;
15004 operands
[6] = mem_4
;
15007 /* Emit adjusting instruction. */
15008 emit_insn (gen_rtx_SET (operands
[8], plus_constant (DImode
, base
, adj_off
)));
15009 /* Emit ldp/stp instructions. */
15010 t1
= gen_rtx_SET (operands
[0], operands
[1]);
15011 t2
= gen_rtx_SET (operands
[2], operands
[3]);
15012 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
15013 t1
= gen_rtx_SET (operands
[4], operands
[5]);
15014 t2
= gen_rtx_SET (operands
[6], operands
[7]);
15015 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
15019 /* Return 1 if pseudo register should be created and used to hold
15020 GOT address for PIC code. */
15023 aarch64_use_pseudo_pic_reg (void)
15025 return aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
;
15028 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
15031 aarch64_unspec_may_trap_p (const_rtx x
, unsigned flags
)
15033 switch (XINT (x
, 1))
15035 case UNSPEC_GOTSMALLPIC
:
15036 case UNSPEC_GOTSMALLPIC28K
:
15037 case UNSPEC_GOTTINYPIC
:
15043 return default_unspec_may_trap_p (x
, flags
);
15047 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
15048 return the log2 of that value. Otherwise return -1. */
15051 aarch64_fpconst_pow_of_2 (rtx x
)
15053 const REAL_VALUE_TYPE
*r
;
15055 if (!CONST_DOUBLE_P (x
))
15058 r
= CONST_DOUBLE_REAL_VALUE (x
);
15060 if (REAL_VALUE_NEGATIVE (*r
)
15061 || REAL_VALUE_ISNAN (*r
)
15062 || REAL_VALUE_ISINF (*r
)
15063 || !real_isinteger (r
, DFmode
))
15066 return exact_log2 (real_to_integer (r
));
15069 /* If X is a vector of equal CONST_DOUBLE values and that value is
15070 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
15073 aarch64_vec_fpconst_pow_of_2 (rtx x
)
15075 if (GET_CODE (x
) != CONST_VECTOR
)
15078 if (GET_MODE_CLASS (GET_MODE (x
)) != MODE_VECTOR_FLOAT
)
15081 int firstval
= aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, 0));
15085 for (int i
= 1; i
< CONST_VECTOR_NUNITS (x
); i
++)
15086 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, i
)) != firstval
)
15092 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
15095 __fp16 always promotes through this hook.
15096 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
15097 through the generic excess precision logic rather than here. */
15100 aarch64_promoted_type (const_tree t
)
15102 if (SCALAR_FLOAT_TYPE_P (t
)
15103 && TYPE_MAIN_VARIANT (t
) == aarch64_fp16_type_node
)
15104 return float_type_node
;
15109 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
15112 aarch64_optab_supported_p (int op
, machine_mode mode1
, machine_mode
,
15113 optimization_type opt_type
)
15118 return opt_type
== OPTIMIZE_FOR_SPEED
&& use_rsqrt_p (mode1
);
15125 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
15126 if MODE is HFmode, and punt to the generic implementation otherwise. */
15129 aarch64_libgcc_floating_mode_supported_p (machine_mode mode
)
15131 return (mode
== HFmode
15133 : default_libgcc_floating_mode_supported_p (mode
));
15136 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
15137 if MODE is HFmode, and punt to the generic implementation otherwise. */
15140 aarch64_scalar_mode_supported_p (machine_mode mode
)
15142 return (mode
== HFmode
15144 : default_scalar_mode_supported_p (mode
));
15147 /* Set the value of FLT_EVAL_METHOD.
15148 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
15150 0: evaluate all operations and constants, whose semantic type has at
15151 most the range and precision of type float, to the range and
15152 precision of float; evaluate all other operations and constants to
15153 the range and precision of the semantic type;
15155 N, where _FloatN is a supported interchange floating type
15156 evaluate all operations and constants, whose semantic type has at
15157 most the range and precision of _FloatN type, to the range and
15158 precision of the _FloatN type; evaluate all other operations and
15159 constants to the range and precision of the semantic type;
15161 If we have the ARMv8.2-A extensions then we support _Float16 in native
15162 precision, so we should set this to 16. Otherwise, we support the type,
15163 but want to evaluate expressions in float precision, so set this to
15166 static enum flt_eval_method
15167 aarch64_excess_precision (enum excess_precision_type type
)
15171 case EXCESS_PRECISION_TYPE_FAST
:
15172 case EXCESS_PRECISION_TYPE_STANDARD
:
15173 /* We can calculate either in 16-bit range and precision or
15174 32-bit range and precision. Make that decision based on whether
15175 we have native support for the ARMv8.2-A 16-bit floating-point
15176 instructions or not. */
15177 return (TARGET_FP_F16INST
15178 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
15179 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
);
15180 case EXCESS_PRECISION_TYPE_IMPLICIT
:
15181 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
;
15183 gcc_unreachable ();
15185 return FLT_EVAL_METHOD_UNPREDICTABLE
;
15188 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
15189 scheduled for speculative execution. Reject the long-running division
15190 and square-root instructions. */
15193 aarch64_sched_can_speculate_insn (rtx_insn
*insn
)
15195 switch (get_attr_type (insn
))
15203 case TYPE_NEON_FP_SQRT_S
:
15204 case TYPE_NEON_FP_SQRT_D
:
15205 case TYPE_NEON_FP_SQRT_S_Q
:
15206 case TYPE_NEON_FP_SQRT_D_Q
:
15207 case TYPE_NEON_FP_DIV_S
:
15208 case TYPE_NEON_FP_DIV_D
:
15209 case TYPE_NEON_FP_DIV_S_Q
:
15210 case TYPE_NEON_FP_DIV_D_Q
:
/* Target-specific selftests.  */

#if CHECKING_P

namespace selftest {

/* Selftest for the RTL loader.
   Verify that the RTL loader copes with a dump from
   print_rtx_function.  This is essentially just a test that class
   function_reader can handle a real dump, but it also verifies
   that lookup_reg_by_dump_name correctly handles hard regs.
   The presence of hard reg names in the dump means that the test is
   target-specific, hence it is in this file.  */

static void
aarch64_test_loading_full_dump ()
{
  rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));

  ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));

  rtx_insn *insn_1 = get_insn_by_uid (1);
  ASSERT_EQ (NOTE, GET_CODE (insn_1));

  rtx_insn *insn_15 = get_insn_by_uid (15);
  ASSERT_EQ (INSN, GET_CODE (insn_15));
  ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));

  /* Verify crtl->return_rtx.  */
  ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
  ASSERT_EQ (0, REGNO (crtl->return_rtx));
  ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
}

/* Run all target-specific selftests.  */

static void
aarch64_run_selftests (void)
{
  aarch64_test_loading_full_dump ();
}

} // namespace selftest

#endif /* #if CHECKING_P */
15263 #undef TARGET_ADDRESS_COST
15264 #define TARGET_ADDRESS_COST aarch64_address_cost
15266 /* This hook determines whether unnamed bitfields affect the alignment
15267 of the containing structure. The hook returns true if the structure
15268 should inherit the alignment requirements of an unnamed bitfield's
15270 #undef TARGET_ALIGN_ANON_BITFIELD
15271 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
15273 #undef TARGET_ASM_ALIGNED_DI_OP
15274 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
15276 #undef TARGET_ASM_ALIGNED_HI_OP
15277 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
15279 #undef TARGET_ASM_ALIGNED_SI_OP
15280 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
15282 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
15283 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
15284 hook_bool_const_tree_hwi_hwi_const_tree_true
15286 #undef TARGET_ASM_FILE_START
15287 #define TARGET_ASM_FILE_START aarch64_start_file
15289 #undef TARGET_ASM_OUTPUT_MI_THUNK
15290 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
15292 #undef TARGET_ASM_SELECT_RTX_SECTION
15293 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
15295 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
15296 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
15298 #undef TARGET_BUILD_BUILTIN_VA_LIST
15299 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
15301 #undef TARGET_CALLEE_COPIES
15302 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
15304 #undef TARGET_CAN_ELIMINATE
15305 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
15307 #undef TARGET_CAN_INLINE_P
15308 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
15310 #undef TARGET_CANNOT_FORCE_CONST_MEM
15311 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
15313 #undef TARGET_CASE_VALUES_THRESHOLD
15314 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
15316 #undef TARGET_CONDITIONAL_REGISTER_USAGE
15317 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
15319 /* Only the least significant bit is used for initialization guard
15321 #undef TARGET_CXX_GUARD_MASK_BIT
15322 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
15324 #undef TARGET_C_MODE_FOR_SUFFIX
15325 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
15327 #ifdef TARGET_BIG_ENDIAN_DEFAULT
15328 #undef TARGET_DEFAULT_TARGET_FLAGS
15329 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
15332 #undef TARGET_CLASS_MAX_NREGS
15333 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
15335 #undef TARGET_BUILTIN_DECL
15336 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
15338 #undef TARGET_BUILTIN_RECIPROCAL
15339 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
15341 #undef TARGET_C_EXCESS_PRECISION
15342 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
15344 #undef TARGET_EXPAND_BUILTIN
15345 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
15347 #undef TARGET_EXPAND_BUILTIN_VA_START
15348 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
15350 #undef TARGET_FOLD_BUILTIN
15351 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
15353 #undef TARGET_FUNCTION_ARG
15354 #define TARGET_FUNCTION_ARG aarch64_function_arg
15356 #undef TARGET_FUNCTION_ARG_ADVANCE
15357 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
15359 #undef TARGET_FUNCTION_ARG_BOUNDARY
15360 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
15362 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
15363 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
15365 #undef TARGET_FUNCTION_VALUE
15366 #define TARGET_FUNCTION_VALUE aarch64_function_value
15368 #undef TARGET_FUNCTION_VALUE_REGNO_P
15369 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
15371 #undef TARGET_FRAME_POINTER_REQUIRED
15372 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
15374 #undef TARGET_GIMPLE_FOLD_BUILTIN
15375 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
15377 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
15378 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
15380 #undef TARGET_INIT_BUILTINS
15381 #define TARGET_INIT_BUILTINS aarch64_init_builtins
15383 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
15384 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
15385 aarch64_ira_change_pseudo_allocno_class
15387 #undef TARGET_LEGITIMATE_ADDRESS_P
15388 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
15390 #undef TARGET_LEGITIMATE_CONSTANT_P
15391 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
15393 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
15394 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
15395 aarch64_legitimize_address_displacement
15397 #undef TARGET_LIBGCC_CMP_RETURN_MODE
15398 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
15400 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
15401 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
15402 aarch64_libgcc_floating_mode_supported_p
15404 #undef TARGET_MANGLE_TYPE
15405 #define TARGET_MANGLE_TYPE aarch64_mangle_type
15407 #undef TARGET_MEMORY_MOVE_COST
15408 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
15410 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
15411 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
15413 #undef TARGET_MUST_PASS_IN_STACK
15414 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
15416 /* This target hook should return true if accesses to volatile bitfields
15417 should use the narrowest mode possible. It should return false if these
15418 accesses should use the bitfield container type. */
15419 #undef TARGET_NARROW_VOLATILE_BITFIELD
15420 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
15422 #undef TARGET_OPTION_OVERRIDE
15423 #define TARGET_OPTION_OVERRIDE aarch64_override_options
15425 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
15426 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
15427 aarch64_override_options_after_change
15429 #undef TARGET_OPTION_SAVE
15430 #define TARGET_OPTION_SAVE aarch64_option_save
15432 #undef TARGET_OPTION_RESTORE
15433 #define TARGET_OPTION_RESTORE aarch64_option_restore
15435 #undef TARGET_OPTION_PRINT
15436 #define TARGET_OPTION_PRINT aarch64_option_print
15438 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
15439 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
15441 #undef TARGET_SET_CURRENT_FUNCTION
15442 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
15444 #undef TARGET_PASS_BY_REFERENCE
15445 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
15447 #undef TARGET_PREFERRED_RELOAD_CLASS
15448 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
15450 #undef TARGET_SCHED_REASSOCIATION_WIDTH
15451 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
15453 #undef TARGET_PROMOTED_TYPE
15454 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
15456 #undef TARGET_SECONDARY_RELOAD
15457 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
15459 #undef TARGET_SHIFT_TRUNCATION_MASK
15460 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
15462 #undef TARGET_SETUP_INCOMING_VARARGS
15463 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
15465 #undef TARGET_STRUCT_VALUE_RTX
15466 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
15468 #undef TARGET_REGISTER_MOVE_COST
15469 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
15471 #undef TARGET_RETURN_IN_MEMORY
15472 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
15474 #undef TARGET_RETURN_IN_MSB
15475 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
15477 #undef TARGET_RTX_COSTS
15478 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
15480 #undef TARGET_SCALAR_MODE_SUPPORTED_P
15481 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
15483 #undef TARGET_SCHED_ISSUE_RATE
15484 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
15486 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
15487 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
15488 aarch64_sched_first_cycle_multipass_dfa_lookahead
15490 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
15491 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
15492 aarch64_first_cycle_multipass_dfa_lookahead_guard
15494 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
15495 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
15496 aarch64_get_separate_components
15498 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
15499 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
15500 aarch64_components_for_bb
15502 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
15503 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
15504 aarch64_disqualify_components
15506 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
15507 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
15508 aarch64_emit_prologue_components
15510 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
15511 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
15512 aarch64_emit_epilogue_components
15514 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
15515 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
15516 aarch64_set_handled_components
15518 #undef TARGET_TRAMPOLINE_INIT
15519 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
15521 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
15522 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
15524 #undef TARGET_VECTOR_MODE_SUPPORTED_P
15525 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
15527 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
15528 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
15529 aarch64_builtin_support_vector_misalignment
15531 #undef TARGET_ARRAY_MODE_SUPPORTED_P
15532 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
15534 #undef TARGET_VECTORIZE_ADD_STMT_COST
15535 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
15537 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
15538 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
15539 aarch64_builtin_vectorization_cost
15541 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
15542 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
15544 #undef TARGET_VECTORIZE_BUILTINS
15545 #define TARGET_VECTORIZE_BUILTINS
15547 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
15548 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
15549 aarch64_builtin_vectorized_function
15551 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
15552 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
15553 aarch64_autovectorize_vector_sizes
15555 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
15556 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
15557 aarch64_atomic_assign_expand_fenv
15559 /* Section anchor support. */
15561 #undef TARGET_MIN_ANCHOR_OFFSET
15562 #define TARGET_MIN_ANCHOR_OFFSET -256
15564 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
15565 byte offset; we can do much more for larger data types, but have no way
15566 to determine the size of the access. We assume accesses are aligned. */
15567 #undef TARGET_MAX_ANCHOR_OFFSET
15568 #define TARGET_MAX_ANCHOR_OFFSET 4095
15570 #undef TARGET_VECTOR_ALIGNMENT
15571 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
15573 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
15574 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
15575 aarch64_simd_vector_alignment_reachable
15577 /* vec_perm support. */
15579 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
15580 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
15581 aarch64_vectorize_vec_perm_const_ok
15583 #undef TARGET_INIT_LIBFUNCS
15584 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
15586 #undef TARGET_FIXED_CONDITION_CODE_REGS
15587 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
15589 #undef TARGET_FLAGS_REGNUM
15590 #define TARGET_FLAGS_REGNUM CC_REGNUM
15592 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15593 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15595 #undef TARGET_ASAN_SHADOW_OFFSET
15596 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15598 #undef TARGET_LEGITIMIZE_ADDRESS
15599 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15601 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
15602 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
15603 aarch64_use_by_pieces_infrastructure_p
15605 #undef TARGET_SCHED_CAN_SPECULATE_INSN
15606 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
15608 #undef TARGET_CAN_USE_DOLOOP_P
15609 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15611 #undef TARGET_SCHED_ADJUST_PRIORITY
15612 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15614 #undef TARGET_SCHED_MACRO_FUSION_P
15615 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15617 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15618 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15620 #undef TARGET_SCHED_FUSION_PRIORITY
15621 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15623 #undef TARGET_UNSPEC_MAY_TRAP_P
15624 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15626 #undef TARGET_USE_PSEUDO_PIC_REG
15627 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15629 #undef TARGET_PRINT_OPERAND
15630 #define TARGET_PRINT_OPERAND aarch64_print_operand
15632 #undef TARGET_PRINT_OPERAND_ADDRESS
15633 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15635 #undef TARGET_OPTAB_SUPPORTED_P
15636 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15638 #undef TARGET_OMIT_STRUCT_RETURN_REG
15639 #define TARGET_OMIT_STRUCT_RETURN_REG true
15641 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
15642 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15643 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15646 #undef TARGET_RUN_TARGET_SELFTESTS
15647 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15648 #endif /* #if CHECKING_P */
15650 struct gcc_target targetm
= TARGET_INITIALIZER
;
15652 #include "gt-aarch64.h"