1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2017 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_STRING
24 #include "coretypes.h"
35 #include "stringpool.h"
41 #include "diagnostic.h"
42 #include "insn-attr.h"
44 #include "fold-const.h"
45 #include "stor-layout.h"
53 #include "langhooks.h"
58 #include "gimple-iterator.h"
59 #include "tree-vectorizer.h"
60 #include "aarch64-cost-tables.h"
64 #include "tm-constrs.h"
65 #include "sched-int.h"
66 #include "target-globals.h"
67 #include "common/common-target.h"
69 #include "selftest-rtl.h"
71 /* This file should be included last. */
72 #include "target-def.h"
/* Defined for convenience.  */
#define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
77 /* Classifies an address.
80 A simple base register plus immediate offset.
83 A base register indexed by immediate offset with writeback.
86 A base register indexed by (optionally scaled) register.
89 A base register indexed by (optionally scaled) zero-extended register.
92 A base register indexed by (optionally scaled) sign-extended register.
95 A LO_SUM rtx with a base register and "LO12" symbol relocation.
98 A constant symbolic address, in pc-relative literal pool. */
100 enum aarch64_address_type
{
110 struct aarch64_address_info
{
111 enum aarch64_address_type type
;
115 enum aarch64_symbol_type symbol_type
;
118 struct simd_immediate_info
127 /* The current code model. */
128 enum aarch64_code_model aarch64_cmodel
;
131 #undef TARGET_HAVE_TLS
132 #define TARGET_HAVE_TLS 1
135 static bool aarch64_composite_type_p (const_tree
, machine_mode
);
136 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode
,
138 machine_mode
*, int *,
140 static void aarch64_elf_asm_constructor (rtx
, int) ATTRIBUTE_UNUSED
;
141 static void aarch64_elf_asm_destructor (rtx
, int) ATTRIBUTE_UNUSED
;
142 static void aarch64_override_options_after_change (void);
143 static bool aarch64_vector_mode_supported_p (machine_mode
);
144 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode
,
146 static int aarch64_address_cost (rtx
, machine_mode
, addr_space_t
, bool);
147 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode
,
152 aarch64_simd_container_mode (scalar_mode mode
, unsigned width
);
154 /* Major revision number of the ARM Architecture implemented by the target. */
155 unsigned aarch64_architecture_version
;
157 /* The processor for which instructions should be scheduled. */
158 enum aarch64_processor aarch64_tune
= cortexa53
;
160 /* Mask to specify which instruction scheduling options should be used. */
161 unsigned long aarch64_tune_flags
= 0;
163 /* Global flag for PC relative loads. */
164 bool aarch64_pcrelative_literal_loads
;
166 /* Support for command line parsing of boolean flags in the tuning
168 struct aarch64_flag_desc
174 #define AARCH64_FUSION_PAIR(name, internal_name) \
175 { name, AARCH64_FUSE_##internal_name },
176 static const struct aarch64_flag_desc aarch64_fusible_pairs
[] =
178 { "none", AARCH64_FUSE_NOTHING
},
179 #include "aarch64-fusion-pairs.def"
180 { "all", AARCH64_FUSE_ALL
},
181 { NULL
, AARCH64_FUSE_NOTHING
}
184 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
185 { name, AARCH64_EXTRA_TUNE_##internal_name },
186 static const struct aarch64_flag_desc aarch64_tuning_flags
[] =
188 { "none", AARCH64_EXTRA_TUNE_NONE
},
189 #include "aarch64-tuning-flags.def"
190 { "all", AARCH64_EXTRA_TUNE_ALL
},
191 { NULL
, AARCH64_EXTRA_TUNE_NONE
}
194 /* Tuning parameters. */
196 static const struct cpu_addrcost_table generic_addrcost_table
=
206 0, /* register_offset */
207 0, /* register_sextend */
208 0, /* register_zextend */
212 static const struct cpu_addrcost_table exynosm1_addrcost_table
=
222 1, /* register_offset */
223 1, /* register_sextend */
224 2, /* register_zextend */
228 static const struct cpu_addrcost_table xgene1_addrcost_table
=
238 0, /* register_offset */
239 1, /* register_sextend */
240 1, /* register_zextend */
244 static const struct cpu_addrcost_table thunderx2t99_addrcost_table
=
254 2, /* register_offset */
255 3, /* register_sextend */
256 3, /* register_zextend */
260 static const struct cpu_regmove_cost generic_regmove_cost
=
263 /* Avoid the use of slow int<->fp moves for spilling by setting
264 their cost higher than memmov_cost. */
270 static const struct cpu_regmove_cost cortexa57_regmove_cost
=
273 /* Avoid the use of slow int<->fp moves for spilling by setting
274 their cost higher than memmov_cost. */
280 static const struct cpu_regmove_cost cortexa53_regmove_cost
=
283 /* Avoid the use of slow int<->fp moves for spilling by setting
284 their cost higher than memmov_cost. */
290 static const struct cpu_regmove_cost exynosm1_regmove_cost
=
293 /* Avoid the use of slow int<->fp moves for spilling by setting
294 their cost higher than memmov_cost (actual, 4 and 9). */
300 static const struct cpu_regmove_cost thunderx_regmove_cost
=
308 static const struct cpu_regmove_cost xgene1_regmove_cost
=
311 /* Avoid the use of slow int<->fp moves for spilling by setting
312 their cost higher than memmov_cost. */
318 static const struct cpu_regmove_cost qdf24xx_regmove_cost
=
321 /* Avoid the use of int<->fp moves for spilling. */
327 static const struct cpu_regmove_cost thunderx2t99_regmove_cost
=
330 /* Avoid the use of int<->fp moves for spilling. */
336 /* Generic costs for vector insn classes. */
337 static const struct cpu_vector_cost generic_vector_cost
=
339 1, /* scalar_int_stmt_cost */
340 1, /* scalar_fp_stmt_cost */
341 1, /* scalar_load_cost */
342 1, /* scalar_store_cost */
343 1, /* vec_int_stmt_cost */
344 1, /* vec_fp_stmt_cost */
345 2, /* vec_permute_cost */
346 1, /* vec_to_scalar_cost */
347 1, /* scalar_to_vec_cost */
348 1, /* vec_align_load_cost */
349 1, /* vec_unalign_load_cost */
350 1, /* vec_unalign_store_cost */
351 1, /* vec_store_cost */
352 3, /* cond_taken_branch_cost */
353 1 /* cond_not_taken_branch_cost */
356 /* ThunderX costs for vector insn classes. */
357 static const struct cpu_vector_cost thunderx_vector_cost
=
359 1, /* scalar_int_stmt_cost */
360 1, /* scalar_fp_stmt_cost */
361 3, /* scalar_load_cost */
362 1, /* scalar_store_cost */
363 4, /* vec_int_stmt_cost */
364 1, /* vec_fp_stmt_cost */
365 4, /* vec_permute_cost */
366 2, /* vec_to_scalar_cost */
367 2, /* scalar_to_vec_cost */
368 3, /* vec_align_load_cost */
369 5, /* vec_unalign_load_cost */
370 5, /* vec_unalign_store_cost */
371 1, /* vec_store_cost */
372 3, /* cond_taken_branch_cost */
373 3 /* cond_not_taken_branch_cost */
376 /* Generic costs for vector insn classes. */
377 static const struct cpu_vector_cost cortexa57_vector_cost
=
379 1, /* scalar_int_stmt_cost */
380 1, /* scalar_fp_stmt_cost */
381 4, /* scalar_load_cost */
382 1, /* scalar_store_cost */
383 2, /* vec_int_stmt_cost */
384 2, /* vec_fp_stmt_cost */
385 3, /* vec_permute_cost */
386 8, /* vec_to_scalar_cost */
387 8, /* scalar_to_vec_cost */
388 4, /* vec_align_load_cost */
389 4, /* vec_unalign_load_cost */
390 1, /* vec_unalign_store_cost */
391 1, /* vec_store_cost */
392 1, /* cond_taken_branch_cost */
393 1 /* cond_not_taken_branch_cost */
396 static const struct cpu_vector_cost exynosm1_vector_cost
=
398 1, /* scalar_int_stmt_cost */
399 1, /* scalar_fp_stmt_cost */
400 5, /* scalar_load_cost */
401 1, /* scalar_store_cost */
402 3, /* vec_int_stmt_cost */
403 3, /* vec_fp_stmt_cost */
404 3, /* vec_permute_cost */
405 3, /* vec_to_scalar_cost */
406 3, /* scalar_to_vec_cost */
407 5, /* vec_align_load_cost */
408 5, /* vec_unalign_load_cost */
409 1, /* vec_unalign_store_cost */
410 1, /* vec_store_cost */
411 1, /* cond_taken_branch_cost */
412 1 /* cond_not_taken_branch_cost */
415 /* Generic costs for vector insn classes. */
416 static const struct cpu_vector_cost xgene1_vector_cost
=
418 1, /* scalar_int_stmt_cost */
419 1, /* scalar_fp_stmt_cost */
420 5, /* scalar_load_cost */
421 1, /* scalar_store_cost */
422 2, /* vec_int_stmt_cost */
423 2, /* vec_fp_stmt_cost */
424 2, /* vec_permute_cost */
425 4, /* vec_to_scalar_cost */
426 4, /* scalar_to_vec_cost */
427 10, /* vec_align_load_cost */
428 10, /* vec_unalign_load_cost */
429 2, /* vec_unalign_store_cost */
430 2, /* vec_store_cost */
431 2, /* cond_taken_branch_cost */
432 1 /* cond_not_taken_branch_cost */
435 /* Costs for vector insn classes for Vulcan. */
436 static const struct cpu_vector_cost thunderx2t99_vector_cost
=
438 1, /* scalar_int_stmt_cost */
439 6, /* scalar_fp_stmt_cost */
440 4, /* scalar_load_cost */
441 1, /* scalar_store_cost */
442 5, /* vec_int_stmt_cost */
443 6, /* vec_fp_stmt_cost */
444 3, /* vec_permute_cost */
445 6, /* vec_to_scalar_cost */
446 5, /* scalar_to_vec_cost */
447 8, /* vec_align_load_cost */
448 8, /* vec_unalign_load_cost */
449 4, /* vec_unalign_store_cost */
450 4, /* vec_store_cost */
451 2, /* cond_taken_branch_cost */
452 1 /* cond_not_taken_branch_cost */
455 /* Generic costs for branch instructions. */
456 static const struct cpu_branch_cost generic_branch_cost
=
458 1, /* Predictable. */
459 3 /* Unpredictable. */
462 /* Generic approximation modes. */
463 static const cpu_approx_modes generic_approx_modes
=
465 AARCH64_APPROX_NONE
, /* division */
466 AARCH64_APPROX_NONE
, /* sqrt */
467 AARCH64_APPROX_NONE
/* recip_sqrt */
470 /* Approximation modes for Exynos M1. */
471 static const cpu_approx_modes exynosm1_approx_modes
=
473 AARCH64_APPROX_NONE
, /* division */
474 AARCH64_APPROX_ALL
, /* sqrt */
475 AARCH64_APPROX_ALL
/* recip_sqrt */
478 /* Approximation modes for X-Gene 1. */
479 static const cpu_approx_modes xgene1_approx_modes
=
481 AARCH64_APPROX_NONE
, /* division */
482 AARCH64_APPROX_NONE
, /* sqrt */
483 AARCH64_APPROX_ALL
/* recip_sqrt */
486 /* Generic prefetch settings (which disable prefetch). */
487 static const cpu_prefetch_tune generic_prefetch_tune
=
490 -1, /* l1_cache_size */
491 -1, /* l1_cache_line_size */
492 -1, /* l2_cache_size */
493 -1 /* default_opt_level */
496 static const cpu_prefetch_tune exynosm1_prefetch_tune
=
499 -1, /* l1_cache_size */
500 64, /* l1_cache_line_size */
501 -1, /* l2_cache_size */
502 -1 /* default_opt_level */
505 static const cpu_prefetch_tune qdf24xx_prefetch_tune
=
508 32, /* l1_cache_size */
509 64, /* l1_cache_line_size */
510 1024, /* l2_cache_size */
511 3 /* default_opt_level */
514 static const cpu_prefetch_tune thunderxt88_prefetch_tune
=
517 32, /* l1_cache_size */
518 128, /* l1_cache_line_size */
519 16*1024, /* l2_cache_size */
520 3 /* default_opt_level */
523 static const cpu_prefetch_tune thunderx_prefetch_tune
=
526 32, /* l1_cache_size */
527 128, /* l1_cache_line_size */
528 -1, /* l2_cache_size */
529 -1 /* default_opt_level */
532 static const cpu_prefetch_tune thunderx2t99_prefetch_tune
=
535 32, /* l1_cache_size */
536 64, /* l1_cache_line_size */
537 256, /* l2_cache_size */
538 -1 /* default_opt_level */
541 static const struct tune_params generic_tunings
=
543 &cortexa57_extra_costs
,
544 &generic_addrcost_table
,
545 &generic_regmove_cost
,
546 &generic_vector_cost
,
547 &generic_branch_cost
,
548 &generic_approx_modes
,
551 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
552 8, /* function_align. */
555 2, /* int_reassoc_width. */
556 4, /* fp_reassoc_width. */
557 1, /* vec_reassoc_width. */
558 2, /* min_div_recip_mul_sf. */
559 2, /* min_div_recip_mul_df. */
560 0, /* max_case_values. */
561 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
562 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
563 &generic_prefetch_tune
566 static const struct tune_params cortexa35_tunings
=
568 &cortexa53_extra_costs
,
569 &generic_addrcost_table
,
570 &cortexa53_regmove_cost
,
571 &generic_vector_cost
,
572 &generic_branch_cost
,
573 &generic_approx_modes
,
576 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
577 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
578 16, /* function_align. */
581 2, /* int_reassoc_width. */
582 4, /* fp_reassoc_width. */
583 1, /* vec_reassoc_width. */
584 2, /* min_div_recip_mul_sf. */
585 2, /* min_div_recip_mul_df. */
586 0, /* max_case_values. */
587 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
588 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
589 &generic_prefetch_tune
592 static const struct tune_params cortexa53_tunings
=
594 &cortexa53_extra_costs
,
595 &generic_addrcost_table
,
596 &cortexa53_regmove_cost
,
597 &generic_vector_cost
,
598 &generic_branch_cost
,
599 &generic_approx_modes
,
602 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
603 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
604 16, /* function_align. */
607 2, /* int_reassoc_width. */
608 4, /* fp_reassoc_width. */
609 1, /* vec_reassoc_width. */
610 2, /* min_div_recip_mul_sf. */
611 2, /* min_div_recip_mul_df. */
612 0, /* max_case_values. */
613 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
614 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
615 &generic_prefetch_tune
618 static const struct tune_params cortexa57_tunings
=
620 &cortexa57_extra_costs
,
621 &generic_addrcost_table
,
622 &cortexa57_regmove_cost
,
623 &cortexa57_vector_cost
,
624 &generic_branch_cost
,
625 &generic_approx_modes
,
628 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
629 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
630 16, /* function_align. */
633 2, /* int_reassoc_width. */
634 4, /* fp_reassoc_width. */
635 1, /* vec_reassoc_width. */
636 2, /* min_div_recip_mul_sf. */
637 2, /* min_div_recip_mul_df. */
638 0, /* max_case_values. */
639 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
640 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
), /* tune_flags. */
641 &generic_prefetch_tune
644 static const struct tune_params cortexa72_tunings
=
646 &cortexa57_extra_costs
,
647 &generic_addrcost_table
,
648 &cortexa57_regmove_cost
,
649 &cortexa57_vector_cost
,
650 &generic_branch_cost
,
651 &generic_approx_modes
,
654 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
655 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
656 16, /* function_align. */
659 2, /* int_reassoc_width. */
660 4, /* fp_reassoc_width. */
661 1, /* vec_reassoc_width. */
662 2, /* min_div_recip_mul_sf. */
663 2, /* min_div_recip_mul_df. */
664 0, /* max_case_values. */
665 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
666 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
667 &generic_prefetch_tune
670 static const struct tune_params cortexa73_tunings
=
672 &cortexa57_extra_costs
,
673 &generic_addrcost_table
,
674 &cortexa57_regmove_cost
,
675 &cortexa57_vector_cost
,
676 &generic_branch_cost
,
677 &generic_approx_modes
,
678 4, /* memmov_cost. */
680 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
681 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
682 16, /* function_align. */
685 2, /* int_reassoc_width. */
686 4, /* fp_reassoc_width. */
687 1, /* vec_reassoc_width. */
688 2, /* min_div_recip_mul_sf. */
689 2, /* min_div_recip_mul_df. */
690 0, /* max_case_values. */
691 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
692 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
693 &generic_prefetch_tune
698 static const struct tune_params exynosm1_tunings
=
700 &exynosm1_extra_costs
,
701 &exynosm1_addrcost_table
,
702 &exynosm1_regmove_cost
,
703 &exynosm1_vector_cost
,
704 &generic_branch_cost
,
705 &exynosm1_approx_modes
,
708 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
709 4, /* function_align. */
712 2, /* int_reassoc_width. */
713 4, /* fp_reassoc_width. */
714 1, /* vec_reassoc_width. */
715 2, /* min_div_recip_mul_sf. */
716 2, /* min_div_recip_mul_df. */
717 48, /* max_case_values. */
718 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
719 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
720 &exynosm1_prefetch_tune
723 static const struct tune_params thunderxt88_tunings
=
725 &thunderx_extra_costs
,
726 &generic_addrcost_table
,
727 &thunderx_regmove_cost
,
728 &thunderx_vector_cost
,
729 &generic_branch_cost
,
730 &generic_approx_modes
,
733 AARCH64_FUSE_CMP_BRANCH
, /* fusible_ops */
734 8, /* function_align. */
737 2, /* int_reassoc_width. */
738 4, /* fp_reassoc_width. */
739 1, /* vec_reassoc_width. */
740 2, /* min_div_recip_mul_sf. */
741 2, /* min_div_recip_mul_df. */
742 0, /* max_case_values. */
743 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
744 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
), /* tune_flags. */
745 &thunderxt88_prefetch_tune
748 static const struct tune_params thunderx_tunings
=
750 &thunderx_extra_costs
,
751 &generic_addrcost_table
,
752 &thunderx_regmove_cost
,
753 &thunderx_vector_cost
,
754 &generic_branch_cost
,
755 &generic_approx_modes
,
758 AARCH64_FUSE_CMP_BRANCH
, /* fusible_ops */
759 8, /* function_align. */
762 2, /* int_reassoc_width. */
763 4, /* fp_reassoc_width. */
764 1, /* vec_reassoc_width. */
765 2, /* min_div_recip_mul_sf. */
766 2, /* min_div_recip_mul_df. */
767 0, /* max_case_values. */
768 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
769 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
770 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
), /* tune_flags. */
771 &thunderx_prefetch_tune
774 static const struct tune_params xgene1_tunings
=
777 &xgene1_addrcost_table
,
778 &xgene1_regmove_cost
,
780 &generic_branch_cost
,
781 &xgene1_approx_modes
,
784 AARCH64_FUSE_NOTHING
, /* fusible_ops */
785 16, /* function_align. */
787 16, /* loop_align. */
788 2, /* int_reassoc_width. */
789 4, /* fp_reassoc_width. */
790 1, /* vec_reassoc_width. */
791 2, /* min_div_recip_mul_sf. */
792 2, /* min_div_recip_mul_df. */
793 0, /* max_case_values. */
794 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
795 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
796 &generic_prefetch_tune
799 static const struct tune_params qdf24xx_tunings
=
801 &qdf24xx_extra_costs
,
802 &generic_addrcost_table
,
803 &qdf24xx_regmove_cost
,
804 &generic_vector_cost
,
805 &generic_branch_cost
,
806 &generic_approx_modes
,
809 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
810 | AARCH64_FUSE_MOVK_MOVK
), /* fuseable_ops */
811 16, /* function_align. */
813 16, /* loop_align. */
814 2, /* int_reassoc_width. */
815 4, /* fp_reassoc_width. */
816 1, /* vec_reassoc_width. */
817 2, /* min_div_recip_mul_sf. */
818 2, /* min_div_recip_mul_df. */
819 0, /* max_case_values. */
820 tune_params::AUTOPREFETCHER_STRONG
, /* autoprefetcher_model. */
821 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
822 &qdf24xx_prefetch_tune
825 static const struct tune_params thunderx2t99_tunings
=
827 &thunderx2t99_extra_costs
,
828 &thunderx2t99_addrcost_table
,
829 &thunderx2t99_regmove_cost
,
830 &thunderx2t99_vector_cost
,
831 &generic_branch_cost
,
832 &generic_approx_modes
,
833 4, /* memmov_cost. */
835 (AARCH64_FUSE_CMP_BRANCH
| AARCH64_FUSE_AES_AESMC
836 | AARCH64_FUSE_ALU_BRANCH
), /* fusible_ops */
837 16, /* function_align. */
839 16, /* loop_align. */
840 3, /* int_reassoc_width. */
841 2, /* fp_reassoc_width. */
842 2, /* vec_reassoc_width. */
843 2, /* min_div_recip_mul_sf. */
844 2, /* min_div_recip_mul_df. */
845 0, /* max_case_values. */
846 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
847 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
848 &thunderx2t99_prefetch_tune
851 /* Support for fine-grained override of the tuning structures. */
852 struct aarch64_tuning_override_function
855 void (*parse_override
)(const char*, struct tune_params
*);
858 static void aarch64_parse_fuse_string (const char*, struct tune_params
*);
859 static void aarch64_parse_tune_string (const char*, struct tune_params
*);
861 static const struct aarch64_tuning_override_function
862 aarch64_tuning_override_functions
[] =
864 { "fuse", aarch64_parse_fuse_string
},
865 { "tune", aarch64_parse_tune_string
},
869 /* A processor implementing AArch64. */
872 const char *const name
;
873 enum aarch64_processor ident
;
874 enum aarch64_processor sched_core
;
875 enum aarch64_arch arch
;
876 unsigned architecture_version
;
877 const unsigned long flags
;
878 const struct tune_params
*const tune
;
881 /* Architectures implementing AArch64. */
882 static const struct processor all_architectures
[] =
884 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
885 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
886 #include "aarch64-arches.def"
887 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
890 /* Processor cores implementing AArch64. */
891 static const struct processor all_cores
[] =
893 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
894 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
895 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
896 FLAGS, &COSTS##_tunings},
897 #include "aarch64-cores.def"
898 {"generic", generic
, cortexa53
, AARCH64_ARCH_8A
, 8,
899 AARCH64_FL_FOR_ARCH8
, &generic_tunings
},
900 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
904 /* Target specification. These are populated by the -march, -mtune, -mcpu
905 handling code or by target attributes. */
906 static const struct processor
*selected_arch
;
907 static const struct processor
*selected_cpu
;
908 static const struct processor
*selected_tune
;
910 /* The current tuning set. */
911 struct tune_params aarch64_tune_params
= generic_tunings
;
913 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
/* An ISA extension in the co-processor and main instruction set space.
   FLAGS_ON are the feature bits enabled by "+name"; FLAGS_OFF are the
   bits cleared by "+noname".  */
struct aarch64_option_extension
{
  const char *const name;
  const unsigned long flags_on;
  const unsigned long flags_off;
};
/* Condition codes of the processor.  The order matches the hardware
   encoding, so each code's inverse is (code ^ 1) — see
   AARCH64_INVERSE_CONDITION_CODE.  */
typedef enum aarch64_cond_code
{
  AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
  AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
  AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
}
aarch64_cc;
#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))

/* The condition codes of the processor, and the inverse function.  */
static const char * const aarch64_condition_codes[] =
{
  "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
  "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
};
940 /* Generate code to enable conditional branches in functions over 1 MiB. */
942 aarch64_gen_far_branch (rtx
* operands
, int pos_label
, const char * dest
,
943 const char * branch_format
)
945 rtx_code_label
* tmp_label
= gen_label_rtx ();
948 ASM_GENERATE_INTERNAL_LABEL (label_buf
, dest
,
949 CODE_LABEL_NUMBER (tmp_label
));
950 const char *label_ptr
= targetm
.strip_name_encoding (label_buf
);
951 rtx dest_label
= operands
[pos_label
];
952 operands
[pos_label
] = tmp_label
;
954 snprintf (buffer
, sizeof (buffer
), "%s%s", branch_format
, label_ptr
);
955 output_asm_insn (buffer
, operands
);
957 snprintf (buffer
, sizeof (buffer
), "b\t%%l%d\n%s:", pos_label
, label_ptr
);
958 operands
[pos_label
] = dest_label
;
959 output_asm_insn (buffer
, operands
);
964 aarch64_err_no_fpadvsimd (machine_mode mode
, const char *msg
)
966 const char *mc
= FLOAT_MODE_P (mode
) ? "floating-point" : "vector";
967 if (TARGET_GENERAL_REGS_ONLY
)
968 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc
, msg
);
970 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc
, msg
);
973 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
974 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
975 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
976 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
977 cost (in this case the best class is the lowest cost one). Using ALL_REGS
978 irrespectively of its cost results in bad allocations with many redundant
979 int<->FP moves which are expensive on various cores.
980 To avoid this we don't allow ALL_REGS as the allocno class, but force a
981 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
982 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
983 Otherwise set the allocno class depending on the mode.
984 The result of this is that it is no longer inefficient to have a higher
985 memory move cost than the register move cost.
989 aarch64_ira_change_pseudo_allocno_class (int regno
, reg_class_t allocno_class
,
990 reg_class_t best_class
)
994 if (allocno_class
!= ALL_REGS
)
995 return allocno_class
;
997 if (best_class
!= ALL_REGS
)
1000 mode
= PSEUDO_REGNO_MODE (regno
);
1001 return FLOAT_MODE_P (mode
) || VECTOR_MODE_P (mode
) ? FP_REGS
: GENERAL_REGS
;
1005 aarch64_min_divisions_for_recip_mul (machine_mode mode
)
1007 if (GET_MODE_UNIT_SIZE (mode
) == 4)
1008 return aarch64_tune_params
.min_div_recip_mul_sf
;
1009 return aarch64_tune_params
.min_div_recip_mul_df
;
1013 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED
,
1016 if (VECTOR_MODE_P (mode
))
1017 return aarch64_tune_params
.vec_reassoc_width
;
1018 if (INTEGRAL_MODE_P (mode
))
1019 return aarch64_tune_params
.int_reassoc_width
;
1020 if (FLOAT_MODE_P (mode
))
1021 return aarch64_tune_params
.fp_reassoc_width
;
1025 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1027 aarch64_dbx_register_number (unsigned regno
)
1029 if (GP_REGNUM_P (regno
))
1030 return AARCH64_DWARF_R0
+ regno
- R0_REGNUM
;
1031 else if (regno
== SP_REGNUM
)
1032 return AARCH64_DWARF_SP
;
1033 else if (FP_REGNUM_P (regno
))
1034 return AARCH64_DWARF_V0
+ regno
- V0_REGNUM
;
1036 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1037 equivalent DWARF register. */
1038 return DWARF_FRAME_REGISTERS
;
1041 /* Return TRUE if MODE is any of the large INT modes. */
1043 aarch64_vect_struct_mode_p (machine_mode mode
)
1045 return mode
== OImode
|| mode
== CImode
|| mode
== XImode
;
1048 /* Return TRUE if MODE is any of the vector modes. */
1050 aarch64_vector_mode_p (machine_mode mode
)
1052 return aarch64_vector_mode_supported_p (mode
)
1053 || aarch64_vect_struct_mode_p (mode
);
1056 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1058 aarch64_array_mode_supported_p (machine_mode mode
,
1059 unsigned HOST_WIDE_INT nelems
)
1062 && (AARCH64_VALID_SIMD_QREG_MODE (mode
)
1063 || AARCH64_VALID_SIMD_DREG_MODE (mode
))
1064 && (nelems
>= 2 && nelems
<= 4))
1070 /* Implement TARGET_HARD_REGNO_NREGS. */
1073 aarch64_hard_regno_nregs (unsigned regno
, machine_mode mode
)
1075 switch (aarch64_regno_regclass (regno
))
1079 return (GET_MODE_SIZE (mode
) + UNITS_PER_VREG
- 1) / UNITS_PER_VREG
;
1081 return (GET_MODE_SIZE (mode
) + UNITS_PER_WORD
- 1) / UNITS_PER_WORD
;
1086 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1089 aarch64_hard_regno_mode_ok (unsigned regno
, machine_mode mode
)
1091 if (GET_MODE_CLASS (mode
) == MODE_CC
)
1092 return regno
== CC_REGNUM
;
1094 if (regno
== SP_REGNUM
)
1095 /* The purpose of comparing with ptr_mode is to support the
1096 global register variable associated with the stack pointer
1097 register via the syntax of asm ("wsp") in ILP32. */
1098 return mode
== Pmode
|| mode
== ptr_mode
;
1100 if (regno
== FRAME_POINTER_REGNUM
|| regno
== ARG_POINTER_REGNUM
)
1101 return mode
== Pmode
;
1103 if (GP_REGNUM_P (regno
) && ! aarch64_vect_struct_mode_p (mode
))
1106 if (FP_REGNUM_P (regno
))
1108 if (aarch64_vect_struct_mode_p (mode
))
1109 return end_hard_regno (mode
, regno
) - 1 <= V31_REGNUM
;
1117 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1118 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1119 clobbers the top 64 bits when restoring the bottom 64 bits. */
1122 aarch64_hard_regno_call_part_clobbered (unsigned int regno
, machine_mode mode
)
1124 return FP_REGNUM_P (regno
) && GET_MODE_SIZE (mode
) > 8;
1127 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1129 aarch64_hard_regno_caller_save_mode (unsigned regno
, unsigned nregs
,
1132 /* Handle modes that fit within single registers. */
1133 if (nregs
== 1 && GET_MODE_SIZE (mode
) <= 16)
1135 if (GET_MODE_SIZE (mode
) >= 4)
1140 /* Fall back to generic for multi-reg and very large modes. */
1142 return choose_hard_reg_mode (regno
, nregs
, false);
1145 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1146 that strcpy from constants will be faster. */
1148 static HOST_WIDE_INT
1149 aarch64_constant_alignment (const_tree exp
, HOST_WIDE_INT align
)
1151 if (TREE_CODE (exp
) == STRING_CST
&& !optimize_size
)
1152 return MAX (align
, BITS_PER_WORD
);
1156 /* Return true if calls to DECL should be treated as
1157 long-calls (ie called via a register). */
1159 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED
)
1164 /* Return true if calls to symbol-ref SYM should be treated as
1165 long-calls (ie called via a register). */
1167 aarch64_is_long_call_p (rtx sym
)
1169 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym
));
1172 /* Return true if calls to symbol-ref SYM should not go through
1176 aarch64_is_noplt_call_p (rtx sym
)
1178 const_tree decl
= SYMBOL_REF_DECL (sym
);
1183 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl
)))
1184 && !targetm
.binds_local_p (decl
))
1190 /* Return true if the offsets to a zero/sign-extract operation
1191 represent an expression that matches an extend operation. The
1192 operands represent the paramters from
1194 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1196 aarch64_is_extend_from_extract (scalar_int_mode mode
, rtx mult_imm
,
1199 HOST_WIDE_INT mult_val
, extract_val
;
1201 if (! CONST_INT_P (mult_imm
) || ! CONST_INT_P (extract_imm
))
1204 mult_val
= INTVAL (mult_imm
);
1205 extract_val
= INTVAL (extract_imm
);
1208 && extract_val
< GET_MODE_BITSIZE (mode
)
1209 && exact_log2 (extract_val
& ~7) > 0
1210 && (extract_val
& 7) <= 4
1211 && mult_val
== (1 << (extract_val
& 7)))
1217 /* Emit an insn that's a simple single-set. Both the operands must be
1218 known to be valid. */
1219 inline static rtx_insn
*
1220 emit_set_insn (rtx x
, rtx y
)
1222 return emit_insn (gen_rtx_SET (x
, y
));
1225 /* X and Y are two things to compare using CODE. Emit the compare insn and
1226 return the rtx for register 0 in the proper mode. */
1228 aarch64_gen_compare_reg (RTX_CODE code
, rtx x
, rtx y
)
1230 machine_mode mode
= SELECT_CC_MODE (code
, x
, y
);
1231 rtx cc_reg
= gen_rtx_REG (mode
, CC_REGNUM
);
1233 emit_set_insn (cc_reg
, gen_rtx_COMPARE (mode
, x
, y
));
1237 /* Build the SYMBOL_REF for __tls_get_addr. */
1239 static GTY(()) rtx tls_get_addr_libfunc
;
1242 aarch64_tls_get_addr (void)
1244 if (!tls_get_addr_libfunc
)
1245 tls_get_addr_libfunc
= init_one_libfunc ("__tls_get_addr");
1246 return tls_get_addr_libfunc
;
1249 /* Return the TLS model to use for ADDR. */
1251 static enum tls_model
1252 tls_symbolic_operand_type (rtx addr
)
1254 enum tls_model tls_kind
= TLS_MODEL_NONE
;
1257 if (GET_CODE (addr
) == CONST
)
1259 split_const (addr
, &sym
, &addend
);
1260 if (GET_CODE (sym
) == SYMBOL_REF
)
1261 tls_kind
= SYMBOL_REF_TLS_MODEL (sym
);
1263 else if (GET_CODE (addr
) == SYMBOL_REF
)
1264 tls_kind
= SYMBOL_REF_TLS_MODEL (addr
);
1269 /* We'll allow lo_sum's in addresses in our legitimate addresses
1270 so that combine would take care of combining addresses where
1271 necessary, but for generation purposes, we'll generate the address
1274 tmp = hi (symbol_ref); adrp x1, foo
1275 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1279 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1280 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1284 Load TLS symbol, depending on TLS mechanism and TLS access model.
1286 Global Dynamic - Traditional TLS:
1287 adrp tmp, :tlsgd:imm
1288 add dest, tmp, #:tlsgd_lo12:imm
1291 Global Dynamic - TLS Descriptors:
1292 adrp dest, :tlsdesc:imm
1293 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1294 add dest, dest, #:tlsdesc_lo12:imm
1301 adrp tmp, :gottprel:imm
1302 ldr dest, [tmp, #:gottprel_lo12:imm]
1307 add t0, tp, #:tprel_hi12:imm, lsl #12
1308 add t0, t0, #:tprel_lo12_nc:imm
1312 aarch64_load_symref_appropriately (rtx dest
, rtx imm
,
1313 enum aarch64_symbol_type type
)
1317 case SYMBOL_SMALL_ABSOLUTE
:
1319 /* In ILP32, the mode of dest can be either SImode or DImode. */
1321 machine_mode mode
= GET_MODE (dest
);
1323 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
1325 if (can_create_pseudo_p ())
1326 tmp_reg
= gen_reg_rtx (mode
);
1328 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
1329 emit_insn (gen_add_losym (dest
, tmp_reg
, imm
));
1333 case SYMBOL_TINY_ABSOLUTE
:
1334 emit_insn (gen_rtx_SET (dest
, imm
));
1337 case SYMBOL_SMALL_GOT_28K
:
1339 machine_mode mode
= GET_MODE (dest
);
1340 rtx gp_rtx
= pic_offset_table_rtx
;
1344 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1345 here before rtl expand. Tree IVOPT will generate rtl pattern to
1346 decide rtx costs, in which case pic_offset_table_rtx is not
1347 initialized. For that case no need to generate the first adrp
1348 instruction as the final cost for global variable access is
1352 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1353 using the page base as GOT base, the first page may be wasted,
1354 in the worst scenario, there is only 28K space for GOT).
1356 The generate instruction sequence for accessing global variable
1359 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1361 Only one instruction needed. But we must initialize
1362 pic_offset_table_rtx properly. We generate initialize insn for
1363 every global access, and allow CSE to remove all redundant.
1365 The final instruction sequences will look like the following
1366 for multiply global variables access.
1368 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1370 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1371 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1372 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1375 rtx s
= gen_rtx_SYMBOL_REF (Pmode
, "_GLOBAL_OFFSET_TABLE_");
1376 crtl
->uses_pic_offset_table
= 1;
1377 emit_move_insn (gp_rtx
, gen_rtx_HIGH (Pmode
, s
));
1379 if (mode
!= GET_MODE (gp_rtx
))
1380 gp_rtx
= gen_lowpart (mode
, gp_rtx
);
1384 if (mode
== ptr_mode
)
1387 insn
= gen_ldr_got_small_28k_di (dest
, gp_rtx
, imm
);
1389 insn
= gen_ldr_got_small_28k_si (dest
, gp_rtx
, imm
);
1391 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
1395 gcc_assert (mode
== Pmode
);
1397 insn
= gen_ldr_got_small_28k_sidi (dest
, gp_rtx
, imm
);
1398 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
1401 /* The operand is expected to be MEM. Whenever the related insn
1402 pattern changed, above code which calculate mem should be
1404 gcc_assert (GET_CODE (mem
) == MEM
);
1405 MEM_READONLY_P (mem
) = 1;
1406 MEM_NOTRAP_P (mem
) = 1;
1411 case SYMBOL_SMALL_GOT_4G
:
1413 /* In ILP32, the mode of dest can be either SImode or DImode,
1414 while the got entry is always of SImode size. The mode of
1415 dest depends on how dest is used: if dest is assigned to a
1416 pointer (e.g. in the memory), it has SImode; it may have
1417 DImode if dest is dereferenced to access the memeory.
1418 This is why we have to handle three different ldr_got_small
1419 patterns here (two patterns for ILP32). */
1424 machine_mode mode
= GET_MODE (dest
);
1426 if (can_create_pseudo_p ())
1427 tmp_reg
= gen_reg_rtx (mode
);
1429 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
1430 if (mode
== ptr_mode
)
1433 insn
= gen_ldr_got_small_di (dest
, tmp_reg
, imm
);
1435 insn
= gen_ldr_got_small_si (dest
, tmp_reg
, imm
);
1437 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
1441 gcc_assert (mode
== Pmode
);
1443 insn
= gen_ldr_got_small_sidi (dest
, tmp_reg
, imm
);
1444 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
1447 gcc_assert (GET_CODE (mem
) == MEM
);
1448 MEM_READONLY_P (mem
) = 1;
1449 MEM_NOTRAP_P (mem
) = 1;
1454 case SYMBOL_SMALL_TLSGD
:
1457 machine_mode mode
= GET_MODE (dest
);
1458 rtx result
= gen_rtx_REG (mode
, R0_REGNUM
);
1462 aarch64_emit_call_insn (gen_tlsgd_small_si (result
, imm
));
1464 aarch64_emit_call_insn (gen_tlsgd_small_di (result
, imm
));
1465 insns
= get_insns ();
1468 RTL_CONST_CALL_P (insns
) = 1;
1469 emit_libcall_block (insns
, dest
, result
, imm
);
1473 case SYMBOL_SMALL_TLSDESC
:
1475 machine_mode mode
= GET_MODE (dest
);
1476 rtx x0
= gen_rtx_REG (mode
, R0_REGNUM
);
1479 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
1481 /* In ILP32, the got entry is always of SImode size. Unlike
1482 small GOT, the dest is fixed at reg 0. */
1484 emit_insn (gen_tlsdesc_small_si (imm
));
1486 emit_insn (gen_tlsdesc_small_di (imm
));
1487 tp
= aarch64_load_tp (NULL
);
1490 tp
= gen_lowpart (mode
, tp
);
1492 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, x0
)));
1494 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1498 case SYMBOL_SMALL_TLSIE
:
1500 /* In ILP32, the mode of dest can be either SImode or DImode,
1501 while the got entry is always of SImode size. The mode of
1502 dest depends on how dest is used: if dest is assigned to a
1503 pointer (e.g. in the memory), it has SImode; it may have
1504 DImode if dest is dereferenced to access the memeory.
1505 This is why we have to handle three different tlsie_small
1506 patterns here (two patterns for ILP32). */
1507 machine_mode mode
= GET_MODE (dest
);
1508 rtx tmp_reg
= gen_reg_rtx (mode
);
1509 rtx tp
= aarch64_load_tp (NULL
);
1511 if (mode
== ptr_mode
)
1514 emit_insn (gen_tlsie_small_di (tmp_reg
, imm
));
1517 emit_insn (gen_tlsie_small_si (tmp_reg
, imm
));
1518 tp
= gen_lowpart (mode
, tp
);
1523 gcc_assert (mode
== Pmode
);
1524 emit_insn (gen_tlsie_small_sidi (tmp_reg
, imm
));
1527 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, tmp_reg
)));
1529 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1533 case SYMBOL_TLSLE12
:
1534 case SYMBOL_TLSLE24
:
1535 case SYMBOL_TLSLE32
:
1536 case SYMBOL_TLSLE48
:
1538 machine_mode mode
= GET_MODE (dest
);
1539 rtx tp
= aarch64_load_tp (NULL
);
1542 tp
= gen_lowpart (mode
, tp
);
1546 case SYMBOL_TLSLE12
:
1547 emit_insn ((mode
== DImode
? gen_tlsle12_di
: gen_tlsle12_si
)
1550 case SYMBOL_TLSLE24
:
1551 emit_insn ((mode
== DImode
? gen_tlsle24_di
: gen_tlsle24_si
)
1554 case SYMBOL_TLSLE32
:
1555 emit_insn ((mode
== DImode
? gen_tlsle32_di
: gen_tlsle32_si
)
1557 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
1560 case SYMBOL_TLSLE48
:
1561 emit_insn ((mode
== DImode
? gen_tlsle48_di
: gen_tlsle48_si
)
1563 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
1571 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1575 case SYMBOL_TINY_GOT
:
1576 emit_insn (gen_ldr_got_tiny (dest
, imm
));
1579 case SYMBOL_TINY_TLSIE
:
1581 machine_mode mode
= GET_MODE (dest
);
1582 rtx tp
= aarch64_load_tp (NULL
);
1584 if (mode
== ptr_mode
)
1587 emit_insn (gen_tlsie_tiny_di (dest
, imm
, tp
));
1590 tp
= gen_lowpart (mode
, tp
);
1591 emit_insn (gen_tlsie_tiny_si (dest
, imm
, tp
));
1596 gcc_assert (mode
== Pmode
);
1597 emit_insn (gen_tlsie_tiny_sidi (dest
, imm
, tp
));
1601 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1610 /* Emit a move from SRC to DEST. Assume that the move expanders can
1611 handle all moves if !can_create_pseudo_p (). The distinction is
1612 important because, unlike emit_move_insn, the move expanders know
1613 how to force Pmode objects into the constant pool even when the
1614 constant pool address is not itself legitimate. */
1616 aarch64_emit_move (rtx dest
, rtx src
)
1618 return (can_create_pseudo_p ()
1619 ? emit_move_insn (dest
, src
)
1620 : emit_move_insn_1 (dest
, src
));
1623 /* Split a 128-bit move operation into two 64-bit move operations,
1624 taking care to handle partial overlap of register to register
1625 copies. Special cases are needed when moving between GP regs and
1626 FP regs. SRC can be a register, constant or memory; DST a register
1627 or memory. If either operand is memory it must not have any side
1630 aarch64_split_128bit_move (rtx dst
, rtx src
)
1635 machine_mode mode
= GET_MODE (dst
);
1637 gcc_assert (mode
== TImode
|| mode
== TFmode
);
1638 gcc_assert (!(side_effects_p (src
) || side_effects_p (dst
)));
1639 gcc_assert (mode
== GET_MODE (src
) || GET_MODE (src
) == VOIDmode
);
1641 if (REG_P (dst
) && REG_P (src
))
1643 int src_regno
= REGNO (src
);
1644 int dst_regno
= REGNO (dst
);
1646 /* Handle FP <-> GP regs. */
1647 if (FP_REGNUM_P (dst_regno
) && GP_REGNUM_P (src_regno
))
1649 src_lo
= gen_lowpart (word_mode
, src
);
1650 src_hi
= gen_highpart (word_mode
, src
);
1654 emit_insn (gen_aarch64_movtilow_di (dst
, src_lo
));
1655 emit_insn (gen_aarch64_movtihigh_di (dst
, src_hi
));
1659 emit_insn (gen_aarch64_movtflow_di (dst
, src_lo
));
1660 emit_insn (gen_aarch64_movtfhigh_di (dst
, src_hi
));
1664 else if (GP_REGNUM_P (dst_regno
) && FP_REGNUM_P (src_regno
))
1666 dst_lo
= gen_lowpart (word_mode
, dst
);
1667 dst_hi
= gen_highpart (word_mode
, dst
);
1671 emit_insn (gen_aarch64_movdi_tilow (dst_lo
, src
));
1672 emit_insn (gen_aarch64_movdi_tihigh (dst_hi
, src
));
1676 emit_insn (gen_aarch64_movdi_tflow (dst_lo
, src
));
1677 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi
, src
));
1683 dst_lo
= gen_lowpart (word_mode
, dst
);
1684 dst_hi
= gen_highpart (word_mode
, dst
);
1685 src_lo
= gen_lowpart (word_mode
, src
);
1686 src_hi
= gen_highpart_mode (word_mode
, mode
, src
);
1688 /* At most one pairing may overlap. */
1689 if (reg_overlap_mentioned_p (dst_lo
, src_hi
))
1691 aarch64_emit_move (dst_hi
, src_hi
);
1692 aarch64_emit_move (dst_lo
, src_lo
);
1696 aarch64_emit_move (dst_lo
, src_lo
);
1697 aarch64_emit_move (dst_hi
, src_hi
);
1702 aarch64_split_128bit_move_p (rtx dst
, rtx src
)
1704 return (! REG_P (src
)
1705 || ! (FP_REGNUM_P (REGNO (dst
)) && FP_REGNUM_P (REGNO (src
))));
1708 /* Split a complex SIMD combine. */
1711 aarch64_split_simd_combine (rtx dst
, rtx src1
, rtx src2
)
1713 machine_mode src_mode
= GET_MODE (src1
);
1714 machine_mode dst_mode
= GET_MODE (dst
);
1716 gcc_assert (VECTOR_MODE_P (dst_mode
));
1717 gcc_assert (register_operand (dst
, dst_mode
)
1718 && register_operand (src1
, src_mode
)
1719 && register_operand (src2
, src_mode
));
1721 rtx (*gen
) (rtx
, rtx
, rtx
);
1726 gen
= gen_aarch64_simd_combinev8qi
;
1729 gen
= gen_aarch64_simd_combinev4hi
;
1732 gen
= gen_aarch64_simd_combinev2si
;
1735 gen
= gen_aarch64_simd_combinev4hf
;
1738 gen
= gen_aarch64_simd_combinev2sf
;
1741 gen
= gen_aarch64_simd_combinedi
;
1744 gen
= gen_aarch64_simd_combinedf
;
1750 emit_insn (gen (dst
, src1
, src2
));
1754 /* Split a complex SIMD move. */
1757 aarch64_split_simd_move (rtx dst
, rtx src
)
1759 machine_mode src_mode
= GET_MODE (src
);
1760 machine_mode dst_mode
= GET_MODE (dst
);
1762 gcc_assert (VECTOR_MODE_P (dst_mode
));
1764 if (REG_P (dst
) && REG_P (src
))
1766 rtx (*gen
) (rtx
, rtx
);
1768 gcc_assert (VECTOR_MODE_P (src_mode
));
1773 gen
= gen_aarch64_split_simd_movv16qi
;
1776 gen
= gen_aarch64_split_simd_movv8hi
;
1779 gen
= gen_aarch64_split_simd_movv4si
;
1782 gen
= gen_aarch64_split_simd_movv2di
;
1785 gen
= gen_aarch64_split_simd_movv8hf
;
1788 gen
= gen_aarch64_split_simd_movv4sf
;
1791 gen
= gen_aarch64_split_simd_movv2df
;
1797 emit_insn (gen (dst
, src
));
1803 aarch64_zero_extend_const_eq (machine_mode xmode
, rtx x
,
1804 machine_mode ymode
, rtx y
)
1806 rtx r
= simplify_const_unary_operation (ZERO_EXTEND
, xmode
, y
, ymode
);
1807 gcc_assert (r
!= NULL
);
1808 return rtx_equal_p (x
, r
);
1813 aarch64_force_temporary (machine_mode mode
, rtx x
, rtx value
)
1815 if (can_create_pseudo_p ())
1816 return force_reg (mode
, value
);
1819 x
= aarch64_emit_move (x
, value
);
1826 aarch64_add_offset (scalar_int_mode mode
, rtx temp
, rtx reg
,
1827 HOST_WIDE_INT offset
)
1829 if (!aarch64_plus_immediate (GEN_INT (offset
), mode
))
1832 /* Load the full offset into a register. This
1833 might be improvable in the future. */
1834 high
= GEN_INT (offset
);
1836 high
= aarch64_force_temporary (mode
, temp
, high
);
1837 reg
= aarch64_force_temporary (mode
, temp
,
1838 gen_rtx_PLUS (mode
, high
, reg
));
1840 return plus_constant (mode
, reg
, offset
);
1844 aarch64_internal_mov_immediate (rtx dest
, rtx imm
, bool generate
,
1845 scalar_int_mode mode
)
1848 unsigned HOST_WIDE_INT val
, val2
, mask
;
1849 int one_match
, zero_match
;
1854 if (aarch64_move_imm (val
, mode
))
1857 emit_insn (gen_rtx_SET (dest
, imm
));
1861 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
1862 (with XXXX non-zero). In that case check to see if the move can be done in
1864 val2
= val
& 0xffffffff;
1866 && aarch64_move_imm (val2
, SImode
)
1867 && (((val
>> 32) & 0xffff) == 0 || (val
>> 48) == 0))
1870 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
1872 /* Check if we have to emit a second instruction by checking to see
1873 if any of the upper 32 bits of the original DI mode value is set. */
1877 i
= (val
>> 48) ? 48 : 32;
1880 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
1881 GEN_INT ((val
>> i
) & 0xffff)));
1886 if ((val
>> 32) == 0 || mode
== SImode
)
1890 emit_insn (gen_rtx_SET (dest
, GEN_INT (val
& 0xffff)));
1892 emit_insn (gen_insv_immsi (dest
, GEN_INT (16),
1893 GEN_INT ((val
>> 16) & 0xffff)));
1895 emit_insn (gen_insv_immdi (dest
, GEN_INT (16),
1896 GEN_INT ((val
>> 16) & 0xffff)));
1901 /* Remaining cases are all for DImode. */
1904 zero_match
= ((val
& mask
) == 0) + ((val
& (mask
<< 16)) == 0) +
1905 ((val
& (mask
<< 32)) == 0) + ((val
& (mask
<< 48)) == 0);
1906 one_match
= ((~val
& mask
) == 0) + ((~val
& (mask
<< 16)) == 0) +
1907 ((~val
& (mask
<< 32)) == 0) + ((~val
& (mask
<< 48)) == 0);
1909 if (zero_match
!= 2 && one_match
!= 2)
1911 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1912 For a 64-bit bitmask try whether changing 16 bits to all ones or
1913 zeroes creates a valid bitmask. To check any repeated bitmask,
1914 try using 16 bits from the other 32-bit half of val. */
1916 for (i
= 0; i
< 64; i
+= 16, mask
<<= 16)
1919 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
1922 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
1924 val2
= val2
& ~mask
;
1925 val2
= val2
| (((val2
>> 32) | (val2
<< 32)) & mask
);
1926 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
1933 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
1934 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
1935 GEN_INT ((val
>> i
) & 0xffff)));
1941 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1942 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1943 otherwise skip zero bits. */
1947 val2
= one_match
> zero_match
? ~val
: val
;
1948 i
= (val2
& mask
) != 0 ? 0 : (val2
& (mask
<< 16)) != 0 ? 16 : 32;
1951 emit_insn (gen_rtx_SET (dest
, GEN_INT (one_match
> zero_match
1952 ? (val
| ~(mask
<< i
))
1953 : (val
& (mask
<< i
)))));
1954 for (i
+= 16; i
< 64; i
+= 16)
1956 if ((val2
& (mask
<< i
)) == 0)
1959 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
1960 GEN_INT ((val
>> i
) & 0xffff)));
1969 aarch64_expand_mov_immediate (rtx dest
, rtx imm
)
1971 machine_mode mode
= GET_MODE (dest
);
1973 gcc_assert (mode
== SImode
|| mode
== DImode
);
1975 /* Check on what type of symbol it is. */
1976 scalar_int_mode int_mode
;
1977 if ((GET_CODE (imm
) == SYMBOL_REF
1978 || GET_CODE (imm
) == LABEL_REF
1979 || GET_CODE (imm
) == CONST
)
1980 && is_a
<scalar_int_mode
> (mode
, &int_mode
))
1982 rtx mem
, base
, offset
;
1983 enum aarch64_symbol_type sty
;
1985 /* If we have (const (plus symbol offset)), separate out the offset
1986 before we start classifying the symbol. */
1987 split_const (imm
, &base
, &offset
);
1989 sty
= aarch64_classify_symbol (base
, offset
);
1992 case SYMBOL_FORCE_TO_MEM
:
1993 if (offset
!= const0_rtx
1994 && targetm
.cannot_force_const_mem (int_mode
, imm
))
1996 gcc_assert (can_create_pseudo_p ());
1997 base
= aarch64_force_temporary (int_mode
, dest
, base
);
1998 base
= aarch64_add_offset (int_mode
, NULL
, base
,
2000 aarch64_emit_move (dest
, base
);
2004 mem
= force_const_mem (ptr_mode
, imm
);
2007 /* If we aren't generating PC relative literals, then
2008 we need to expand the literal pool access carefully.
2009 This is something that needs to be done in a number
2010 of places, so could well live as a separate function. */
2011 if (!aarch64_pcrelative_literal_loads
)
2013 gcc_assert (can_create_pseudo_p ());
2014 base
= gen_reg_rtx (ptr_mode
);
2015 aarch64_expand_mov_immediate (base
, XEXP (mem
, 0));
2016 if (ptr_mode
!= Pmode
)
2017 base
= convert_memory_address (Pmode
, base
);
2018 mem
= gen_rtx_MEM (ptr_mode
, base
);
2021 if (int_mode
!= ptr_mode
)
2022 mem
= gen_rtx_ZERO_EXTEND (int_mode
, mem
);
2024 emit_insn (gen_rtx_SET (dest
, mem
));
2028 case SYMBOL_SMALL_TLSGD
:
2029 case SYMBOL_SMALL_TLSDESC
:
2030 case SYMBOL_SMALL_TLSIE
:
2031 case SYMBOL_SMALL_GOT_28K
:
2032 case SYMBOL_SMALL_GOT_4G
:
2033 case SYMBOL_TINY_GOT
:
2034 case SYMBOL_TINY_TLSIE
:
2035 if (offset
!= const0_rtx
)
2037 gcc_assert(can_create_pseudo_p ());
2038 base
= aarch64_force_temporary (int_mode
, dest
, base
);
2039 base
= aarch64_add_offset (int_mode
, NULL
, base
,
2041 aarch64_emit_move (dest
, base
);
2046 case SYMBOL_SMALL_ABSOLUTE
:
2047 case SYMBOL_TINY_ABSOLUTE
:
2048 case SYMBOL_TLSLE12
:
2049 case SYMBOL_TLSLE24
:
2050 case SYMBOL_TLSLE32
:
2051 case SYMBOL_TLSLE48
:
2052 aarch64_load_symref_appropriately (dest
, imm
, sty
);
2060 if (!CONST_INT_P (imm
))
2062 if (GET_CODE (imm
) == HIGH
)
2063 emit_insn (gen_rtx_SET (dest
, imm
));
2066 rtx mem
= force_const_mem (mode
, imm
);
2068 emit_insn (gen_rtx_SET (dest
, mem
));
2074 aarch64_internal_mov_immediate (dest
, imm
, true,
2075 as_a
<scalar_int_mode
> (mode
));
2078 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
2079 temporary value if necessary. FRAME_RELATED_P should be true if
2080 the RTX_FRAME_RELATED flag should be set and CFA adjustments added
2081 to the generated instructions. If SCRATCHREG is known to hold
2082 abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
2085 Since this function may be used to adjust the stack pointer, we must
2086 ensure that it cannot cause transient stack deallocation (for example
2087 by first incrementing SP and then decrementing when adjusting by a
2088 large immediate). */
2091 aarch64_add_constant_internal (scalar_int_mode mode
, int regnum
,
2092 int scratchreg
, HOST_WIDE_INT delta
,
2093 bool frame_related_p
, bool emit_move_imm
)
2095 HOST_WIDE_INT mdelta
= abs_hwi (delta
);
2096 rtx this_rtx
= gen_rtx_REG (mode
, regnum
);
2102 /* Single instruction adjustment. */
2103 if (aarch64_uimm12_shift (mdelta
))
2105 insn
= emit_insn (gen_add2_insn (this_rtx
, GEN_INT (delta
)));
2106 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2110 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2111 Only do this if mdelta is not a 16-bit move as adjusting using a move
2113 if (mdelta
< 0x1000000 && !aarch64_move_imm (mdelta
, mode
))
2115 HOST_WIDE_INT low_off
= mdelta
& 0xfff;
2117 low_off
= delta
< 0 ? -low_off
: low_off
;
2118 insn
= emit_insn (gen_add2_insn (this_rtx
, GEN_INT (low_off
)));
2119 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2120 insn
= emit_insn (gen_add2_insn (this_rtx
, GEN_INT (delta
- low_off
)));
2121 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2125 /* Emit a move immediate if required and an addition/subtraction. */
2126 rtx scratch_rtx
= gen_rtx_REG (mode
, scratchreg
);
2128 aarch64_internal_mov_immediate (scratch_rtx
, GEN_INT (mdelta
), true, mode
);
2129 insn
= emit_insn (delta
< 0 ? gen_sub2_insn (this_rtx
, scratch_rtx
)
2130 : gen_add2_insn (this_rtx
, scratch_rtx
));
2131 if (frame_related_p
)
2133 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2134 rtx adj
= plus_constant (mode
, this_rtx
, delta
);
2135 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, gen_rtx_SET (this_rtx
, adj
));
2140 aarch64_add_constant (scalar_int_mode mode
, int regnum
, int scratchreg
,
2141 HOST_WIDE_INT delta
)
2143 aarch64_add_constant_internal (mode
, regnum
, scratchreg
, delta
, false, true);
2147 aarch64_add_sp (int scratchreg
, HOST_WIDE_INT delta
, bool emit_move_imm
)
2149 aarch64_add_constant_internal (Pmode
, SP_REGNUM
, scratchreg
, delta
,
2150 true, emit_move_imm
);
2154 aarch64_sub_sp (int scratchreg
, HOST_WIDE_INT delta
, bool frame_related_p
)
2156 aarch64_add_constant_internal (Pmode
, SP_REGNUM
, scratchreg
, -delta
,
2157 frame_related_p
, true);
2161 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED
,
2162 tree exp ATTRIBUTE_UNUSED
)
2164 /* Currently, always true. */
2168 /* Implement TARGET_PASS_BY_REFERENCE. */
2171 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED
,
2174 bool named ATTRIBUTE_UNUSED
)
2177 machine_mode dummymode
;
2180 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2181 size
= (mode
== BLKmode
&& type
)
2182 ? int_size_in_bytes (type
) : (int) GET_MODE_SIZE (mode
);
2184 /* Aggregates are passed by reference based on their size. */
2185 if (type
&& AGGREGATE_TYPE_P (type
))
2187 size
= int_size_in_bytes (type
);
2190 /* Variable sized arguments are always returned by reference. */
2194 /* Can this be a candidate to be passed in fp/simd register(s)? */
2195 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
2200 /* Arguments which are variable sized or larger than 2 registers are
2201 passed by reference unless they are a homogenous floating point
2203 return size
> 2 * UNITS_PER_WORD
;
2206 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2208 aarch64_return_in_msb (const_tree valtype
)
2210 machine_mode dummy_mode
;
2213 /* Never happens in little-endian mode. */
2214 if (!BYTES_BIG_ENDIAN
)
2217 /* Only composite types smaller than or equal to 16 bytes can
2218 be potentially returned in registers. */
2219 if (!aarch64_composite_type_p (valtype
, TYPE_MODE (valtype
))
2220 || int_size_in_bytes (valtype
) <= 0
2221 || int_size_in_bytes (valtype
) > 16)
2224 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2225 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2226 is always passed/returned in the least significant bits of fp/simd
2228 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype
), valtype
,
2229 &dummy_mode
, &dummy_int
, NULL
))
2235 /* Implement TARGET_FUNCTION_VALUE.
2236 Define how to find the value returned by a function. */
2239 aarch64_function_value (const_tree type
, const_tree func
,
2240 bool outgoing ATTRIBUTE_UNUSED
)
2245 machine_mode ag_mode
;
2247 mode
= TYPE_MODE (type
);
2248 if (INTEGRAL_TYPE_P (type
))
2249 mode
= promote_function_mode (type
, mode
, &unsignedp
, func
, 1);
2251 if (aarch64_return_in_msb (type
))
2253 HOST_WIDE_INT size
= int_size_in_bytes (type
);
2255 if (size
% UNITS_PER_WORD
!= 0)
2257 size
+= UNITS_PER_WORD
- size
% UNITS_PER_WORD
;
2258 mode
= int_mode_for_size (size
* BITS_PER_UNIT
, 0).require ();
2262 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
2263 &ag_mode
, &count
, NULL
))
2265 if (!aarch64_composite_type_p (type
, mode
))
2267 gcc_assert (count
== 1 && mode
== ag_mode
);
2268 return gen_rtx_REG (mode
, V0_REGNUM
);
2275 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (count
));
2276 for (i
= 0; i
< count
; i
++)
2278 rtx tmp
= gen_rtx_REG (ag_mode
, V0_REGNUM
+ i
);
2279 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
2280 GEN_INT (i
* GET_MODE_SIZE (ag_mode
)));
2281 XVECEXP (par
, 0, i
) = tmp
;
2287 return gen_rtx_REG (mode
, R0_REGNUM
);
2290 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2291 Return true if REGNO is the number of a hard register in which the values
2292 of called function may come back. */
2295 aarch64_function_value_regno_p (const unsigned int regno
)
2297 /* Maximum of 16 bytes can be returned in the general registers. Examples
2298 of 16-byte return values are: 128-bit integers and 16-byte small
2299 structures (excluding homogeneous floating-point aggregates). */
2300 if (regno
== R0_REGNUM
|| regno
== R1_REGNUM
)
2303 /* Up to four fp/simd registers can return a function value, e.g. a
2304 homogeneous floating-point aggregate having four members. */
2305 if (regno
>= V0_REGNUM
&& regno
< V0_REGNUM
+ HA_MAX_NUM_FLDS
)
2306 return TARGET_FLOAT
;
2311 /* Implement TARGET_RETURN_IN_MEMORY.
2313 If the type T of the result of a function is such that
2315 would require that arg be passed as a value in a register (or set of
2316 registers) according to the parameter passing rules, then the result
2317 is returned in the same registers as would be used for such an
2321 aarch64_return_in_memory (const_tree type
, const_tree fndecl ATTRIBUTE_UNUSED
)
2324 machine_mode ag_mode
;
2327 if (!AGGREGATE_TYPE_P (type
)
2328 && TREE_CODE (type
) != COMPLEX_TYPE
2329 && TREE_CODE (type
) != VECTOR_TYPE
)
2330 /* Simple scalar types always returned in registers. */
2333 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
),
2340 /* Types larger than 2 registers returned in memory. */
2341 size
= int_size_in_bytes (type
);
2342 return (size
< 0 || size
> 2 * UNITS_PER_WORD
);
2346 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v
, machine_mode mode
,
2347 const_tree type
, int *nregs
)
2349 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
2350 return aarch64_vfp_is_call_or_return_candidate (mode
,
2352 &pcum
->aapcs_vfp_rmode
,
2357 /* Given MODE and TYPE of a function argument, return the alignment in
2358 bits. The idea is to suppress any stronger alignment requested by
2359 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2360 This is a helper function for local use only. */
2363 aarch64_function_arg_alignment (machine_mode mode
, const_tree type
)
2366 return GET_MODE_ALIGNMENT (mode
);
2368 if (integer_zerop (TYPE_SIZE (type
)))
2371 gcc_assert (TYPE_MODE (type
) == mode
);
2373 if (!AGGREGATE_TYPE_P (type
))
2374 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type
));
2376 if (TREE_CODE (type
) == ARRAY_TYPE
)
2377 return TYPE_ALIGN (TREE_TYPE (type
));
2379 unsigned int alignment
= 0;
2380 for (tree field
= TYPE_FIELDS (type
); field
; field
= DECL_CHAIN (field
))
2381 if (TREE_CODE (field
) == FIELD_DECL
)
2382 alignment
= std::max (alignment
, DECL_ALIGN (field
));
2387 /* Layout a function argument according to the AAPCS64 rules. The rule
2388 numbers refer to the rule numbers in the AAPCS64. */
2391 aarch64_layout_arg (cumulative_args_t pcum_v
, machine_mode mode
,
2393 bool named ATTRIBUTE_UNUSED
)
2395 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
2396 int ncrn
, nvrn
, nregs
;
2397 bool allocate_ncrn
, allocate_nvrn
;
2400 /* We need to do this once per argument. */
2401 if (pcum
->aapcs_arg_processed
)
2404 pcum
->aapcs_arg_processed
= true;
2406 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
2408 = ROUND_UP (type
? int_size_in_bytes (type
) : GET_MODE_SIZE (mode
),
2411 allocate_ncrn
= (type
) ? !(FLOAT_TYPE_P (type
)) : !FLOAT_MODE_P (mode
);
2412 allocate_nvrn
= aarch64_vfp_is_call_candidate (pcum_v
,
2417 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
2418 The following code thus handles passing by SIMD/FP registers first. */
2420 nvrn
= pcum
->aapcs_nvrn
;
2422 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
2423 and homogenous short-vector aggregates (HVA). */
2427 aarch64_err_no_fpadvsimd (mode
, "argument");
2429 if (nvrn
+ nregs
<= NUM_FP_ARG_REGS
)
2431 pcum
->aapcs_nextnvrn
= nvrn
+ nregs
;
2432 if (!aarch64_composite_type_p (type
, mode
))
2434 gcc_assert (nregs
== 1);
2435 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
2441 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
2442 for (i
= 0; i
< nregs
; i
++)
2444 rtx tmp
= gen_rtx_REG (pcum
->aapcs_vfp_rmode
,
2445 V0_REGNUM
+ nvrn
+ i
);
2446 tmp
= gen_rtx_EXPR_LIST
2448 GEN_INT (i
* GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
)));
2449 XVECEXP (par
, 0, i
) = tmp
;
2451 pcum
->aapcs_reg
= par
;
2457 /* C.3 NSRN is set to 8. */
2458 pcum
->aapcs_nextnvrn
= NUM_FP_ARG_REGS
;
2463 ncrn
= pcum
->aapcs_ncrn
;
2464 nregs
= size
/ UNITS_PER_WORD
;
2466 /* C6 - C9. though the sign and zero extension semantics are
2467 handled elsewhere. This is the case where the argument fits
2468 entirely general registers. */
2469 if (allocate_ncrn
&& (ncrn
+ nregs
<= NUM_ARG_REGS
))
2472 gcc_assert (nregs
== 0 || nregs
== 1 || nregs
== 2);
2474 /* C.8 if the argument has an alignment of 16 then the NGRN is
2475 rounded up to the next even number. */
2478 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
2479 comparison is there because for > 16 * BITS_PER_UNIT
2480 alignment nregs should be > 2 and therefore it should be
2481 passed by reference rather than value. */
2482 && aarch64_function_arg_alignment (mode
, type
) == 16 * BITS_PER_UNIT
)
2485 gcc_assert (ncrn
+ nregs
<= NUM_ARG_REGS
);
2488 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2489 A reg is still generated for it, but the caller should be smart
2490 enough not to use it. */
2491 if (nregs
== 0 || nregs
== 1 || GET_MODE_CLASS (mode
) == MODE_INT
)
2492 pcum
->aapcs_reg
= gen_rtx_REG (mode
, R0_REGNUM
+ ncrn
);
2498 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
2499 for (i
= 0; i
< nregs
; i
++)
2501 rtx tmp
= gen_rtx_REG (word_mode
, R0_REGNUM
+ ncrn
+ i
);
2502 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
2503 GEN_INT (i
* UNITS_PER_WORD
));
2504 XVECEXP (par
, 0, i
) = tmp
;
2506 pcum
->aapcs_reg
= par
;
2509 pcum
->aapcs_nextncrn
= ncrn
+ nregs
;
2514 pcum
->aapcs_nextncrn
= NUM_ARG_REGS
;
2516 /* The argument is passed on stack; record the needed number of words for
2517 this argument and align the total size if necessary. */
2519 pcum
->aapcs_stack_words
= size
/ UNITS_PER_WORD
;
2521 if (aarch64_function_arg_alignment (mode
, type
) == 16 * BITS_PER_UNIT
)
2522 pcum
->aapcs_stack_size
= ROUND_UP (pcum
->aapcs_stack_size
,
2523 16 / UNITS_PER_WORD
);
2527 /* Implement TARGET_FUNCTION_ARG. */
2530 aarch64_function_arg (cumulative_args_t pcum_v
, machine_mode mode
,
2531 const_tree type
, bool named
)
2533 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
2534 gcc_assert (pcum
->pcs_variant
== ARM_PCS_AAPCS64
);
2536 if (mode
== VOIDmode
)
2539 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
2540 return pcum
->aapcs_reg
;
2544 aarch64_init_cumulative_args (CUMULATIVE_ARGS
*pcum
,
2545 const_tree fntype ATTRIBUTE_UNUSED
,
2546 rtx libname ATTRIBUTE_UNUSED
,
2547 const_tree fndecl ATTRIBUTE_UNUSED
,
2548 unsigned n_named ATTRIBUTE_UNUSED
)
2550 pcum
->aapcs_ncrn
= 0;
2551 pcum
->aapcs_nvrn
= 0;
2552 pcum
->aapcs_nextncrn
= 0;
2553 pcum
->aapcs_nextnvrn
= 0;
2554 pcum
->pcs_variant
= ARM_PCS_AAPCS64
;
2555 pcum
->aapcs_reg
= NULL_RTX
;
2556 pcum
->aapcs_arg_processed
= false;
2557 pcum
->aapcs_stack_words
= 0;
2558 pcum
->aapcs_stack_size
= 0;
2561 && fndecl
&& TREE_PUBLIC (fndecl
)
2562 && fntype
&& fntype
!= error_mark_node
)
2564 const_tree type
= TREE_TYPE (fntype
);
2565 machine_mode mode ATTRIBUTE_UNUSED
; /* To pass pointer as argument. */
2566 int nregs ATTRIBUTE_UNUSED
; /* Likewise. */
2567 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
), type
,
2568 &mode
, &nregs
, NULL
))
2569 aarch64_err_no_fpadvsimd (TYPE_MODE (type
), "return type");
2575 aarch64_function_arg_advance (cumulative_args_t pcum_v
,
2580 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
2581 if (pcum
->pcs_variant
== ARM_PCS_AAPCS64
)
2583 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
2584 gcc_assert ((pcum
->aapcs_reg
!= NULL_RTX
)
2585 != (pcum
->aapcs_stack_words
!= 0));
2586 pcum
->aapcs_arg_processed
= false;
2587 pcum
->aapcs_ncrn
= pcum
->aapcs_nextncrn
;
2588 pcum
->aapcs_nvrn
= pcum
->aapcs_nextnvrn
;
2589 pcum
->aapcs_stack_size
+= pcum
->aapcs_stack_words
;
2590 pcum
->aapcs_stack_words
= 0;
2591 pcum
->aapcs_reg
= NULL_RTX
;
2596 aarch64_function_arg_regno_p (unsigned regno
)
2598 return ((GP_REGNUM_P (regno
) && regno
< R0_REGNUM
+ NUM_ARG_REGS
)
2599 || (FP_REGNUM_P (regno
) && regno
< V0_REGNUM
+ NUM_FP_ARG_REGS
));
2602 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2603 PARM_BOUNDARY bits of alignment, but will be given anything up
2604 to STACK_BOUNDARY bits if the type requires it. This makes sure
2605 that both before and after the layout of each argument, the Next
2606 Stacked Argument Address (NSAA) will have a minimum alignment of
2610 aarch64_function_arg_boundary (machine_mode mode
, const_tree type
)
2612 unsigned int alignment
= aarch64_function_arg_alignment (mode
, type
);
2613 return MIN (MAX (alignment
, PARM_BOUNDARY
), STACK_BOUNDARY
);
2616 /* Implement TARGET_FUNCTION_ARG_PADDING.
2618 Small aggregate types are placed in the lowest memory address.
2620 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2622 static pad_direction
2623 aarch64_function_arg_padding (machine_mode mode
, const_tree type
)
2625 /* On little-endian targets, the least significant byte of every stack
2626 argument is passed at the lowest byte address of the stack slot. */
2627 if (!BYTES_BIG_ENDIAN
)
2630 /* Otherwise, integral, floating-point and pointer types are padded downward:
2631 the least significant byte of a stack argument is passed at the highest
2632 byte address of the stack slot. */
2634 ? (INTEGRAL_TYPE_P (type
) || SCALAR_FLOAT_TYPE_P (type
)
2635 || POINTER_TYPE_P (type
))
2636 : (SCALAR_INT_MODE_P (mode
) || SCALAR_FLOAT_MODE_P (mode
)))
2637 return PAD_DOWNWARD
;
2639 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2643 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2645 It specifies padding for the last (may also be the only)
2646 element of a block move between registers and memory. If
2647 assuming the block is in the memory, padding upward means that
2648 the last element is padded after its highest significant byte,
2649 while in downward padding, the last element is padded at
2650 its least significant byte side.
2652 Small aggregates and small complex types are always padded
2655 We don't need to worry about homogeneous floating-point or
2656 short-vector aggregates; their move is not affected by the
2657 padding direction determined here. Regardless of endianness,
2658 each element of such an aggregate is put in the least
2659 significant bits of a fp/simd register.
2661 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2662 register has useful data, and return the opposite if the most
2663 significant byte does. */
2666 aarch64_pad_reg_upward (machine_mode mode
, const_tree type
,
2667 bool first ATTRIBUTE_UNUSED
)
2670 /* Small composite types are always padded upward. */
2671 if (BYTES_BIG_ENDIAN
&& aarch64_composite_type_p (type
, mode
))
2673 HOST_WIDE_INT size
= (type
? int_size_in_bytes (type
)
2674 : GET_MODE_SIZE (mode
));
2675 if (size
< 2 * UNITS_PER_WORD
)
2679 /* Otherwise, use the default padding. */
2680 return !BYTES_BIG_ENDIAN
;
2683 static scalar_int_mode
2684 aarch64_libgcc_cmp_return_mode (void)
2689 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2691 /* We use the 12-bit shifted immediate arithmetic instructions so values
2692 must be multiple of (1 << 12), i.e. 4096. */
2693 #define ARITH_FACTOR 4096
2695 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2696 #error Cannot use simple address calculation for stack probing
2699 /* The pair of scratch registers used for stack probing. */
2700 #define PROBE_STACK_FIRST_REG 9
2701 #define PROBE_STACK_SECOND_REG 10
2703 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2704 inclusive. These are offsets from the current stack pointer. */
2707 aarch64_emit_probe_stack_range (HOST_WIDE_INT first
, HOST_WIDE_INT size
)
2709 rtx reg1
= gen_rtx_REG (Pmode
, PROBE_STACK_FIRST_REG
);
2711 /* See the same assertion on PROBE_INTERVAL above. */
2712 gcc_assert ((first
% ARITH_FACTOR
) == 0);
2714 /* See if we have a constant small number of probes to generate. If so,
2715 that's the easy case. */
2716 if (size
<= PROBE_INTERVAL
)
2718 const HOST_WIDE_INT base
= ROUND_UP (size
, ARITH_FACTOR
);
2720 emit_set_insn (reg1
,
2721 plus_constant (Pmode
,
2722 stack_pointer_rtx
, -(first
+ base
)));
2723 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- size
));
2726 /* The run-time loop is made up of 8 insns in the generic case while the
2727 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
2728 else if (size
<= 4 * PROBE_INTERVAL
)
2730 HOST_WIDE_INT i
, rem
;
2732 emit_set_insn (reg1
,
2733 plus_constant (Pmode
,
2735 -(first
+ PROBE_INTERVAL
)));
2736 emit_stack_probe (reg1
);
2738 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2739 it exceeds SIZE. If only two probes are needed, this will not
2740 generate any code. Then probe at FIRST + SIZE. */
2741 for (i
= 2 * PROBE_INTERVAL
; i
< size
; i
+= PROBE_INTERVAL
)
2743 emit_set_insn (reg1
,
2744 plus_constant (Pmode
, reg1
, -PROBE_INTERVAL
));
2745 emit_stack_probe (reg1
);
2748 rem
= size
- (i
- PROBE_INTERVAL
);
2751 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
2753 emit_set_insn (reg1
, plus_constant (Pmode
, reg1
, -base
));
2754 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- rem
));
2757 emit_stack_probe (plus_constant (Pmode
, reg1
, -rem
));
2760 /* Otherwise, do the same as above, but in a loop. Note that we must be
2761 extra careful with variables wrapping around because we might be at
2762 the very top (or the very bottom) of the address space and we have
2763 to be able to handle this case properly; in particular, we use an
2764 equality test for the loop condition. */
2767 rtx reg2
= gen_rtx_REG (Pmode
, PROBE_STACK_SECOND_REG
);
2769 /* Step 1: round SIZE to the previous multiple of the interval. */
2771 HOST_WIDE_INT rounded_size
= size
& -PROBE_INTERVAL
;
2774 /* Step 2: compute initial and final value of the loop counter. */
2776 /* TEST_ADDR = SP + FIRST. */
2777 emit_set_insn (reg1
,
2778 plus_constant (Pmode
, stack_pointer_rtx
, -first
));
2780 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2781 HOST_WIDE_INT adjustment
= - (first
+ rounded_size
);
2782 if (! aarch64_uimm12_shift (adjustment
))
2784 aarch64_internal_mov_immediate (reg2
, GEN_INT (adjustment
),
2786 emit_set_insn (reg2
, gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, reg2
));
2790 emit_set_insn (reg2
,
2791 plus_constant (Pmode
, stack_pointer_rtx
, adjustment
));
2798 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2801 while (TEST_ADDR != LAST_ADDR)
2803 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2804 until it is equal to ROUNDED_SIZE. */
2806 emit_insn (gen_probe_stack_range (reg1
, reg1
, reg2
));
2809 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2810 that SIZE is equal to ROUNDED_SIZE. */
2812 if (size
!= rounded_size
)
2814 HOST_WIDE_INT rem
= size
- rounded_size
;
2818 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
2820 emit_set_insn (reg2
, plus_constant (Pmode
, reg2
, -base
));
2821 emit_stack_probe (plus_constant (Pmode
, reg2
, base
- rem
));
2824 emit_stack_probe (plus_constant (Pmode
, reg2
, -rem
));
2828 /* Make sure nothing is scheduled before we are done. */
2829 emit_insn (gen_blockage ());
2832 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2833 absolute addresses. */
2836 aarch64_output_probe_stack_range (rtx reg1
, rtx reg2
)
2838 static int labelno
= 0;
2842 ASM_GENERATE_INTERNAL_LABEL (loop_lab
, "LPSRL", labelno
++);
2845 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_lab
);
2847 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2849 xops
[1] = GEN_INT (PROBE_INTERVAL
);
2850 output_asm_insn ("sub\t%0, %0, %1", xops
);
2852 /* Probe at TEST_ADDR. */
2853 output_asm_insn ("str\txzr, [%0]", xops
);
2855 /* Test if TEST_ADDR == LAST_ADDR. */
2857 output_asm_insn ("cmp\t%0, %1", xops
);
2860 fputs ("\tb.ne\t", asm_out_file
);
2861 assemble_name_raw (asm_out_file
, loop_lab
);
2862 fputc ('\n', asm_out_file
);
2868 aarch64_frame_pointer_required (void)
2870 /* Use the frame pointer if enabled and it is not a leaf function, unless
2871 leaf frame pointer omission is disabled. If the frame pointer is enabled,
2872 force the frame pointer in leaf functions which use LR. */
2873 if (flag_omit_frame_pointer
== 2
2874 && !(flag_omit_leaf_frame_pointer
2876 && !df_regs_ever_live_p (LR_REGNUM
)))
2882 /* Mark the registers that need to be saved by the callee and calculate
2883 the size of the callee-saved registers area and frame record (both FP
2884 and LR may be omitted). If the function is not a leaf, ensure LR is
2885 saved at the bottom of the callee-save area. */
2887 aarch64_layout_frame (void)
2889 HOST_WIDE_INT offset
= 0;
2890 int regno
, last_fp_reg
= INVALID_REGNUM
;
2892 if (reload_completed
&& cfun
->machine
->frame
.laid_out
)
2895 /* Force a frame chain for EH returns so the return address is at FP+8. */
2896 cfun
->machine
->frame
.emit_frame_chain
2897 = frame_pointer_needed
|| crtl
->calls_eh_return
;
2899 #define SLOT_NOT_REQUIRED (-2)
2900 #define SLOT_REQUIRED (-1)
2902 cfun
->machine
->frame
.wb_candidate1
= INVALID_REGNUM
;
2903 cfun
->machine
->frame
.wb_candidate2
= INVALID_REGNUM
;
2905 /* First mark all the registers that really need to be saved... */
2906 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
2907 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
2909 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
2910 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
2912 /* ... that includes the eh data registers (if needed)... */
2913 if (crtl
->calls_eh_return
)
2914 for (regno
= 0; EH_RETURN_DATA_REGNO (regno
) != INVALID_REGNUM
; regno
++)
2915 cfun
->machine
->frame
.reg_offset
[EH_RETURN_DATA_REGNO (regno
)]
2918 /* ... and any callee saved register that dataflow says is live. */
2919 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
2920 if (df_regs_ever_live_p (regno
)
2921 && (regno
== R30_REGNUM
2922 || !call_used_regs
[regno
]))
2923 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
2925 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
2926 if (df_regs_ever_live_p (regno
)
2927 && !call_used_regs
[regno
])
2929 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
2930 last_fp_reg
= regno
;
2933 if (cfun
->machine
->frame
.emit_frame_chain
)
2935 /* FP and LR are placed in the linkage record. */
2936 cfun
->machine
->frame
.reg_offset
[R29_REGNUM
] = 0;
2937 cfun
->machine
->frame
.wb_candidate1
= R29_REGNUM
;
2938 cfun
->machine
->frame
.reg_offset
[R30_REGNUM
] = UNITS_PER_WORD
;
2939 cfun
->machine
->frame
.wb_candidate2
= R30_REGNUM
;
2940 offset
= 2 * UNITS_PER_WORD
;
2942 else if (!crtl
->is_leaf
)
2944 /* Ensure LR is saved at the bottom of the callee-saves. */
2945 cfun
->machine
->frame
.reg_offset
[R30_REGNUM
] = 0;
2946 cfun
->machine
->frame
.wb_candidate1
= R30_REGNUM
;
2947 offset
= UNITS_PER_WORD
;
2950 /* Now assign stack slots for them. */
2951 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
2952 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
2954 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
2955 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
2956 cfun
->machine
->frame
.wb_candidate1
= regno
;
2957 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
)
2958 cfun
->machine
->frame
.wb_candidate2
= regno
;
2959 offset
+= UNITS_PER_WORD
;
2962 HOST_WIDE_INT max_int_offset
= offset
;
2963 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
2964 bool has_align_gap
= offset
!= max_int_offset
;
2966 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
2967 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
2969 /* If there is an alignment gap between integer and fp callee-saves,
2970 allocate the last fp register to it if possible. */
2971 if (regno
== last_fp_reg
&& has_align_gap
&& (offset
& 8) == 0)
2973 cfun
->machine
->frame
.reg_offset
[regno
] = max_int_offset
;
2977 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
2978 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
2979 cfun
->machine
->frame
.wb_candidate1
= regno
;
2980 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
2981 && cfun
->machine
->frame
.wb_candidate1
>= V0_REGNUM
)
2982 cfun
->machine
->frame
.wb_candidate2
= regno
;
2983 offset
+= UNITS_PER_WORD
;
2986 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
2988 cfun
->machine
->frame
.saved_regs_size
= offset
;
2990 HOST_WIDE_INT varargs_and_saved_regs_size
2991 = offset
+ cfun
->machine
->frame
.saved_varargs_size
;
2993 cfun
->machine
->frame
.hard_fp_offset
2994 = ROUND_UP (varargs_and_saved_regs_size
+ get_frame_size (),
2995 STACK_BOUNDARY
/ BITS_PER_UNIT
);
2997 cfun
->machine
->frame
.frame_size
2998 = ROUND_UP (cfun
->machine
->frame
.hard_fp_offset
2999 + crtl
->outgoing_args_size
,
3000 STACK_BOUNDARY
/ BITS_PER_UNIT
);
3002 cfun
->machine
->frame
.locals_offset
= cfun
->machine
->frame
.saved_varargs_size
;
3004 cfun
->machine
->frame
.initial_adjust
= 0;
3005 cfun
->machine
->frame
.final_adjust
= 0;
3006 cfun
->machine
->frame
.callee_adjust
= 0;
3007 cfun
->machine
->frame
.callee_offset
= 0;
3009 HOST_WIDE_INT max_push_offset
= 0;
3010 if (cfun
->machine
->frame
.wb_candidate2
!= INVALID_REGNUM
)
3011 max_push_offset
= 512;
3012 else if (cfun
->machine
->frame
.wb_candidate1
!= INVALID_REGNUM
)
3013 max_push_offset
= 256;
3015 if (cfun
->machine
->frame
.frame_size
< max_push_offset
3016 && crtl
->outgoing_args_size
== 0)
3018 /* Simple, small frame with no outgoing arguments:
3019 stp reg1, reg2, [sp, -frame_size]!
3020 stp reg3, reg4, [sp, 16] */
3021 cfun
->machine
->frame
.callee_adjust
= cfun
->machine
->frame
.frame_size
;
3023 else if ((crtl
->outgoing_args_size
3024 + cfun
->machine
->frame
.saved_regs_size
< 512)
3025 && !(cfun
->calls_alloca
3026 && cfun
->machine
->frame
.hard_fp_offset
< max_push_offset
))
3028 /* Frame with small outgoing arguments:
3029 sub sp, sp, frame_size
3030 stp reg1, reg2, [sp, outgoing_args_size]
3031 stp reg3, reg4, [sp, outgoing_args_size + 16] */
3032 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.frame_size
;
3033 cfun
->machine
->frame
.callee_offset
3034 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.hard_fp_offset
;
3036 else if (cfun
->machine
->frame
.hard_fp_offset
< max_push_offset
)
3038 /* Frame with large outgoing arguments but a small local area:
3039 stp reg1, reg2, [sp, -hard_fp_offset]!
3040 stp reg3, reg4, [sp, 16]
3041 sub sp, sp, outgoing_args_size */
3042 cfun
->machine
->frame
.callee_adjust
= cfun
->machine
->frame
.hard_fp_offset
;
3043 cfun
->machine
->frame
.final_adjust
3044 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.callee_adjust
;
3048 /* Frame with large local area and outgoing arguments using frame pointer:
3049 sub sp, sp, hard_fp_offset
3050 stp x29, x30, [sp, 0]
3052 stp reg3, reg4, [sp, 16]
3053 sub sp, sp, outgoing_args_size */
3054 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.hard_fp_offset
;
3055 cfun
->machine
->frame
.final_adjust
3056 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.initial_adjust
;
3059 cfun
->machine
->frame
.laid_out
= true;
3062 /* Return true if the register REGNO is saved on entry to
3063 the current function. */
3066 aarch64_register_saved_on_entry (int regno
)
3068 return cfun
->machine
->frame
.reg_offset
[regno
] >= 0;
3071 /* Return the next register up from REGNO up to LIMIT for the callee
3075 aarch64_next_callee_save (unsigned regno
, unsigned limit
)
3077 while (regno
<= limit
&& !aarch64_register_saved_on_entry (regno
))
3082 /* Push the register number REGNO of mode MODE to the stack with write-back
3083 adjusting the stack by ADJUSTMENT. */
3086 aarch64_pushwb_single_reg (machine_mode mode
, unsigned regno
,
3087 HOST_WIDE_INT adjustment
)
3089 rtx base_rtx
= stack_pointer_rtx
;
3092 reg
= gen_rtx_REG (mode
, regno
);
3093 mem
= gen_rtx_PRE_MODIFY (Pmode
, base_rtx
,
3094 plus_constant (Pmode
, base_rtx
, -adjustment
));
3095 mem
= gen_frame_mem (mode
, mem
);
3097 insn
= emit_move_insn (mem
, reg
);
3098 RTX_FRAME_RELATED_P (insn
) = 1;
3101 /* Generate and return an instruction to store the pair of registers
3102 REG and REG2 of mode MODE to location BASE with write-back adjusting
3103 the stack location BASE by ADJUSTMENT. */
3106 aarch64_gen_storewb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
3107 HOST_WIDE_INT adjustment
)
3112 return gen_storewb_pairdi_di (base
, base
, reg
, reg2
,
3113 GEN_INT (-adjustment
),
3114 GEN_INT (UNITS_PER_WORD
- adjustment
));
3116 return gen_storewb_pairdf_di (base
, base
, reg
, reg2
,
3117 GEN_INT (-adjustment
),
3118 GEN_INT (UNITS_PER_WORD
- adjustment
));
3124 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3125 stack pointer by ADJUSTMENT. */
3128 aarch64_push_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
)
3131 machine_mode mode
= (regno1
<= R30_REGNUM
) ? E_DImode
: E_DFmode
;
3133 if (regno2
== INVALID_REGNUM
)
3134 return aarch64_pushwb_single_reg (mode
, regno1
, adjustment
);
3136 rtx reg1
= gen_rtx_REG (mode
, regno1
);
3137 rtx reg2
= gen_rtx_REG (mode
, regno2
);
3139 insn
= emit_insn (aarch64_gen_storewb_pair (mode
, stack_pointer_rtx
, reg1
,
3141 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 2)) = 1;
3142 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
3143 RTX_FRAME_RELATED_P (insn
) = 1;
3146 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
3147 adjusting it by ADJUSTMENT afterwards. */
3150 aarch64_gen_loadwb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
3151 HOST_WIDE_INT adjustment
)
3156 return gen_loadwb_pairdi_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
3157 GEN_INT (UNITS_PER_WORD
));
3159 return gen_loadwb_pairdf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
3160 GEN_INT (UNITS_PER_WORD
));
3166 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3167 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3171 aarch64_pop_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
,
3174 machine_mode mode
= (regno1
<= R30_REGNUM
) ? E_DImode
: E_DFmode
;
3175 rtx reg1
= gen_rtx_REG (mode
, regno1
);
3177 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg1
, *cfi_ops
);
3179 if (regno2
== INVALID_REGNUM
)
3181 rtx mem
= plus_constant (Pmode
, stack_pointer_rtx
, adjustment
);
3182 mem
= gen_rtx_POST_MODIFY (Pmode
, stack_pointer_rtx
, mem
);
3183 emit_move_insn (reg1
, gen_frame_mem (mode
, mem
));
3187 rtx reg2
= gen_rtx_REG (mode
, regno2
);
3188 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
3189 emit_insn (aarch64_gen_loadwb_pair (mode
, stack_pointer_rtx
, reg1
,
3194 /* Generate and return a store pair instruction of mode MODE to store
3195 register REG1 to MEM1 and register REG2 to MEM2. */
3198 aarch64_gen_store_pair (machine_mode mode
, rtx mem1
, rtx reg1
, rtx mem2
,
3204 return gen_store_pairdi (mem1
, reg1
, mem2
, reg2
);
3207 return gen_store_pairdf (mem1
, reg1
, mem2
, reg2
);
3214 /* Generate and return a load pair instruction of mode MODE to load register
3215 REG1 from MEM1 and register REG2 from MEM2. */
3218 aarch64_gen_load_pair (machine_mode mode
, rtx reg1
, rtx mem1
, rtx reg2
,
3224 return gen_load_pairdi (reg1
, mem1
, reg2
, mem2
);
3227 return gen_load_pairdf (reg1
, mem1
, reg2
, mem2
);
3234 /* Return TRUE if return address signing should be enabled for the current
3235 function, otherwise return FALSE. */
3238 aarch64_return_address_signing_enabled (void)
3240 /* This function should only be called after frame laid out. */
3241 gcc_assert (cfun
->machine
->frame
.laid_out
);
3243 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
3244 if it's LR is pushed onto stack. */
3245 return (aarch64_ra_sign_scope
== AARCH64_FUNCTION_ALL
3246 || (aarch64_ra_sign_scope
== AARCH64_FUNCTION_NON_LEAF
3247 && cfun
->machine
->frame
.reg_offset
[LR_REGNUM
] >= 0));
3250 /* Emit code to save the callee-saved registers from register number START
3251 to LIMIT to the stack at the location starting at offset START_OFFSET,
3252 skipping any write-back candidates if SKIP_WB is true. */
3255 aarch64_save_callee_saves (machine_mode mode
, HOST_WIDE_INT start_offset
,
3256 unsigned start
, unsigned limit
, bool skip_wb
)
3262 for (regno
= aarch64_next_callee_save (start
, limit
);
3264 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
3267 HOST_WIDE_INT offset
;
3270 && (regno
== cfun
->machine
->frame
.wb_candidate1
3271 || regno
== cfun
->machine
->frame
.wb_candidate2
))
3274 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
3277 reg
= gen_rtx_REG (mode
, regno
);
3278 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
3279 mem
= gen_frame_mem (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
3282 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
3285 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
3286 && ((cfun
->machine
->frame
.reg_offset
[regno
] + UNITS_PER_WORD
)
3287 == cfun
->machine
->frame
.reg_offset
[regno2
]))
3290 rtx reg2
= gen_rtx_REG (mode
, regno2
);
3293 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
3294 mem2
= gen_frame_mem (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
3296 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
,
3299 /* The first part of a frame-related parallel insn is
3300 always assumed to be relevant to the frame
3301 calculations; subsequent parts, are only
3302 frame-related if explicitly marked. */
3303 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
3307 insn
= emit_move_insn (mem
, reg
);
3309 RTX_FRAME_RELATED_P (insn
) = 1;
3313 /* Emit code to restore the callee registers of mode MODE from register
3314 number START up to and including LIMIT. Restore from the stack offset
3315 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3316 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3319 aarch64_restore_callee_saves (machine_mode mode
,
3320 HOST_WIDE_INT start_offset
, unsigned start
,
3321 unsigned limit
, bool skip_wb
, rtx
*cfi_ops
)
3323 rtx base_rtx
= stack_pointer_rtx
;
3326 HOST_WIDE_INT offset
;
3328 for (regno
= aarch64_next_callee_save (start
, limit
);
3330 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
3332 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
3338 && (regno
== cfun
->machine
->frame
.wb_candidate1
3339 || regno
== cfun
->machine
->frame
.wb_candidate2
))
3342 reg
= gen_rtx_REG (mode
, regno
);
3343 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
3344 mem
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
3346 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
3349 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
3350 && ((cfun
->machine
->frame
.reg_offset
[regno
] + UNITS_PER_WORD
)
3351 == cfun
->machine
->frame
.reg_offset
[regno2
]))
3353 rtx reg2
= gen_rtx_REG (mode
, regno2
);
3356 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
3357 mem2
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
3358 emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
3360 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
3364 emit_move_insn (reg
, mem
);
3365 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg
, *cfi_ops
);
3370 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED
,
3371 HOST_WIDE_INT offset
)
3373 return offset
>= -256 && offset
< 256;
3377 offset_12bit_unsigned_scaled_p (machine_mode mode
, HOST_WIDE_INT offset
)
3380 && offset
< 4096 * GET_MODE_SIZE (mode
)
3381 && offset
% GET_MODE_SIZE (mode
) == 0);
3385 aarch64_offset_7bit_signed_scaled_p (machine_mode mode
, HOST_WIDE_INT offset
)
3387 return (offset
>= -64 * GET_MODE_SIZE (mode
)
3388 && offset
< 64 * GET_MODE_SIZE (mode
)
3389 && offset
% GET_MODE_SIZE (mode
) == 0);
3392 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
3395 aarch64_get_separate_components (void)
3397 aarch64_layout_frame ();
3399 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
3400 bitmap_clear (components
);
3402 /* The registers we need saved to the frame. */
3403 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
3404 if (aarch64_register_saved_on_entry (regno
))
3406 HOST_WIDE_INT offset
= cfun
->machine
->frame
.reg_offset
[regno
];
3407 if (!frame_pointer_needed
)
3408 offset
+= cfun
->machine
->frame
.frame_size
3409 - cfun
->machine
->frame
.hard_fp_offset
;
3410 /* Check that we can access the stack slot of the register with one
3411 direct load with no adjustments needed. */
3412 if (offset_12bit_unsigned_scaled_p (DImode
, offset
))
3413 bitmap_set_bit (components
, regno
);
3416 /* Don't mess with the hard frame pointer. */
3417 if (frame_pointer_needed
)
3418 bitmap_clear_bit (components
, HARD_FRAME_POINTER_REGNUM
);
3420 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
3421 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
3422 /* If aarch64_layout_frame has chosen registers to store/restore with
3423 writeback don't interfere with them to avoid having to output explicit
3424 stack adjustment instructions. */
3425 if (reg2
!= INVALID_REGNUM
)
3426 bitmap_clear_bit (components
, reg2
);
3427 if (reg1
!= INVALID_REGNUM
)
3428 bitmap_clear_bit (components
, reg1
);
3430 bitmap_clear_bit (components
, LR_REGNUM
);
3431 bitmap_clear_bit (components
, SP_REGNUM
);
3436 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
3439 aarch64_components_for_bb (basic_block bb
)
3441 bitmap in
= DF_LIVE_IN (bb
);
3442 bitmap gen
= &DF_LIVE_BB_INFO (bb
)->gen
;
3443 bitmap kill
= &DF_LIVE_BB_INFO (bb
)->kill
;
3445 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
3446 bitmap_clear (components
);
3448 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
3449 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
3450 if ((!call_used_regs
[regno
])
3451 && (bitmap_bit_p (in
, regno
)
3452 || bitmap_bit_p (gen
, regno
)
3453 || bitmap_bit_p (kill
, regno
)))
3454 bitmap_set_bit (components
, regno
);
3459 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3460 Nothing to do for aarch64. */
3463 aarch64_disqualify_components (sbitmap
, edge
, sbitmap
, bool)
3467 /* Return the next set bit in BMP from START onwards. Return the total number
3468 of bits in BMP if no set bit is found at or after START. */
3471 aarch64_get_next_set_bit (sbitmap bmp
, unsigned int start
)
3473 unsigned int nbits
= SBITMAP_SIZE (bmp
);
3477 gcc_assert (start
< nbits
);
3478 for (unsigned int i
= start
; i
< nbits
; i
++)
3479 if (bitmap_bit_p (bmp
, i
))
3485 /* Do the work for aarch64_emit_prologue_components and
3486 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
3487 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3488 for these components or the epilogue sequence. That is, it determines
3489 whether we should emit stores or loads and what kind of CFA notes to attach
3490 to the insns. Otherwise the logic for the two sequences is very
3494 aarch64_process_components (sbitmap components
, bool prologue_p
)
3496 rtx ptr_reg
= gen_rtx_REG (Pmode
, frame_pointer_needed
3497 ? HARD_FRAME_POINTER_REGNUM
3498 : STACK_POINTER_REGNUM
);
3500 unsigned last_regno
= SBITMAP_SIZE (components
);
3501 unsigned regno
= aarch64_get_next_set_bit (components
, R0_REGNUM
);
3502 rtx_insn
*insn
= NULL
;
3504 while (regno
!= last_regno
)
3506 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
3507 so DFmode for the vector registers is enough. */
3508 machine_mode mode
= GP_REGNUM_P (regno
) ? E_DImode
: E_DFmode
;
3509 rtx reg
= gen_rtx_REG (mode
, regno
);
3510 HOST_WIDE_INT offset
= cfun
->machine
->frame
.reg_offset
[regno
];
3511 if (!frame_pointer_needed
)
3512 offset
+= cfun
->machine
->frame
.frame_size
3513 - cfun
->machine
->frame
.hard_fp_offset
;
3514 rtx addr
= plus_constant (Pmode
, ptr_reg
, offset
);
3515 rtx mem
= gen_frame_mem (mode
, addr
);
3517 rtx set
= prologue_p
? gen_rtx_SET (mem
, reg
) : gen_rtx_SET (reg
, mem
);
3518 unsigned regno2
= aarch64_get_next_set_bit (components
, regno
+ 1);
3519 /* No more registers to handle after REGNO.
3520 Emit a single save/restore and exit. */
3521 if (regno2
== last_regno
)
3523 insn
= emit_insn (set
);
3524 RTX_FRAME_RELATED_P (insn
) = 1;
3526 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
3528 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
3532 HOST_WIDE_INT offset2
= cfun
->machine
->frame
.reg_offset
[regno2
];
3533 /* The next register is not of the same class or its offset is not
3534 mergeable with the current one into a pair. */
3535 if (!satisfies_constraint_Ump (mem
)
3536 || GP_REGNUM_P (regno
) != GP_REGNUM_P (regno2
)
3537 || (offset2
- cfun
->machine
->frame
.reg_offset
[regno
])
3538 != GET_MODE_SIZE (mode
))
3540 insn
= emit_insn (set
);
3541 RTX_FRAME_RELATED_P (insn
) = 1;
3543 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
3545 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
3551 /* REGNO2 can be saved/restored in a pair with REGNO. */
3552 rtx reg2
= gen_rtx_REG (mode
, regno2
);
3553 if (!frame_pointer_needed
)
3554 offset2
+= cfun
->machine
->frame
.frame_size
3555 - cfun
->machine
->frame
.hard_fp_offset
;
3556 rtx addr2
= plus_constant (Pmode
, ptr_reg
, offset2
);
3557 rtx mem2
= gen_frame_mem (mode
, addr2
);
3558 rtx set2
= prologue_p
? gen_rtx_SET (mem2
, reg2
)
3559 : gen_rtx_SET (reg2
, mem2
);
3562 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
, reg2
));
3564 insn
= emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
3566 RTX_FRAME_RELATED_P (insn
) = 1;
3569 add_reg_note (insn
, REG_CFA_OFFSET
, set
);
3570 add_reg_note (insn
, REG_CFA_OFFSET
, set2
);
3574 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
3575 add_reg_note (insn
, REG_CFA_RESTORE
, reg2
);
3578 regno
= aarch64_get_next_set_bit (components
, regno2
+ 1);
3582 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
3585 aarch64_emit_prologue_components (sbitmap components
)
3587 aarch64_process_components (components
, true);
3590 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
3593 aarch64_emit_epilogue_components (sbitmap components
)
3595 aarch64_process_components (components
, false);
3598 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
3601 aarch64_set_handled_components (sbitmap components
)
3603 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
3604 if (bitmap_bit_p (components
, regno
))
3605 cfun
->machine
->reg_is_wrapped_separately
[regno
] = true;
3608 /* AArch64 stack frames generated by this compiler look like:
3610 +-------------------------------+
3612 | incoming stack arguments |
3614 +-------------------------------+
3615 | | <-- incoming stack pointer (aligned)
3616 | callee-allocated save area |
3617 | for register varargs |
3619 +-------------------------------+
3620 | local variables | <-- frame_pointer_rtx
3622 +-------------------------------+
3624 +-------------------------------+ |
3625 | callee-saved registers | | frame.saved_regs_size
3626 +-------------------------------+ |
3628 +-------------------------------+ |
3629 | FP' | / <- hard_frame_pointer_rtx (aligned)
3630 +-------------------------------+
3631 | dynamic allocation |
3632 +-------------------------------+
3634 +-------------------------------+
3635 | outgoing stack arguments | <-- arg_pointer
3637 +-------------------------------+
3638 | | <-- stack_pointer_rtx (aligned)
3640 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3641 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3644 /* Generate the prologue instructions for entry into a function.
3645 Establish the stack frame by decreasing the stack pointer with a
3646 properly calculated size and, if necessary, create a frame record
3647 filled with the values of LR and previous frame pointer. The
3648 current FP is also set up if it is in use. */
3651 aarch64_expand_prologue (void)
3653 aarch64_layout_frame ();
3655 HOST_WIDE_INT frame_size
= cfun
->machine
->frame
.frame_size
;
3656 HOST_WIDE_INT initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
3657 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
3658 HOST_WIDE_INT final_adjust
= cfun
->machine
->frame
.final_adjust
;
3659 HOST_WIDE_INT callee_offset
= cfun
->machine
->frame
.callee_offset
;
3660 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
3661 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
3662 bool emit_frame_chain
= cfun
->machine
->frame
.emit_frame_chain
;
3665 /* Sign return address for functions. */
3666 if (aarch64_return_address_signing_enabled ())
3668 insn
= emit_insn (gen_pacisp ());
3669 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
3670 RTX_FRAME_RELATED_P (insn
) = 1;
3673 if (flag_stack_usage_info
)
3674 current_function_static_stack_size
= frame_size
;
3676 if (flag_stack_check
== STATIC_BUILTIN_STACK_CHECK
)
3678 if (crtl
->is_leaf
&& !cfun
->calls_alloca
)
3680 if (frame_size
> PROBE_INTERVAL
3681 && frame_size
> get_stack_check_protect ())
3682 aarch64_emit_probe_stack_range (get_stack_check_protect (),
3684 - get_stack_check_protect ()));
3686 else if (frame_size
> 0)
3687 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size
);
3690 aarch64_sub_sp (IP0_REGNUM
, initial_adjust
, true);
3692 if (callee_adjust
!= 0)
3693 aarch64_push_regs (reg1
, reg2
, callee_adjust
);
3695 if (emit_frame_chain
)
3697 if (callee_adjust
== 0)
3698 aarch64_save_callee_saves (DImode
, callee_offset
, R29_REGNUM
,
3700 insn
= emit_insn (gen_add3_insn (hard_frame_pointer_rtx
,
3702 GEN_INT (callee_offset
)));
3703 RTX_FRAME_RELATED_P (insn
) = frame_pointer_needed
;
3704 emit_insn (gen_stack_tie (stack_pointer_rtx
, hard_frame_pointer_rtx
));
3707 aarch64_save_callee_saves (DImode
, callee_offset
, R0_REGNUM
, R30_REGNUM
,
3708 callee_adjust
!= 0 || emit_frame_chain
);
3709 aarch64_save_callee_saves (DFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
3710 callee_adjust
!= 0 || emit_frame_chain
);
3711 aarch64_sub_sp (IP1_REGNUM
, final_adjust
, !frame_pointer_needed
);
3714 /* Return TRUE if we can use a simple_return insn.
3716 This function checks whether the callee saved stack is empty, which
3717 means no restore actions are need. The pro_and_epilogue will use
3718 this to check whether shrink-wrapping opt is feasible. */
3721 aarch64_use_return_insn_p (void)
3723 if (!reload_completed
)
3729 aarch64_layout_frame ();
3731 return cfun
->machine
->frame
.frame_size
== 0;
3734 /* Generate the epilogue instructions for returning from a function.
3735 This is almost exactly the reverse of the prolog sequence, except
3736 that we need to insert barriers to avoid scheduling loads that read
3737 from a deallocated stack, and we optimize the unwind records by
3738 emitting them all together if possible. */
3740 aarch64_expand_epilogue (bool for_sibcall
)
3742 aarch64_layout_frame ();
3744 HOST_WIDE_INT initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
3745 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
3746 HOST_WIDE_INT final_adjust
= cfun
->machine
->frame
.final_adjust
;
3747 HOST_WIDE_INT callee_offset
= cfun
->machine
->frame
.callee_offset
;
3748 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
3749 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
3753 /* We need to add memory barrier to prevent read from deallocated stack. */
3754 bool need_barrier_p
= (get_frame_size ()
3755 + cfun
->machine
->frame
.saved_varargs_size
) != 0;
3757 /* Emit a barrier to prevent loads from a deallocated stack. */
3758 if (final_adjust
> crtl
->outgoing_args_size
|| cfun
->calls_alloca
3759 || crtl
->calls_eh_return
)
3761 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
3762 need_barrier_p
= false;
3765 /* Restore the stack pointer from the frame pointer if it may not
3766 be the same as the stack pointer. */
3767 if (frame_pointer_needed
&& (final_adjust
|| cfun
->calls_alloca
))
3769 insn
= emit_insn (gen_add3_insn (stack_pointer_rtx
,
3770 hard_frame_pointer_rtx
,
3771 GEN_INT (-callee_offset
)));
3772 /* If writeback is used when restoring callee-saves, the CFA
3773 is restored on the instruction doing the writeback. */
3774 RTX_FRAME_RELATED_P (insn
) = callee_adjust
== 0;
3777 aarch64_add_sp (IP1_REGNUM
, final_adjust
, df_regs_ever_live_p (IP1_REGNUM
));
3779 aarch64_restore_callee_saves (DImode
, callee_offset
, R0_REGNUM
, R30_REGNUM
,
3780 callee_adjust
!= 0, &cfi_ops
);
3781 aarch64_restore_callee_saves (DFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
3782 callee_adjust
!= 0, &cfi_ops
);
3785 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
3787 if (callee_adjust
!= 0)
3788 aarch64_pop_regs (reg1
, reg2
, callee_adjust
, &cfi_ops
);
3790 if (callee_adjust
!= 0 || initial_adjust
> 65536)
3792 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3793 insn
= get_last_insn ();
3794 rtx new_cfa
= plus_constant (Pmode
, stack_pointer_rtx
, initial_adjust
);
3795 REG_NOTES (insn
) = alloc_reg_note (REG_CFA_DEF_CFA
, new_cfa
, cfi_ops
);
3796 RTX_FRAME_RELATED_P (insn
) = 1;
3800 aarch64_add_sp (IP0_REGNUM
, initial_adjust
, df_regs_ever_live_p (IP0_REGNUM
));
3804 /* Emit delayed restores and reset the CFA to be SP. */
3805 insn
= get_last_insn ();
3806 cfi_ops
= alloc_reg_note (REG_CFA_DEF_CFA
, stack_pointer_rtx
, cfi_ops
);
3807 REG_NOTES (insn
) = cfi_ops
;
3808 RTX_FRAME_RELATED_P (insn
) = 1;
3811 /* We prefer to emit the combined return/authenticate instruction RETAA,
3812 however there are three cases in which we must instead emit an explicit
3813 authentication instruction.
3815 1) Sibcalls don't return in a normal way, so if we're about to call one
3816 we must authenticate.
3818 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3819 generating code for !TARGET_ARMV8_3 we can't use it and must
3820 explicitly authenticate.
3822 3) On an eh_return path we make extra stack adjustments to update the
3823 canonical frame address to be the exception handler's CFA. We want
3824 to authenticate using the CFA of the function which calls eh_return.
3826 if (aarch64_return_address_signing_enabled ()
3827 && (for_sibcall
|| !TARGET_ARMV8_3
|| crtl
->calls_eh_return
))
3829 insn
= emit_insn (gen_autisp ());
3830 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
3831 RTX_FRAME_RELATED_P (insn
) = 1;
3834 /* Stack adjustment for exception handler. */
3835 if (crtl
->calls_eh_return
)
3837 /* We need to unwind the stack by the offset computed by
3838 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3839 to be SP; letting the CFA move during this adjustment
3840 is just as correct as retaining the CFA from the body
3841 of the function. Therefore, do nothing special. */
3842 emit_insn (gen_add2_insn (stack_pointer_rtx
, EH_RETURN_STACKADJ_RTX
));
3845 emit_use (gen_rtx_REG (DImode
, LR_REGNUM
));
3847 emit_jump_insn (ret_rtx
);
3850 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
3851 normally or return to a previous frame after unwinding.
3853 An EH return uses a single shared return sequence. The epilogue is
3854 exactly like a normal epilogue except that it has an extra input
3855 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3856 that must be applied after the frame has been destroyed. An extra label
3857 is inserted before the epilogue which initializes this register to zero,
3858 and this is the entry point for a normal return.
3860 An actual EH return updates the return address, initializes the stack
3861 adjustment and jumps directly into the epilogue (bypassing the zeroing
3862 of the adjustment). Since the return address is typically saved on the
3863 stack when a function makes a call, the saved LR must be updated outside
3866 This poses problems as the store is generated well before the epilogue,
3867 so the offset of LR is not known yet. Also optimizations will remove the
3868 store as it appears dead, even after the epilogue is generated (as the
3869 base or offset for loading LR is different in many cases).
3871 To avoid these problems this implementation forces the frame pointer
3872 in eh_return functions so that the location of LR is fixed and known early.
3873 It also marks the store volatile, so no optimization is permitted to
3874 remove the store. */
3876 aarch64_eh_return_handler_rtx (void)
3878 rtx tmp
= gen_frame_mem (Pmode
,
3879 plus_constant (Pmode
, hard_frame_pointer_rtx
, UNITS_PER_WORD
));
3881 /* Mark the store volatile, so no optimization is permitted to remove it. */
3882 MEM_VOLATILE_P (tmp
) = true;
3886 /* Output code to add DELTA to the first argument, and then jump
3887 to FUNCTION. Used for C++ multiple inheritance. */
3889 aarch64_output_mi_thunk (FILE *file
, tree thunk ATTRIBUTE_UNUSED
,
3890 HOST_WIDE_INT delta
,
3891 HOST_WIDE_INT vcall_offset
,
3894 /* The this pointer is always in x0. Note that this differs from
3895 Arm where the this pointer maybe bumped to r1 if r0 is required
3896 to return a pointer to an aggregate. On AArch64 a result value
3897 pointer will be in x8. */
3898 int this_regno
= R0_REGNUM
;
3899 rtx this_rtx
, temp0
, temp1
, addr
, funexp
;
3902 reload_completed
= 1;
3903 emit_note (NOTE_INSN_PROLOGUE_END
);
3905 if (vcall_offset
== 0)
3906 aarch64_add_constant (Pmode
, this_regno
, IP1_REGNUM
, delta
);
3909 gcc_assert ((vcall_offset
& (POINTER_BYTES
- 1)) == 0);
3911 this_rtx
= gen_rtx_REG (Pmode
, this_regno
);
3912 temp0
= gen_rtx_REG (Pmode
, IP0_REGNUM
);
3913 temp1
= gen_rtx_REG (Pmode
, IP1_REGNUM
);
3918 if (delta
>= -256 && delta
< 256)
3919 addr
= gen_rtx_PRE_MODIFY (Pmode
, this_rtx
,
3920 plus_constant (Pmode
, this_rtx
, delta
));
3922 aarch64_add_constant (Pmode
, this_regno
, IP1_REGNUM
, delta
);
3925 if (Pmode
== ptr_mode
)
3926 aarch64_emit_move (temp0
, gen_rtx_MEM (ptr_mode
, addr
));
3928 aarch64_emit_move (temp0
,
3929 gen_rtx_ZERO_EXTEND (Pmode
,
3930 gen_rtx_MEM (ptr_mode
, addr
)));
3932 if (vcall_offset
>= -256 && vcall_offset
< 4096 * POINTER_BYTES
)
3933 addr
= plus_constant (Pmode
, temp0
, vcall_offset
);
3936 aarch64_internal_mov_immediate (temp1
, GEN_INT (vcall_offset
), true,
3938 addr
= gen_rtx_PLUS (Pmode
, temp0
, temp1
);
3941 if (Pmode
== ptr_mode
)
3942 aarch64_emit_move (temp1
, gen_rtx_MEM (ptr_mode
,addr
));
3944 aarch64_emit_move (temp1
,
3945 gen_rtx_SIGN_EXTEND (Pmode
,
3946 gen_rtx_MEM (ptr_mode
, addr
)));
3948 emit_insn (gen_add2_insn (this_rtx
, temp1
));
3951 /* Generate a tail call to the target function. */
3952 if (!TREE_USED (function
))
3954 assemble_external (function
);
3955 TREE_USED (function
) = 1;
3957 funexp
= XEXP (DECL_RTL (function
), 0);
3958 funexp
= gen_rtx_MEM (FUNCTION_MODE
, funexp
);
3959 insn
= emit_call_insn (gen_sibcall (funexp
, const0_rtx
, NULL_RTX
));
3960 SIBLING_CALL_P (insn
) = 1;
3962 insn
= get_insns ();
3963 shorten_branches (insn
);
3964 final_start_function (insn
, file
, 1);
3965 final (insn
, file
, 1);
3966 final_end_function ();
3968 /* Stop pretending to be a post-reload pass. */
3969 reload_completed
= 0;
3973 aarch64_tls_referenced_p (rtx x
)
3975 if (!TARGET_HAVE_TLS
)
3977 subrtx_iterator::array_type array
;
3978 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
3980 const_rtx x
= *iter
;
3981 if (GET_CODE (x
) == SYMBOL_REF
&& SYMBOL_REF_TLS_MODEL (x
) != 0)
3983 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3984 TLS offsets, not real symbol references. */
3985 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
3986 iter
.skip_subrtxes ();
3992 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3993 a left shift of 0 or 12 bits. */
3995 aarch64_uimm12_shift (HOST_WIDE_INT val
)
3997 return ((val
& (((HOST_WIDE_INT
) 0xfff) << 0)) == val
3998 || (val
& (((HOST_WIDE_INT
) 0xfff) << 12)) == val
4003 /* Return true if val is an immediate that can be loaded into a
4004 register by a MOVZ instruction. */
4006 aarch64_movw_imm (HOST_WIDE_INT val
, scalar_int_mode mode
)
4008 if (GET_MODE_SIZE (mode
) > 4)
4010 if ((val
& (((HOST_WIDE_INT
) 0xffff) << 32)) == val
4011 || (val
& (((HOST_WIDE_INT
) 0xffff) << 48)) == val
)
4016 /* Ignore sign extension. */
4017 val
&= (HOST_WIDE_INT
) 0xffffffff;
4019 return ((val
& (((HOST_WIDE_INT
) 0xffff) << 0)) == val
4020 || (val
& (((HOST_WIDE_INT
) 0xffff) << 16)) == val
);
4023 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
4025 static const unsigned HOST_WIDE_INT bitmask_imm_mul
[] =
4027 0x0000000100000001ull
,
4028 0x0001000100010001ull
,
4029 0x0101010101010101ull
,
4030 0x1111111111111111ull
,
4031 0x5555555555555555ull
,
4035 /* Return true if val is a valid bitmask immediate. */
4038 aarch64_bitmask_imm (HOST_WIDE_INT val_in
, machine_mode mode
)
4040 unsigned HOST_WIDE_INT val
, tmp
, mask
, first_one
, next_one
;
4043 /* Check for a single sequence of one bits and return quickly if so.
4044 The special cases of all ones and all zeroes returns false. */
4045 val
= (unsigned HOST_WIDE_INT
) val_in
;
4046 tmp
= val
+ (val
& -val
);
4048 if (tmp
== (tmp
& -tmp
))
4049 return (val
+ 1) > 1;
4051 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
4053 val
= (val
<< 32) | (val
& 0xffffffff);
4055 /* Invert if the immediate doesn't start with a zero bit - this means we
4056 only need to search for sequences of one bits. */
4060 /* Find the first set bit and set tmp to val with the first sequence of one
4061 bits removed. Return success if there is a single sequence of ones. */
4062 first_one
= val
& -val
;
4063 tmp
= val
& (val
+ first_one
);
4068 /* Find the next set bit and compute the difference in bit position. */
4069 next_one
= tmp
& -tmp
;
4070 bits
= clz_hwi (first_one
) - clz_hwi (next_one
);
4073 /* Check the bit position difference is a power of 2, and that the first
4074 sequence of one bits fits within 'bits' bits. */
4075 if ((mask
>> bits
) != 0 || bits
!= (bits
& -bits
))
4078 /* Check the sequence of one bits is repeated 64/bits times. */
4079 return val
== mask
* bitmask_imm_mul
[__builtin_clz (bits
) - 26];
4082 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
4083 Assumed precondition: VAL_IN Is not zero. */
4085 unsigned HOST_WIDE_INT
4086 aarch64_and_split_imm1 (HOST_WIDE_INT val_in
)
4088 int lowest_bit_set
= ctz_hwi (val_in
);
4089 int highest_bit_set
= floor_log2 (val_in
);
4090 gcc_assert (val_in
!= 0);
4092 return ((HOST_WIDE_INT_UC (2) << highest_bit_set
) -
4093 (HOST_WIDE_INT_1U
<< lowest_bit_set
));
4096 /* Create constant where bits outside of lowest bit set to highest bit set
4099 unsigned HOST_WIDE_INT
4100 aarch64_and_split_imm2 (HOST_WIDE_INT val_in
)
4102 return val_in
| ~aarch64_and_split_imm1 (val_in
);
4105 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
4108 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in
, machine_mode mode
)
4110 scalar_int_mode int_mode
;
4111 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
4114 if (aarch64_bitmask_imm (val_in
, int_mode
))
4117 if (aarch64_move_imm (val_in
, int_mode
))
4120 unsigned HOST_WIDE_INT imm2
= aarch64_and_split_imm2 (val_in
);
4122 return aarch64_bitmask_imm (imm2
, int_mode
);
4125 /* Return true if val is an immediate that can be loaded into a
4126 register in a single instruction. */
4128 aarch64_move_imm (HOST_WIDE_INT val
, machine_mode mode
)
4130 scalar_int_mode int_mode
;
4131 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
4134 if (aarch64_movw_imm (val
, int_mode
) || aarch64_movw_imm (~val
, int_mode
))
4136 return aarch64_bitmask_imm (val
, int_mode
);
4140 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED
, rtx x
)
4144 if (GET_CODE (x
) == HIGH
)
4147 split_const (x
, &base
, &offset
);
4148 if (GET_CODE (base
) == SYMBOL_REF
|| GET_CODE (base
) == LABEL_REF
)
4150 if (aarch64_classify_symbol (base
, offset
)
4151 != SYMBOL_FORCE_TO_MEM
)
4154 /* Avoid generating a 64-bit relocation in ILP32; leave
4155 to aarch64_expand_mov_immediate to handle it properly. */
4156 return mode
!= ptr_mode
;
4159 return aarch64_tls_referenced_p (x
);
4162 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4163 The expansion for a table switch is quite expensive due to the number
4164 of instructions, the table lookup and hard to predict indirect jump.
4165 When optimizing for speed, and -O3 enabled, use the per-core tuning if
4166 set, otherwise use tables for > 16 cases as a tradeoff between size and
4167 performance. When optimizing for size, use the default setting. */
4170 aarch64_case_values_threshold (void)
4172 /* Use the specified limit for the number of cases before using jump
4173 tables at higher optimization levels. */
4175 && selected_cpu
->tune
->max_case_values
!= 0)
4176 return selected_cpu
->tune
->max_case_values
;
4178 return optimize_size
? default_case_values_threshold () : 17;
4181 /* Return true if register REGNO is a valid index register.
4182 STRICT_P is true if REG_OK_STRICT is in effect. */
4185 aarch64_regno_ok_for_index_p (int regno
, bool strict_p
)
4187 if (!HARD_REGISTER_NUM_P (regno
))
4195 regno
= reg_renumber
[regno
];
4197 return GP_REGNUM_P (regno
);
4200 /* Return true if register REGNO is a valid base register for mode MODE.
4201 STRICT_P is true if REG_OK_STRICT is in effect. */
4204 aarch64_regno_ok_for_base_p (int regno
, bool strict_p
)
4206 if (!HARD_REGISTER_NUM_P (regno
))
4214 regno
= reg_renumber
[regno
];
4217 /* The fake registers will be eliminated to either the stack or
4218 hard frame pointer, both of which are usually valid base registers.
4219 Reload deals with the cases where the eliminated form isn't valid. */
4220 return (GP_REGNUM_P (regno
)
4221 || regno
== SP_REGNUM
4222 || regno
== FRAME_POINTER_REGNUM
4223 || regno
== ARG_POINTER_REGNUM
);
4226 /* Return true if X is a valid base register for mode MODE.
4227 STRICT_P is true if REG_OK_STRICT is in effect. */
4230 aarch64_base_register_rtx_p (rtx x
, bool strict_p
)
4233 && GET_CODE (x
) == SUBREG
4234 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (x
))])
4237 return (REG_P (x
) && aarch64_regno_ok_for_base_p (REGNO (x
), strict_p
));
4240 /* Return true if address offset is a valid index. If it is, fill in INFO
4241 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
4244 aarch64_classify_index (struct aarch64_address_info
*info
, rtx x
,
4245 machine_mode mode
, bool strict_p
)
4247 enum aarch64_address_type type
;
4252 if ((REG_P (x
) || GET_CODE (x
) == SUBREG
)
4253 && GET_MODE (x
) == Pmode
)
4255 type
= ADDRESS_REG_REG
;
4259 /* (sign_extend:DI (reg:SI)) */
4260 else if ((GET_CODE (x
) == SIGN_EXTEND
4261 || GET_CODE (x
) == ZERO_EXTEND
)
4262 && GET_MODE (x
) == DImode
4263 && GET_MODE (XEXP (x
, 0)) == SImode
)
4265 type
= (GET_CODE (x
) == SIGN_EXTEND
)
4266 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
4267 index
= XEXP (x
, 0);
4270 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4271 else if (GET_CODE (x
) == MULT
4272 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
4273 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
4274 && GET_MODE (XEXP (x
, 0)) == DImode
4275 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
4276 && CONST_INT_P (XEXP (x
, 1)))
4278 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
4279 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
4280 index
= XEXP (XEXP (x
, 0), 0);
4281 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
4283 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4284 else if (GET_CODE (x
) == ASHIFT
4285 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
4286 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
4287 && GET_MODE (XEXP (x
, 0)) == DImode
4288 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
4289 && CONST_INT_P (XEXP (x
, 1)))
4291 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
4292 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
4293 index
= XEXP (XEXP (x
, 0), 0);
4294 shift
= INTVAL (XEXP (x
, 1));
4296 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4297 else if ((GET_CODE (x
) == SIGN_EXTRACT
4298 || GET_CODE (x
) == ZERO_EXTRACT
)
4299 && GET_MODE (x
) == DImode
4300 && GET_CODE (XEXP (x
, 0)) == MULT
4301 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
4302 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
4304 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
4305 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
4306 index
= XEXP (XEXP (x
, 0), 0);
4307 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
4308 if (INTVAL (XEXP (x
, 1)) != 32 + shift
4309 || INTVAL (XEXP (x
, 2)) != 0)
4312 /* (and:DI (mult:DI (reg:DI) (const_int scale))
4313 (const_int 0xffffffff<<shift)) */
4314 else if (GET_CODE (x
) == AND
4315 && GET_MODE (x
) == DImode
4316 && GET_CODE (XEXP (x
, 0)) == MULT
4317 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
4318 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
4319 && CONST_INT_P (XEXP (x
, 1)))
4321 type
= ADDRESS_REG_UXTW
;
4322 index
= XEXP (XEXP (x
, 0), 0);
4323 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
4324 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
4327 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4328 else if ((GET_CODE (x
) == SIGN_EXTRACT
4329 || GET_CODE (x
) == ZERO_EXTRACT
)
4330 && GET_MODE (x
) == DImode
4331 && GET_CODE (XEXP (x
, 0)) == ASHIFT
4332 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
4333 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
4335 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
4336 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
4337 index
= XEXP (XEXP (x
, 0), 0);
4338 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
4339 if (INTVAL (XEXP (x
, 1)) != 32 + shift
4340 || INTVAL (XEXP (x
, 2)) != 0)
4343 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4344 (const_int 0xffffffff<<shift)) */
4345 else if (GET_CODE (x
) == AND
4346 && GET_MODE (x
) == DImode
4347 && GET_CODE (XEXP (x
, 0)) == ASHIFT
4348 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
4349 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
4350 && CONST_INT_P (XEXP (x
, 1)))
4352 type
= ADDRESS_REG_UXTW
;
4353 index
= XEXP (XEXP (x
, 0), 0);
4354 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
4355 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
4358 /* (mult:P (reg:P) (const_int scale)) */
4359 else if (GET_CODE (x
) == MULT
4360 && GET_MODE (x
) == Pmode
4361 && GET_MODE (XEXP (x
, 0)) == Pmode
4362 && CONST_INT_P (XEXP (x
, 1)))
4364 type
= ADDRESS_REG_REG
;
4365 index
= XEXP (x
, 0);
4366 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
4368 /* (ashift:P (reg:P) (const_int shift)) */
4369 else if (GET_CODE (x
) == ASHIFT
4370 && GET_MODE (x
) == Pmode
4371 && GET_MODE (XEXP (x
, 0)) == Pmode
4372 && CONST_INT_P (XEXP (x
, 1)))
4374 type
= ADDRESS_REG_REG
;
4375 index
= XEXP (x
, 0);
4376 shift
= INTVAL (XEXP (x
, 1));
4382 && GET_CODE (index
) == SUBREG
4383 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (index
))])
4384 index
= SUBREG_REG (index
);
4387 (shift
> 0 && shift
<= 3
4388 && (1 << shift
) == GET_MODE_SIZE (mode
)))
4390 && aarch64_regno_ok_for_index_p (REGNO (index
), strict_p
))
4393 info
->offset
= index
;
4394 info
->shift
= shift
;
4401 /* Return true if MODE is one of the modes for which we
4402 support LDP/STP operations. */
4405 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode
)
4407 return mode
== SImode
|| mode
== DImode
4408 || mode
== SFmode
|| mode
== DFmode
4409 || (aarch64_vector_mode_supported_p (mode
)
4410 && GET_MODE_SIZE (mode
) == 8);
4413 /* Return true if REGNO is a virtual pointer register, or an eliminable
4414 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
4415 include stack_pointer or hard_frame_pointer. */
4417 virt_or_elim_regno_p (unsigned regno
)
4419 return ((regno
>= FIRST_VIRTUAL_REGISTER
4420 && regno
<= LAST_VIRTUAL_POINTER_REGISTER
)
4421 || regno
== FRAME_POINTER_REGNUM
4422 || regno
== ARG_POINTER_REGNUM
);
4425 /* Return true if X is a valid address for machine mode MODE. If it is,
4426 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
4427 effect. OUTER_CODE is PARALLEL for a load/store pair. */
4430 aarch64_classify_address (struct aarch64_address_info
*info
,
4431 rtx x
, machine_mode mode
,
4432 RTX_CODE outer_code
, bool strict_p
)
4434 enum rtx_code code
= GET_CODE (x
);
4437 /* On BE, we use load/store pair for all large int mode load/stores.
4438 TI/TFmode may also use a load/store pair. */
4439 bool load_store_pair_p
= (outer_code
== PARALLEL
4442 || (BYTES_BIG_ENDIAN
4443 && aarch64_vect_struct_mode_p (mode
)));
4445 bool allow_reg_index_p
=
4447 && (GET_MODE_SIZE (mode
) != 16 || aarch64_vector_mode_supported_p (mode
))
4448 && !aarch64_vect_struct_mode_p (mode
);
4450 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4452 if (aarch64_vect_struct_mode_p (mode
) && !BYTES_BIG_ENDIAN
4453 && (code
!= POST_INC
&& code
!= REG
))
4460 info
->type
= ADDRESS_REG_IMM
;
4462 info
->offset
= const0_rtx
;
4463 return aarch64_base_register_rtx_p (x
, strict_p
);
4471 && virt_or_elim_regno_p (REGNO (op0
))
4472 && CONST_INT_P (op1
))
4474 info
->type
= ADDRESS_REG_IMM
;
4481 if (GET_MODE_SIZE (mode
) != 0
4482 && CONST_INT_P (op1
)
4483 && aarch64_base_register_rtx_p (op0
, strict_p
))
4485 HOST_WIDE_INT offset
= INTVAL (op1
);
4487 info
->type
= ADDRESS_REG_IMM
;
4491 /* TImode and TFmode values are allowed in both pairs of X
4492 registers and individual Q registers. The available
4494 X,X: 7-bit signed scaled offset
4495 Q: 9-bit signed offset
4496 We conservatively require an offset representable in either mode.
4497 When performing the check for pairs of X registers i.e. LDP/STP
4498 pass down DImode since that is the natural size of the LDP/STP
4499 instruction memory accesses. */
4500 if (mode
== TImode
|| mode
== TFmode
)
4501 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
4502 && (offset_9bit_signed_unscaled_p (mode
, offset
)
4503 || offset_12bit_unsigned_scaled_p (mode
, offset
)));
4505 /* A 7bit offset check because OImode will emit a ldp/stp
4506 instruction (only big endian will get here).
4507 For ldp/stp instructions, the offset is scaled for the size of a
4508 single element of the pair. */
4510 return aarch64_offset_7bit_signed_scaled_p (TImode
, offset
);
4512 /* Three 9/12 bit offsets checks because CImode will emit three
4513 ldr/str instructions (only big endian will get here). */
4515 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
4516 && (offset_9bit_signed_unscaled_p (V16QImode
, offset
+ 32)
4517 || offset_12bit_unsigned_scaled_p (V16QImode
,
4520 /* Two 7bit offsets checks because XImode will emit two ldp/stp
4521 instructions (only big endian will get here). */
4523 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
4524 && aarch64_offset_7bit_signed_scaled_p (TImode
,
4527 if (load_store_pair_p
)
4528 return ((GET_MODE_SIZE (mode
) == 4 || GET_MODE_SIZE (mode
) == 8)
4529 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
4531 return (offset_9bit_signed_unscaled_p (mode
, offset
)
4532 || offset_12bit_unsigned_scaled_p (mode
, offset
));
4535 if (allow_reg_index_p
)
4537 /* Look for base + (scaled/extended) index register. */
4538 if (aarch64_base_register_rtx_p (op0
, strict_p
)
4539 && aarch64_classify_index (info
, op1
, mode
, strict_p
))
4544 if (aarch64_base_register_rtx_p (op1
, strict_p
)
4545 && aarch64_classify_index (info
, op0
, mode
, strict_p
))
4558 info
->type
= ADDRESS_REG_WB
;
4559 info
->base
= XEXP (x
, 0);
4560 info
->offset
= NULL_RTX
;
4561 return aarch64_base_register_rtx_p (info
->base
, strict_p
);
4565 info
->type
= ADDRESS_REG_WB
;
4566 info
->base
= XEXP (x
, 0);
4567 if (GET_CODE (XEXP (x
, 1)) == PLUS
4568 && CONST_INT_P (XEXP (XEXP (x
, 1), 1))
4569 && rtx_equal_p (XEXP (XEXP (x
, 1), 0), info
->base
)
4570 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
4572 HOST_WIDE_INT offset
;
4573 info
->offset
= XEXP (XEXP (x
, 1), 1);
4574 offset
= INTVAL (info
->offset
);
4576 /* TImode and TFmode values are allowed in both pairs of X
4577 registers and individual Q registers. The available
4579 X,X: 7-bit signed scaled offset
4580 Q: 9-bit signed offset
4581 We conservatively require an offset representable in either mode.
4583 if (mode
== TImode
|| mode
== TFmode
)
4584 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
4585 && offset_9bit_signed_unscaled_p (mode
, offset
));
4587 if (load_store_pair_p
)
4588 return ((GET_MODE_SIZE (mode
) == 4 || GET_MODE_SIZE (mode
) == 8)
4589 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
4591 return offset_9bit_signed_unscaled_p (mode
, offset
);
4598 /* load literal: pc-relative constant pool entry. Only supported
4599 for SI mode or larger. */
4600 info
->type
= ADDRESS_SYMBOLIC
;
4602 if (!load_store_pair_p
&& GET_MODE_SIZE (mode
) >= 4)
4606 split_const (x
, &sym
, &addend
);
4607 return ((GET_CODE (sym
) == LABEL_REF
4608 || (GET_CODE (sym
) == SYMBOL_REF
4609 && CONSTANT_POOL_ADDRESS_P (sym
)
4610 && aarch64_pcrelative_literal_loads
)));
4615 info
->type
= ADDRESS_LO_SUM
;
4616 info
->base
= XEXP (x
, 0);
4617 info
->offset
= XEXP (x
, 1);
4618 if (allow_reg_index_p
4619 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
4622 split_const (info
->offset
, &sym
, &offs
);
4623 if (GET_CODE (sym
) == SYMBOL_REF
4624 && (aarch64_classify_symbol (sym
, offs
) == SYMBOL_SMALL_ABSOLUTE
))
4626 /* The symbol and offset must be aligned to the access size. */
4628 unsigned int ref_size
;
4630 if (CONSTANT_POOL_ADDRESS_P (sym
))
4631 align
= GET_MODE_ALIGNMENT (get_pool_mode (sym
));
4632 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym
))
4634 tree exp
= SYMBOL_REF_DECL (sym
);
4635 align
= TYPE_ALIGN (TREE_TYPE (exp
));
4636 align
= aarch64_constant_alignment (exp
, align
);
4638 else if (SYMBOL_REF_DECL (sym
))
4639 align
= DECL_ALIGN (SYMBOL_REF_DECL (sym
));
4640 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym
)
4641 && SYMBOL_REF_BLOCK (sym
) != NULL
)
4642 align
= SYMBOL_REF_BLOCK (sym
)->alignment
;
4644 align
= BITS_PER_UNIT
;
4646 ref_size
= GET_MODE_SIZE (mode
);
4648 ref_size
= GET_MODE_SIZE (DImode
);
4650 return ((INTVAL (offs
) & (ref_size
- 1)) == 0
4651 && ((align
/ BITS_PER_UNIT
) & (ref_size
- 1)) == 0);
4661 /* Return true if the address X is valid for a PRFM instruction.
4662 STRICT_P is true if we should do strict checking with
4663 aarch64_classify_address. */
4666 aarch64_address_valid_for_prefetch_p (rtx x
, bool strict_p
)
4668 struct aarch64_address_info addr
;
4670 /* PRFM accepts the same addresses as DImode... */
4671 bool res
= aarch64_classify_address (&addr
, x
, DImode
, MEM
, strict_p
);
4675 /* ... except writeback forms. */
4676 return addr
.type
!= ADDRESS_REG_WB
;
4680 aarch64_symbolic_address_p (rtx x
)
4684 split_const (x
, &x
, &offset
);
4685 return GET_CODE (x
) == SYMBOL_REF
|| GET_CODE (x
) == LABEL_REF
;
4688 /* Classify the base of symbolic expression X. */
4690 enum aarch64_symbol_type
4691 aarch64_classify_symbolic_expression (rtx x
)
4695 split_const (x
, &x
, &offset
);
4696 return aarch64_classify_symbol (x
, offset
);
4700 /* Return TRUE if X is a legitimate address for accessing memory in
4703 aarch64_legitimate_address_hook_p (machine_mode mode
, rtx x
, bool strict_p
)
4705 struct aarch64_address_info addr
;
4707 return aarch64_classify_address (&addr
, x
, mode
, MEM
, strict_p
);
4710 /* Return TRUE if X is a legitimate address for accessing memory in
4711 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4714 aarch64_legitimate_address_p (machine_mode mode
, rtx x
,
4715 RTX_CODE outer_code
, bool strict_p
)
4717 struct aarch64_address_info addr
;
4719 return aarch64_classify_address (&addr
, x
, mode
, outer_code
, strict_p
);
4722 /* Split an out-of-range address displacement into a base and offset.
4723 Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4724 to increase opportunities for sharing the base address of different sizes.
4725 Unaligned accesses use the signed 9-bit range, TImode/TFmode use
4726 the intersection of signed scaled 7-bit and signed 9-bit offset. */
4728 aarch64_legitimize_address_displacement (rtx
*disp
, rtx
*off
, machine_mode mode
)
4730 HOST_WIDE_INT offset
= INTVAL (*disp
);
4733 if (mode
== TImode
|| mode
== TFmode
)
4734 base
= (offset
+ 0x100) & ~0x1f8;
4735 else if ((offset
& (GET_MODE_SIZE (mode
) - 1)) != 0)
4736 base
= (offset
+ 0x100) & ~0x1ff;
4738 base
= offset
& ~(GET_MODE_SIZE (mode
) < 4 ? 0xfff : 0x3ffc);
4740 *off
= GEN_INT (base
);
4741 *disp
= GEN_INT (offset
- base
);
4745 /* Return the binary representation of floating point constant VALUE in INTVAL.
4746 If the value cannot be converted, return false without setting INTVAL.
4747 The conversion is done in the given MODE. */
4749 aarch64_reinterpret_float_as_int (rtx value
, unsigned HOST_WIDE_INT
*intval
)
4752 /* We make a general exception for 0. */
4753 if (aarch64_float_const_zero_rtx_p (value
))
4759 machine_mode mode
= GET_MODE (value
);
4760 if (GET_CODE (value
) != CONST_DOUBLE
4761 || !SCALAR_FLOAT_MODE_P (mode
)
4762 || GET_MODE_BITSIZE (mode
) > HOST_BITS_PER_WIDE_INT
4763 /* Only support up to DF mode. */
4764 || GET_MODE_BITSIZE (mode
) > GET_MODE_BITSIZE (DFmode
))
4767 unsigned HOST_WIDE_INT ival
= 0;
4770 real_to_target (res
,
4771 CONST_DOUBLE_REAL_VALUE (value
),
4772 REAL_MODE_FORMAT (mode
));
4776 int order
= BYTES_BIG_ENDIAN
? 1 : 0;
4777 ival
= zext_hwi (res
[order
], 32);
4778 ival
|= (zext_hwi (res
[1 - order
], 32) << 32);
4781 ival
= zext_hwi (res
[0], 32);
4787 /* Return TRUE if rtx X is an immediate constant that can be moved using a
4788 single MOV(+MOVK) followed by an FMOV. */
4790 aarch64_float_const_rtx_p (rtx x
)
4792 machine_mode mode
= GET_MODE (x
);
4793 if (mode
== VOIDmode
)
4796 /* Determine whether it's cheaper to write float constants as
4797 mov/movk pairs over ldr/adrp pairs. */
4798 unsigned HOST_WIDE_INT ival
;
4800 if (GET_CODE (x
) == CONST_DOUBLE
4801 && SCALAR_FLOAT_MODE_P (mode
)
4802 && aarch64_reinterpret_float_as_int (x
, &ival
))
4804 scalar_int_mode imode
= (mode
== HFmode
4806 : int_mode_for_mode (mode
).require ());
4807 int num_instr
= aarch64_internal_mov_immediate
4808 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
4809 return num_instr
< 3;
4815 /* Return TRUE if rtx X is immediate constant 0.0 */
4817 aarch64_float_const_zero_rtx_p (rtx x
)
4819 if (GET_MODE (x
) == VOIDmode
)
4822 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x
)))
4823 return !HONOR_SIGNED_ZEROS (GET_MODE (x
));
4824 return real_equal (CONST_DOUBLE_REAL_VALUE (x
), &dconst0
);
4827 /* Return TRUE if rtx X is immediate constant that fits in a single
4828 MOVI immediate operation. */
4830 aarch64_can_const_movi_rtx_p (rtx x
, machine_mode mode
)
4836 scalar_int_mode imode
;
4837 unsigned HOST_WIDE_INT ival
;
4839 if (GET_CODE (x
) == CONST_DOUBLE
4840 && SCALAR_FLOAT_MODE_P (mode
))
4842 if (!aarch64_reinterpret_float_as_int (x
, &ival
))
4845 /* We make a general exception for 0. */
4846 if (aarch64_float_const_zero_rtx_p (x
))
4849 imode
= int_mode_for_mode (mode
).require ();
4851 else if (GET_CODE (x
) == CONST_INT
4852 && is_a
<scalar_int_mode
> (mode
, &imode
))
4857 /* use a 64 bit mode for everything except for DI/DF mode, where we use
4858 a 128 bit vector mode. */
4859 int width
= GET_MODE_BITSIZE (imode
) == 64 ? 128 : 64;
4861 vmode
= aarch64_simd_container_mode (imode
, width
);
4862 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, ival
);
4864 return aarch64_simd_valid_immediate (v_op
, vmode
, false, NULL
);
4868 /* Return the fixed registers used for condition codes. */
4871 aarch64_fixed_condition_code_regs (unsigned int *p1
, unsigned int *p2
)
4874 *p2
= INVALID_REGNUM
;
4878 /* This function is used by the call expanders of the machine description.
4879 RESULT is the register in which the result is returned. It's NULL for
4880 "call" and "sibcall".
4881 MEM is the location of the function call.
4882 SIBCALL indicates whether this function call is normal call or sibling call.
4883 It will generate different pattern accordingly. */
4886 aarch64_expand_call (rtx result
, rtx mem
, bool sibcall
)
4888 rtx call
, callee
, tmp
;
4892 gcc_assert (MEM_P (mem
));
4893 callee
= XEXP (mem
, 0);
4894 mode
= GET_MODE (callee
);
4895 gcc_assert (mode
== Pmode
);
4897 /* Decide if we should generate indirect calls by loading the
4898 address of the callee into a register before performing
4899 the branch-and-link. */
4900 if (SYMBOL_REF_P (callee
)
4901 ? (aarch64_is_long_call_p (callee
)
4902 || aarch64_is_noplt_call_p (callee
))
4904 XEXP (mem
, 0) = force_reg (mode
, callee
);
4906 call
= gen_rtx_CALL (VOIDmode
, mem
, const0_rtx
);
4908 if (result
!= NULL_RTX
)
4909 call
= gen_rtx_SET (result
, call
);
4914 tmp
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (Pmode
, LR_REGNUM
));
4916 vec
= gen_rtvec (2, call
, tmp
);
4917 call
= gen_rtx_PARALLEL (VOIDmode
, vec
);
4919 aarch64_emit_call_insn (call
);
4922 /* Emit call insn with PAT and do aarch64-specific handling. */
4925 aarch64_emit_call_insn (rtx pat
)
4927 rtx insn
= emit_call_insn (pat
);
4929 rtx
*fusage
= &CALL_INSN_FUNCTION_USAGE (insn
);
4930 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP0_REGNUM
));
4931 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP1_REGNUM
));
4935 aarch64_select_cc_mode (RTX_CODE code
, rtx x
, rtx y
)
4937 /* All floating point compares return CCFP if it is an equality
4938 comparison, and CCFPE otherwise. */
4939 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_FLOAT
)
4966 /* Equality comparisons of short modes against zero can be performed
4967 using the TST instruction with the appropriate bitmask. */
4968 if (y
== const0_rtx
&& REG_P (x
)
4969 && (code
== EQ
|| code
== NE
)
4970 && (GET_MODE (x
) == HImode
|| GET_MODE (x
) == QImode
))
4973 /* Similarly, comparisons of zero_extends from shorter modes can
4974 be performed using an ANDS with an immediate mask. */
4975 if (y
== const0_rtx
&& GET_CODE (x
) == ZERO_EXTEND
4976 && (GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
4977 && (GET_MODE (XEXP (x
, 0)) == HImode
|| GET_MODE (XEXP (x
, 0)) == QImode
)
4978 && (code
== EQ
|| code
== NE
))
4981 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
4983 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
)
4984 && (GET_CODE (x
) == PLUS
|| GET_CODE (x
) == MINUS
|| GET_CODE (x
) == AND
4985 || GET_CODE (x
) == NEG
4986 || (GET_CODE (x
) == ZERO_EXTRACT
&& CONST_INT_P (XEXP (x
, 1))
4987 && CONST_INT_P (XEXP (x
, 2)))))
4990 /* A compare with a shifted operand. Because of canonicalization,
4991 the comparison will have to be swapped when we emit the assembly
4993 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
4994 && (REG_P (y
) || GET_CODE (y
) == SUBREG
|| y
== const0_rtx
)
4995 && (GET_CODE (x
) == ASHIFT
|| GET_CODE (x
) == ASHIFTRT
4996 || GET_CODE (x
) == LSHIFTRT
4997 || GET_CODE (x
) == ZERO_EXTEND
|| GET_CODE (x
) == SIGN_EXTEND
))
5000 /* Similarly for a negated operand, but we can only do this for
5002 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
5003 && (REG_P (y
) || GET_CODE (y
) == SUBREG
)
5004 && (code
== EQ
|| code
== NE
)
5005 && GET_CODE (x
) == NEG
)
5008 /* A test for unsigned overflow. */
5009 if ((GET_MODE (x
) == DImode
|| GET_MODE (x
) == TImode
)
5011 && GET_CODE (x
) == PLUS
5012 && GET_CODE (y
) == ZERO_EXTEND
)
5015 /* For everything else, return CCmode. */
5020 aarch64_get_condition_code_1 (machine_mode
, enum rtx_code
);
5023 aarch64_get_condition_code (rtx x
)
5025 machine_mode mode
= GET_MODE (XEXP (x
, 0));
5026 enum rtx_code comp_code
= GET_CODE (x
);
5028 if (GET_MODE_CLASS (mode
) != MODE_CC
)
5029 mode
= SELECT_CC_MODE (comp_code
, XEXP (x
, 0), XEXP (x
, 1));
5030 return aarch64_get_condition_code_1 (mode
, comp_code
);
5034 aarch64_get_condition_code_1 (machine_mode mode
, enum rtx_code comp_code
)
5042 case GE
: return AARCH64_GE
;
5043 case GT
: return AARCH64_GT
;
5044 case LE
: return AARCH64_LS
;
5045 case LT
: return AARCH64_MI
;
5046 case NE
: return AARCH64_NE
;
5047 case EQ
: return AARCH64_EQ
;
5048 case ORDERED
: return AARCH64_VC
;
5049 case UNORDERED
: return AARCH64_VS
;
5050 case UNLT
: return AARCH64_LT
;
5051 case UNLE
: return AARCH64_LE
;
5052 case UNGT
: return AARCH64_HI
;
5053 case UNGE
: return AARCH64_PL
;
5061 case NE
: return AARCH64_NE
;
5062 case EQ
: return AARCH64_EQ
;
5063 case GE
: return AARCH64_GE
;
5064 case GT
: return AARCH64_GT
;
5065 case LE
: return AARCH64_LE
;
5066 case LT
: return AARCH64_LT
;
5067 case GEU
: return AARCH64_CS
;
5068 case GTU
: return AARCH64_HI
;
5069 case LEU
: return AARCH64_LS
;
5070 case LTU
: return AARCH64_CC
;
5078 case NE
: return AARCH64_NE
;
5079 case EQ
: return AARCH64_EQ
;
5080 case GE
: return AARCH64_LE
;
5081 case GT
: return AARCH64_LT
;
5082 case LE
: return AARCH64_GE
;
5083 case LT
: return AARCH64_GT
;
5084 case GEU
: return AARCH64_LS
;
5085 case GTU
: return AARCH64_CC
;
5086 case LEU
: return AARCH64_CS
;
5087 case LTU
: return AARCH64_HI
;
5095 case NE
: return AARCH64_NE
;
5096 case EQ
: return AARCH64_EQ
;
5097 case GE
: return AARCH64_PL
;
5098 case LT
: return AARCH64_MI
;
5106 case NE
: return AARCH64_NE
;
5107 case EQ
: return AARCH64_EQ
;
5115 case NE
: return AARCH64_CS
;
5116 case EQ
: return AARCH64_CC
;
5129 aarch64_const_vec_all_same_in_range_p (rtx x
,
5130 HOST_WIDE_INT minval
,
5131 HOST_WIDE_INT maxval
)
5133 HOST_WIDE_INT firstval
;
5136 if (GET_CODE (x
) != CONST_VECTOR
5137 || GET_MODE_CLASS (GET_MODE (x
)) != MODE_VECTOR_INT
)
5140 firstval
= INTVAL (CONST_VECTOR_ELT (x
, 0));
5141 if (firstval
< minval
|| firstval
> maxval
)
5144 count
= CONST_VECTOR_NUNITS (x
);
5145 for (i
= 1; i
< count
; i
++)
5146 if (INTVAL (CONST_VECTOR_ELT (x
, i
)) != firstval
)
5153 aarch64_const_vec_all_same_int_p (rtx x
, HOST_WIDE_INT val
)
5155 return aarch64_const_vec_all_same_in_range_p (x
, val
, val
);
/* Bit positions of the NZCV flags as encoded in the immediate field of a
   conditional-compare (CCMP/CCMN) instruction.  */
#define AARCH64_CC_V 1
#define AARCH64_CC_C (1 << 1)
#define AARCH64_CC_Z (1 << 2)
#define AARCH64_CC_N (1 << 3)

/* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
static const int aarch64_nzcv_codes[] =
{
  0,		/* EQ, Z == 1.  */
  AARCH64_CC_Z,	/* NE, Z == 0.  */
  0,		/* CS, C == 1.  */
  AARCH64_CC_C,	/* CC, C == 0.  */
  0,		/* MI, N == 1.  */
  AARCH64_CC_N, /* PL, N == 0.  */
  0,		/* VS, V == 1.  */
  AARCH64_CC_V, /* VC, V == 0.  */
  0,		/* HI, C ==1 && Z == 0.  */
  AARCH64_CC_C,	/* LS, !(C == 1 && Z == 0).  */
  AARCH64_CC_V,	/* GE, N == V.  */
  0,		/* LT, N != V.  */
  AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
  0,		/* LE, !(Z == 0 && N == V).  */
  0,		/* AL, Any.  */
  0		/* NV, Any.  */
};
5186 /* Print operand X to file F in a target specific manner according to CODE.
5187 The acceptable formatting commands given by CODE are:
5188 'c': An integer or symbol address without a preceding #
5190 'e': Print the sign/zero-extend size as a character 8->b,
5192 'p': Prints N such that 2^N == X (X must be power of 2 and
5194 'P': Print the number of non-zero bits in X (a const_int).
5195 'H': Print the higher numbered register of a pair (TImode)
5197 'm': Print a condition (eq, ne, etc).
5198 'M': Same as 'm', but invert condition.
5199 'b/h/s/d/q': Print a scalar FP/SIMD register name.
5200 'S/T/U/V': Print a FP/SIMD register name for a register list.
5201 The register printed is the FP/SIMD register name
5202 of X + 0/1/2/3 for S/T/U/V.
5203 'R': Print a scalar FP/SIMD register name + 1.
5204 'X': Print bottom 16 bits of integer constant in hex.
5205 'w/x': Print a general register name or the zero register
5207 '0': Print a normal operand, if it's a general register,
5208 then we assume DImode.
5209 'k': Print NZCV for conditional compare instructions.
5210 'A': Output address constant representing the first
5211 argument of X, specifying a relocation offset
5213 'L': Output constant address specified by X
5214 with a relocation offset if appropriate.
5215 'G': Prints address of X, specifying a PC relative
5216 relocation mode if appropriate. */
5219 aarch64_print_operand (FILE *f
, rtx x
, int code
)
5224 switch (GET_CODE (x
))
5227 fprintf (f
, HOST_WIDE_INT_PRINT_DEC
, INTVAL (x
));
5231 output_addr_const (f
, x
);
5235 if (GET_CODE (XEXP (x
, 0)) == PLUS
5236 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
)
5238 output_addr_const (f
, x
);
5244 output_operand_lossage ("Unsupported operand for code '%c'", code
);
5252 if (!CONST_INT_P (x
)
5253 || (n
= exact_log2 (INTVAL (x
) & ~7)) <= 0)
5255 output_operand_lossage ("invalid operand for '%%%c'", code
);
5271 output_operand_lossage ("invalid operand for '%%%c'", code
);
5281 if (!CONST_INT_P (x
) || (n
= exact_log2 (INTVAL (x
))) < 0)
5283 output_operand_lossage ("invalid operand for '%%%c'", code
);
5287 asm_fprintf (f
, "%d", n
);
5292 if (!CONST_INT_P (x
))
5294 output_operand_lossage ("invalid operand for '%%%c'", code
);
5298 asm_fprintf (f
, "%u", popcount_hwi (INTVAL (x
)));
5302 if (!REG_P (x
) || !GP_REGNUM_P (REGNO (x
) + 1))
5304 output_operand_lossage ("invalid operand for '%%%c'", code
);
5308 asm_fprintf (f
, "%s", reg_names
[REGNO (x
) + 1]);
5315 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
5316 if (x
== const_true_rtx
)
5323 if (!COMPARISON_P (x
))
5325 output_operand_lossage ("invalid operand for '%%%c'", code
);
5329 cond_code
= aarch64_get_condition_code (x
);
5330 gcc_assert (cond_code
>= 0);
5332 cond_code
= AARCH64_INVERSE_CONDITION_CODE (cond_code
);
5333 fputs (aarch64_condition_codes
[cond_code
], f
);
5342 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
5344 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
5347 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - V0_REGNUM
);
5354 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
5356 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
5359 asm_fprintf (f
, "v%d", REGNO (x
) - V0_REGNUM
+ (code
- 'S'));
5363 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
5365 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
5368 asm_fprintf (f
, "q%d", REGNO (x
) - V0_REGNUM
+ 1);
5372 if (!CONST_INT_P (x
))
5374 output_operand_lossage ("invalid operand for '%%%c'", code
);
5377 asm_fprintf (f
, "0x%wx", UINTVAL (x
) & 0xffff);
5383 || (CONST_DOUBLE_P (x
) && aarch64_float_const_zero_rtx_p (x
)))
5385 asm_fprintf (f
, "%czr", code
);
5389 if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
5391 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - R0_REGNUM
);
5395 if (REG_P (x
) && REGNO (x
) == SP_REGNUM
)
5397 asm_fprintf (f
, "%ssp", code
== 'w' ? "w" : "");
5406 output_operand_lossage ("missing operand");
5410 switch (GET_CODE (x
))
5413 asm_fprintf (f
, "%s", reg_names
[REGNO (x
)]);
5417 output_address (GET_MODE (x
), XEXP (x
, 0));
5418 /* Check all memory references are Pmode - even with ILP32. */
5419 gcc_assert (GET_MODE (XEXP (x
, 0)) == Pmode
);
5425 output_addr_const (asm_out_file
, x
);
5429 asm_fprintf (f
, "%wd", INTVAL (x
));
5433 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
5436 aarch64_const_vec_all_same_in_range_p (x
,
5438 HOST_WIDE_INT_MAX
));
5439 asm_fprintf (f
, "%wd", INTVAL (CONST_VECTOR_ELT (x
, 0)));
5441 else if (aarch64_simd_imm_zero_p (x
, GET_MODE (x
)))
5450 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5451 be getting CONST_DOUBLEs holding integers. */
5452 gcc_assert (GET_MODE (x
) != VOIDmode
);
5453 if (aarch64_float_const_zero_rtx_p (x
))
5458 else if (aarch64_float_const_representable_p (x
))
5461 char float_buf
[buf_size
] = {'\0'};
5462 real_to_decimal_for_mode (float_buf
,
5463 CONST_DOUBLE_REAL_VALUE (x
),
5466 asm_fprintf (asm_out_file
, "%s", float_buf
);
5470 output_operand_lossage ("invalid constant");
5473 output_operand_lossage ("invalid operand");
5479 if (GET_CODE (x
) == HIGH
)
5482 switch (aarch64_classify_symbolic_expression (x
))
5484 case SYMBOL_SMALL_GOT_4G
:
5485 asm_fprintf (asm_out_file
, ":got:");
5488 case SYMBOL_SMALL_TLSGD
:
5489 asm_fprintf (asm_out_file
, ":tlsgd:");
5492 case SYMBOL_SMALL_TLSDESC
:
5493 asm_fprintf (asm_out_file
, ":tlsdesc:");
5496 case SYMBOL_SMALL_TLSIE
:
5497 asm_fprintf (asm_out_file
, ":gottprel:");
5500 case SYMBOL_TLSLE24
:
5501 asm_fprintf (asm_out_file
, ":tprel:");
5504 case SYMBOL_TINY_GOT
:
5511 output_addr_const (asm_out_file
, x
);
5515 switch (aarch64_classify_symbolic_expression (x
))
5517 case SYMBOL_SMALL_GOT_4G
:
5518 asm_fprintf (asm_out_file
, ":lo12:");
5521 case SYMBOL_SMALL_TLSGD
:
5522 asm_fprintf (asm_out_file
, ":tlsgd_lo12:");
5525 case SYMBOL_SMALL_TLSDESC
:
5526 asm_fprintf (asm_out_file
, ":tlsdesc_lo12:");
5529 case SYMBOL_SMALL_TLSIE
:
5530 asm_fprintf (asm_out_file
, ":gottprel_lo12:");
5533 case SYMBOL_TLSLE12
:
5534 asm_fprintf (asm_out_file
, ":tprel_lo12:");
5537 case SYMBOL_TLSLE24
:
5538 asm_fprintf (asm_out_file
, ":tprel_lo12_nc:");
5541 case SYMBOL_TINY_GOT
:
5542 asm_fprintf (asm_out_file
, ":got:");
5545 case SYMBOL_TINY_TLSIE
:
5546 asm_fprintf (asm_out_file
, ":gottprel:");
5552 output_addr_const (asm_out_file
, x
);
5556 switch (aarch64_classify_symbolic_expression (x
))
5558 case SYMBOL_TLSLE24
:
5559 asm_fprintf (asm_out_file
, ":tprel_hi12:");
5564 output_addr_const (asm_out_file
, x
);
5569 HOST_WIDE_INT cond_code
;
5571 if (!CONST_INT_P (x
))
5573 output_operand_lossage ("invalid operand for '%%%c'", code
);
5577 cond_code
= INTVAL (x
);
5578 gcc_assert (cond_code
>= 0 && cond_code
<= AARCH64_NV
);
5579 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
]);
5584 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
5590 aarch64_print_operand_address (FILE *f
, machine_mode mode
, rtx x
)
5592 struct aarch64_address_info addr
;
5594 if (aarch64_classify_address (&addr
, x
, mode
, MEM
, true))
5597 case ADDRESS_REG_IMM
:
5598 if (addr
.offset
== const0_rtx
)
5599 asm_fprintf (f
, "[%s]", reg_names
[REGNO (addr
.base
)]);
5601 asm_fprintf (f
, "[%s, %wd]", reg_names
[REGNO (addr
.base
)],
5602 INTVAL (addr
.offset
));
5605 case ADDRESS_REG_REG
:
5606 if (addr
.shift
== 0)
5607 asm_fprintf (f
, "[%s, %s]", reg_names
[REGNO (addr
.base
)],
5608 reg_names
[REGNO (addr
.offset
)]);
5610 asm_fprintf (f
, "[%s, %s, lsl %u]", reg_names
[REGNO (addr
.base
)],
5611 reg_names
[REGNO (addr
.offset
)], addr
.shift
);
5614 case ADDRESS_REG_UXTW
:
5615 if (addr
.shift
== 0)
5616 asm_fprintf (f
, "[%s, w%d, uxtw]", reg_names
[REGNO (addr
.base
)],
5617 REGNO (addr
.offset
) - R0_REGNUM
);
5619 asm_fprintf (f
, "[%s, w%d, uxtw %u]", reg_names
[REGNO (addr
.base
)],
5620 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
5623 case ADDRESS_REG_SXTW
:
5624 if (addr
.shift
== 0)
5625 asm_fprintf (f
, "[%s, w%d, sxtw]", reg_names
[REGNO (addr
.base
)],
5626 REGNO (addr
.offset
) - R0_REGNUM
);
5628 asm_fprintf (f
, "[%s, w%d, sxtw %u]", reg_names
[REGNO (addr
.base
)],
5629 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
5632 case ADDRESS_REG_WB
:
5633 switch (GET_CODE (x
))
5636 asm_fprintf (f
, "[%s, %d]!", reg_names
[REGNO (addr
.base
)],
5637 GET_MODE_SIZE (mode
));
5640 asm_fprintf (f
, "[%s], %d", reg_names
[REGNO (addr
.base
)],
5641 GET_MODE_SIZE (mode
));
5644 asm_fprintf (f
, "[%s, -%d]!", reg_names
[REGNO (addr
.base
)],
5645 GET_MODE_SIZE (mode
));
5648 asm_fprintf (f
, "[%s], -%d", reg_names
[REGNO (addr
.base
)],
5649 GET_MODE_SIZE (mode
));
5652 asm_fprintf (f
, "[%s, %wd]!", reg_names
[REGNO (addr
.base
)],
5653 INTVAL (addr
.offset
));
5656 asm_fprintf (f
, "[%s], %wd", reg_names
[REGNO (addr
.base
)],
5657 INTVAL (addr
.offset
));
5664 case ADDRESS_LO_SUM
:
5665 asm_fprintf (f
, "[%s, #:lo12:", reg_names
[REGNO (addr
.base
)]);
5666 output_addr_const (f
, addr
.offset
);
5667 asm_fprintf (f
, "]");
5670 case ADDRESS_SYMBOLIC
:
5674 output_addr_const (f
, x
);
5678 aarch64_label_mentioned_p (rtx x
)
5683 if (GET_CODE (x
) == LABEL_REF
)
5686 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5687 referencing instruction, but they are constant offsets, not
5689 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
5692 fmt
= GET_RTX_FORMAT (GET_CODE (x
));
5693 for (i
= GET_RTX_LENGTH (GET_CODE (x
)) - 1; i
>= 0; i
--)
5699 for (j
= XVECLEN (x
, i
) - 1; j
>= 0; j
--)
5700 if (aarch64_label_mentioned_p (XVECEXP (x
, i
, j
)))
5703 else if (fmt
[i
] == 'e' && aarch64_label_mentioned_p (XEXP (x
, i
)))
5710 /* Implement REGNO_REG_CLASS. */
5713 aarch64_regno_regclass (unsigned regno
)
5715 if (GP_REGNUM_P (regno
))
5716 return GENERAL_REGS
;
5718 if (regno
== SP_REGNUM
)
5721 if (regno
== FRAME_POINTER_REGNUM
5722 || regno
== ARG_POINTER_REGNUM
)
5723 return POINTER_REGS
;
5725 if (FP_REGNUM_P (regno
))
5726 return FP_LO_REGNUM_P (regno
) ? FP_LO_REGS
: FP_REGS
;
5732 aarch64_legitimize_address (rtx x
, rtx
/* orig_x */, machine_mode mode
)
5734 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5735 where mask is selected by alignment and size of the offset.
5736 We try to pick as large a range for the offset as possible to
5737 maximize the chance of a CSE. However, for aligned addresses
5738 we limit the range to 4k so that structures with different sized
5739 elements are likely to use the same base. We need to be careful
5740 not to split a CONST for some forms of address expression, otherwise
5741 it will generate sub-optimal code. */
5743 if (GET_CODE (x
) == PLUS
&& CONST_INT_P (XEXP (x
, 1)))
5745 rtx base
= XEXP (x
, 0);
5746 rtx offset_rtx
= XEXP (x
, 1);
5747 HOST_WIDE_INT offset
= INTVAL (offset_rtx
);
5749 if (GET_CODE (base
) == PLUS
)
5751 rtx op0
= XEXP (base
, 0);
5752 rtx op1
= XEXP (base
, 1);
5754 /* Force any scaling into a temp for CSE. */
5755 op0
= force_reg (Pmode
, op0
);
5756 op1
= force_reg (Pmode
, op1
);
5758 /* Let the pointer register be in op0. */
5759 if (REG_POINTER (op1
))
5760 std::swap (op0
, op1
);
5762 /* If the pointer is virtual or frame related, then we know that
5763 virtual register instantiation or register elimination is going
5764 to apply a second constant. We want the two constants folded
5765 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5766 if (virt_or_elim_regno_p (REGNO (op0
)))
5768 base
= expand_binop (Pmode
, add_optab
, op0
, offset_rtx
,
5769 NULL_RTX
, true, OPTAB_DIRECT
);
5770 return gen_rtx_PLUS (Pmode
, base
, op1
);
5773 /* Otherwise, in order to encourage CSE (and thence loop strength
5774 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
5775 base
= expand_binop (Pmode
, add_optab
, op0
, op1
,
5776 NULL_RTX
, true, OPTAB_DIRECT
);
5777 x
= gen_rtx_PLUS (Pmode
, base
, offset_rtx
);
5780 /* Does it look like we'll need a 16-byte load/store-pair operation? */
5781 HOST_WIDE_INT base_offset
;
5782 if (GET_MODE_SIZE (mode
) > 16)
5783 base_offset
= (offset
+ 0x400) & ~0x7f0;
5784 /* For offsets aren't a multiple of the access size, the limit is
5786 else if (offset
& (GET_MODE_SIZE (mode
) - 1))
5788 base_offset
= (offset
+ 0x100) & ~0x1ff;
5790 /* BLKmode typically uses LDP of X-registers. */
5791 if (mode
== BLKmode
)
5792 base_offset
= (offset
+ 512) & ~0x3ff;
5794 /* Small negative offsets are supported. */
5795 else if (IN_RANGE (offset
, -256, 0))
5797 else if (mode
== TImode
|| mode
== TFmode
)
5798 base_offset
= (offset
+ 0x100) & ~0x1ff;
5799 /* Use 12-bit offset by access size. */
5801 base_offset
= offset
& (~0xfff * GET_MODE_SIZE (mode
));
5803 if (base_offset
!= 0)
5805 base
= plus_constant (Pmode
, base
, base_offset
);
5806 base
= force_operand (base
, NULL_RTX
);
5807 return plus_constant (Pmode
, base
, offset
- base_offset
);
5814 /* Return the reload icode required for a constant pool in mode. */
5815 static enum insn_code
5816 aarch64_constant_pool_reload_icode (machine_mode mode
)
5821 return CODE_FOR_aarch64_reload_movcpsfdi
;
5824 return CODE_FOR_aarch64_reload_movcpdfdi
;
5827 return CODE_FOR_aarch64_reload_movcptfdi
;
5830 return CODE_FOR_aarch64_reload_movcpv8qidi
;
5833 return CODE_FOR_aarch64_reload_movcpv16qidi
;
5836 return CODE_FOR_aarch64_reload_movcpv4hidi
;
5839 return CODE_FOR_aarch64_reload_movcpv8hidi
;
5842 return CODE_FOR_aarch64_reload_movcpv2sidi
;
5845 return CODE_FOR_aarch64_reload_movcpv4sidi
;
5848 return CODE_FOR_aarch64_reload_movcpv2didi
;
5851 return CODE_FOR_aarch64_reload_movcpv2dfdi
;
5860 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED
, rtx x
,
5863 secondary_reload_info
*sri
)
5866 /* If we have to disable direct literal pool loads and stores because the
5867 function is too big, then we need a scratch register. */
5868 if (MEM_P (x
) && GET_CODE (x
) == SYMBOL_REF
&& CONSTANT_POOL_ADDRESS_P (x
)
5869 && (SCALAR_FLOAT_MODE_P (GET_MODE (x
))
5870 || targetm
.vector_mode_supported_p (GET_MODE (x
)))
5871 && !aarch64_pcrelative_literal_loads
)
5873 sri
->icode
= aarch64_constant_pool_reload_icode (mode
);
5877 /* Without the TARGET_SIMD instructions we cannot move a Q register
5878 to a Q register directly. We need a scratch. */
5879 if (REG_P (x
) && (mode
== TFmode
|| mode
== TImode
) && mode
== GET_MODE (x
)
5880 && FP_REGNUM_P (REGNO (x
)) && !TARGET_SIMD
5881 && reg_class_subset_p (rclass
, FP_REGS
))
5884 sri
->icode
= CODE_FOR_aarch64_reload_movtf
;
5885 else if (mode
== TImode
)
5886 sri
->icode
= CODE_FOR_aarch64_reload_movti
;
5890 /* A TFmode or TImode memory access should be handled via an FP_REGS
5891 because AArch64 has richer addressing modes for LDR/STR instructions
5892 than LDP/STP instructions. */
5893 if (TARGET_FLOAT
&& rclass
== GENERAL_REGS
5894 && GET_MODE_SIZE (mode
) == 16 && MEM_P (x
))
5897 if (rclass
== FP_REGS
&& (mode
== TImode
|| mode
== TFmode
) && CONSTANT_P(x
))
5898 return GENERAL_REGS
;
5904 aarch64_can_eliminate (const int from
, const int to
)
5906 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5907 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5909 if (frame_pointer_needed
)
5911 if (from
== ARG_POINTER_REGNUM
&& to
== HARD_FRAME_POINTER_REGNUM
)
5913 if (from
== ARG_POINTER_REGNUM
&& to
== STACK_POINTER_REGNUM
)
5915 if (from
== FRAME_POINTER_REGNUM
&& to
== STACK_POINTER_REGNUM
5916 && !cfun
->calls_alloca
)
5918 if (from
== FRAME_POINTER_REGNUM
&& to
== HARD_FRAME_POINTER_REGNUM
)
5925 /* If we decided that we didn't need a leaf frame pointer but then used
5926 LR in the function, then we'll want a frame pointer after all, so
5927 prevent this elimination to ensure a frame pointer is used. */
5928 if (to
== STACK_POINTER_REGNUM
5929 && flag_omit_frame_pointer
== 2
5930 && flag_omit_leaf_frame_pointer
5931 && df_regs_ever_live_p (LR_REGNUM
))
5939 aarch64_initial_elimination_offset (unsigned from
, unsigned to
)
5941 aarch64_layout_frame ();
5943 if (to
== HARD_FRAME_POINTER_REGNUM
)
5945 if (from
== ARG_POINTER_REGNUM
)
5946 return cfun
->machine
->frame
.hard_fp_offset
;
5948 if (from
== FRAME_POINTER_REGNUM
)
5949 return cfun
->machine
->frame
.hard_fp_offset
5950 - cfun
->machine
->frame
.locals_offset
;
5953 if (to
== STACK_POINTER_REGNUM
)
5955 if (from
== FRAME_POINTER_REGNUM
)
5956 return cfun
->machine
->frame
.frame_size
5957 - cfun
->machine
->frame
.locals_offset
;
5960 return cfun
->machine
->frame
.frame_size
;
5963 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5967 aarch64_return_addr (int count
, rtx frame ATTRIBUTE_UNUSED
)
5971 return get_hard_reg_initial_val (Pmode
, LR_REGNUM
);
5976 aarch64_asm_trampoline_template (FILE *f
)
5980 asm_fprintf (f
, "\tldr\tw%d, .+16\n", IP1_REGNUM
- R0_REGNUM
);
5981 asm_fprintf (f
, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM
- R0_REGNUM
);
5985 asm_fprintf (f
, "\tldr\t%s, .+16\n", reg_names
[IP1_REGNUM
]);
5986 asm_fprintf (f
, "\tldr\t%s, .+20\n", reg_names
[STATIC_CHAIN_REGNUM
]);
5988 asm_fprintf (f
, "\tbr\t%s\n", reg_names
[IP1_REGNUM
]);
5989 assemble_aligned_integer (4, const0_rtx
);
5990 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
5991 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
5995 aarch64_trampoline_init (rtx m_tramp
, tree fndecl
, rtx chain_value
)
5997 rtx fnaddr
, mem
, a_tramp
;
5998 const int tramp_code_sz
= 16;
6000 /* Don't need to copy the trailing D-words, we fill those in below. */
6001 emit_block_move (m_tramp
, assemble_trampoline_template (),
6002 GEN_INT (tramp_code_sz
), BLOCK_OP_NORMAL
);
6003 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
);
6004 fnaddr
= XEXP (DECL_RTL (fndecl
), 0);
6005 if (GET_MODE (fnaddr
) != ptr_mode
)
6006 fnaddr
= convert_memory_address (ptr_mode
, fnaddr
);
6007 emit_move_insn (mem
, fnaddr
);
6009 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
+ POINTER_BYTES
);
6010 emit_move_insn (mem
, chain_value
);
6012 /* XXX We should really define a "clear_cache" pattern and use
6013 gen_clear_cache(). */
6014 a_tramp
= XEXP (m_tramp
, 0);
6015 emit_library_call (gen_rtx_SYMBOL_REF (Pmode
, "__clear_cache"),
6016 LCT_NORMAL
, VOIDmode
, a_tramp
, ptr_mode
,
6017 plus_constant (ptr_mode
, a_tramp
, TRAMPOLINE_SIZE
),
6021 static unsigned char
6022 aarch64_class_max_nregs (reg_class_t regclass
, machine_mode mode
)
6026 case CALLER_SAVE_REGS
:
6030 case POINTER_AND_FP_REGS
:
6034 aarch64_vector_mode_p (mode
)
6035 ? (GET_MODE_SIZE (mode
) + UNITS_PER_VREG
- 1) / UNITS_PER_VREG
6036 : (GET_MODE_SIZE (mode
) + UNITS_PER_WORD
- 1) / UNITS_PER_WORD
;
6050 aarch64_preferred_reload_class (rtx x
, reg_class_t regclass
)
6052 if (regclass
== POINTER_REGS
)
6053 return GENERAL_REGS
;
6055 if (regclass
== STACK_REG
)
6058 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x
)), POINTER_REGS
))
6064 /* Register eliminiation can result in a request for
6065 SP+constant->FP_REGS. We cannot support such operations which
6066 use SP as source and an FP_REG as destination, so reject out
6068 if (! reg_class_subset_p (regclass
, GENERAL_REGS
) && GET_CODE (x
) == PLUS
)
6070 rtx lhs
= XEXP (x
, 0);
6072 /* Look through a possible SUBREG introduced by ILP32. */
6073 if (GET_CODE (lhs
) == SUBREG
)
6074 lhs
= SUBREG_REG (lhs
);
6076 gcc_assert (REG_P (lhs
));
6077 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs
)),
/* Output label reference NAME to file F, applying the user-label
   prefix via %U.  Implements ASM_OUTPUT_LABELREF.  */

void
aarch64_asm_output_labelref (FILE* f, const char *name)
{
  asm_fprintf (f, "%U%s", name);
}
6092 aarch64_elf_asm_constructor (rtx symbol
, int priority
)
6094 if (priority
== DEFAULT_INIT_PRIORITY
)
6095 default_ctor_section_asm_out_constructor (symbol
, priority
);
6099 /* While priority is known to be in range [0, 65535], so 18 bytes
6100 would be enough, the compiler might not know that. To avoid
6101 -Wformat-truncation false positive, use a larger size. */
6103 snprintf (buf
, sizeof (buf
), ".init_array.%.5u", priority
);
6104 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
6105 switch_to_section (s
);
6106 assemble_align (POINTER_SIZE
);
6107 assemble_aligned_integer (POINTER_BYTES
, symbol
);
/* Emit SYMBOL into a destructor section; mirror of the constructor path
   above but targeting ".fini_array.NNNNN" for prioritized dtors.  */
6112 aarch64_elf_asm_destructor (rtx symbol
, int priority
)
6114 if (priority
== DEFAULT_INIT_PRIORITY
)
6115 default_dtor_section_asm_out_destructor (symbol
, priority
);
6119 /* While priority is known to be in range [0, 65535], so 18 bytes
6120 would be enough, the compiler might not know that. To avoid
6121 -Wformat-truncation false positive, use a larger size. */
6123 snprintf (buf
, sizeof (buf
), ".fini_array.%.5u", priority
);
6124 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
6125 switch_to_section (s
);
6126 assemble_align (POINTER_SIZE
);
6127 assemble_aligned_integer (POINTER_BYTES
, symbol
);
/* Emit the assembly for a casesi jump-table dispatch.  OPERANDS[2] is the
   jump-table label; the entry width of the following ADDR_DIFF_VEC selects
   one of four load/add template pairs (byte/half/word/word), after which we
   compute the target address relative to an internal "Lrtx" label and
   branch through register %3.  */
6132 aarch64_output_casesi (rtx
*operands
)
6136 rtx diff_vec
= PATTERN (NEXT_INSN (as_a
<rtx_insn
*> (operands
[2])));
6138 static const char *const patterns
[4][2] =
6141 "ldrb\t%w3, [%0,%w1,uxtw]",
6142 "add\t%3, %4, %w3, sxtb #2"
6145 "ldrh\t%w3, [%0,%w1,uxtw #1]",
6146 "add\t%3, %4, %w3, sxth #2"
6149 "ldr\t%w3, [%0,%w1,uxtw #2]",
6150 "add\t%3, %4, %w3, sxtw #2"
6152 /* We assume that DImode is only generated when not optimizing and
6153 that we don't really need 64-bit address offsets. That would
6154 imply an object file with 8GB of code in a single function! */
6156 "ldr\t%w3, [%0,%w1,uxtw #2]",
6157 "add\t%3, %4, %w3, sxtw #2"
6161 gcc_assert (GET_CODE (diff_vec
) == ADDR_DIFF_VEC
);
6163 scalar_int_mode mode
= as_a
<scalar_int_mode
> (GET_MODE (diff_vec
));
6164 index
= exact_log2 (GET_MODE_SIZE (mode
));
6166 gcc_assert (index
>= 0 && index
<= 3);
6168 /* Need to implement table size reduction, by changing the code below. */
6169 output_asm_insn (patterns
[index
][0], operands
);
6170 ASM_GENERATE_INTERNAL_LABEL (label
, "Lrtx", CODE_LABEL_NUMBER (operands
[2]));
6171 snprintf (buf
, sizeof (buf
),
6172 "adr\t%%4, %s", targetm
.strip_name_encoding (label
));
6173 output_asm_insn (buf
, operands
);
6174 output_asm_insn (patterns
[index
][1], operands
);
6175 output_asm_insn ("br\t%3", operands
);
6176 assemble_label (asm_out_file
, label
);
6181 /* Return size in bits of an arithmetic operand which is shifted/scaled and
6182 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
/* Tries widths 8, 16, 32: MASK must equal the all-ones pattern of that
   width left-shifted by SHIFT (shift amount limited to 0..3).  */
6186 aarch64_uxt_size (int shift
, HOST_WIDE_INT mask
)
6188 if (shift
>= 0 && shift
<= 3)
6191 for (size
= 8; size
<= 32; size
*= 2)
6193 HOST_WIDE_INT bits
= ((HOST_WIDE_INT
)1U << size
) - 1;
6194 if (mask
== bits
<< shift
)
6201 /* Constant pools are per function only when PC relative
6202 literal loads are true or we are in the large memory
/* Predicate: true when literal pools must be placed per-function rather
   than shared, i.e. with PC-relative literal loads or the large code
   model.  */
6206 aarch64_can_use_per_function_literal_pools_p (void)
6208 return (aarch64_pcrelative_literal_loads
6209 || aarch64_cmodel
== AARCH64_CMODEL_LARGE
);
/* Implement TARGET_USE_BLOCKS_FOR_CONSTANT_P.  Both arguments are
   deliberately unnamed (unused).  */
6213 aarch64_use_blocks_for_constant_p (machine_mode
, const_rtx
)
6215 /* FIXME: In an ideal world this would work similarly
6216 to the logic in aarch64_select_rtx_section but this
6217 breaks bootstrap in gcc go. For now we work around
6218 this by returning false here. */
6222 /* Select appropriate section for constants depending
6223 on where we place literal pools. */
/* Per-function literal pools live in the function's own section;
   otherwise defer to the generic ELF choice.  */
6226 aarch64_select_rtx_section (machine_mode mode
,
6228 unsigned HOST_WIDE_INT align
)
6230 if (aarch64_can_use_per_function_literal_pools_p ())
6231 return function_section (current_function_decl
);
6233 return default_elf_select_rtx_section (mode
, x
, align
);
6236 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
/* After emitting a per-function literal pool whose size OFFSET is not a
   multiple of 4, realign to a 4-byte (2^2) boundary so following
   instructions stay aligned.  */
6238 aarch64_asm_output_pool_epilogue (FILE *f
, const char *, tree
,
6239 HOST_WIDE_INT offset
)
6241 /* When using per-function literal pools, we must ensure that any code
6242 section is aligned to the minimal instruction length, lest we get
6243 errors from the assembler re "unaligned instructions". */
6244 if ((offset
& 3) && aarch64_can_use_per_function_literal_pools_p ())
6245 ASM_OUTPUT_ALIGN (f
, 2);
6250 /* Helper function for rtx cost calculation. Strip a shift expression
6251 from X. Returns the inner operand if successful, or the original
6252 expression on failure. */
/* Accepts shift-by-constant (including rotates, convertible to ROR) and
   multiply by a power of two (the canonical form of a left shift).  */
6254 aarch64_strip_shift (rtx x
)
6258 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
6259 we can convert both to ROR during final output. */
6260 if ((GET_CODE (op
) == ASHIFT
6261 || GET_CODE (op
) == ASHIFTRT
6262 || GET_CODE (op
) == LSHIFTRT
6263 || GET_CODE (op
) == ROTATERT
6264 || GET_CODE (op
) == ROTATE
)
6265 && CONST_INT_P (XEXP (op
, 1)))
6266 return XEXP (op
, 0);
6268 if (GET_CODE (op
) == MULT
6269 && CONST_INT_P (XEXP (op
, 1))
6270 && ((unsigned) exact_log2 (INTVAL (XEXP (op
, 1)))) < 64)
6271 return XEXP (op
, 0);
6276 /* Helper function for rtx cost calculation. Strip an extend
6277 expression from X. Returns the inner operand if successful, or the
6278 original expression on failure. We deal with a number of possible
6279 canonicalization variations here. If STRIP_SHIFT is true, then
6280 we can strip off a shift also. */
6282 aarch64_strip_extend (rtx x
, bool strip_shift
)
6284 scalar_int_mode mode
;
/* Only scalar integer modes can carry an extend we understand.  */
6287 if (!is_a
<scalar_int_mode
> (GET_MODE (op
), &mode
))
6290 /* Zero and sign extraction of a widened value. */
6291 if ((GET_CODE (op
) == ZERO_EXTRACT
|| GET_CODE (op
) == SIGN_EXTRACT
)
6292 && XEXP (op
, 2) == const0_rtx
6293 && GET_CODE (XEXP (op
, 0)) == MULT
6294 && aarch64_is_extend_from_extract (mode
, XEXP (XEXP (op
, 0), 1),
6296 return XEXP (XEXP (op
, 0), 0);
6298 /* It can also be represented (for zero-extend) as an AND with an
6300 if (GET_CODE (op
) == AND
6301 && GET_CODE (XEXP (op
, 0)) == MULT
6302 && CONST_INT_P (XEXP (XEXP (op
, 0), 1))
6303 && CONST_INT_P (XEXP (op
, 1))
6304 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op
, 0), 1))),
6305 INTVAL (XEXP (op
, 1))) != 0)
6306 return XEXP (XEXP (op
, 0), 0);
6308 /* Now handle extended register, as this may also have an optional
6309 left shift by 1..4. */
6311 && GET_CODE (op
) == ASHIFT
6312 && CONST_INT_P (XEXP (op
, 1))
6313 && ((unsigned HOST_WIDE_INT
) INTVAL (XEXP (op
, 1))) <= 4)
6316 if (GET_CODE (op
) == ZERO_EXTEND
6317 || GET_CODE (op
) == SIGN_EXTEND
)
6326 /* Return true iff CODE is a shift supported in combination
6327 with arithmetic instructions. */
/* Rotates are deliberately excluded: ADD/SUB shifted-register forms only
   take LSL/LSR/ASR.  */
6330 aarch64_shift_p (enum rtx_code code
)
6332 return code
== ASHIFT
|| code
== ASHIFTRT
|| code
== LSHIFTRT
;
6336 /* Return true iff X is a cheap shift without a sign extend. */
/* "Cheap" means the tuning has AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND and
   the shift amount (explicit, or implied by a power-of-two multiply) is
   at most 4, with no SIGN_EXTEND on the shifted operand.  */
6339 aarch64_cheap_mult_shift_p (rtx x
)
6346 if (!(aarch64_tune_params
.extra_tuning_flags
6347 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
))
6350 if (GET_CODE (op0
) == SIGN_EXTEND
)
6353 if (GET_CODE (x
) == ASHIFT
&& CONST_INT_P (op1
)
6354 && UINTVAL (op1
) <= 4)
6357 if (GET_CODE (x
) != MULT
|| !CONST_INT_P (op1
))
6360 HOST_WIDE_INT l2
= exact_log2 (INTVAL (op1
));
6362 if (l2
> 0 && l2
<= 4)
6368 /* Helper function for rtx cost calculation. Calculate the cost of
6369 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6370 Return the calculated cost of the expression, recursing manually in to
6371 operands where needed. */
/* OUTER is the enclosing rtx code (PLUS/MINUS means we may fuse into
   MADD/MSUB/FMA forms); SPEED selects speed vs. size costing.  */
6374 aarch64_rtx_mult_cost (rtx x
, enum rtx_code code
, int outer
, bool speed
)
6377 const struct cpu_cost_table
*extra_cost
6378 = aarch64_tune_params
.insn_extra_cost
;
6380 bool compound_p
= (outer
== PLUS
|| outer
== MINUS
);
6381 machine_mode mode
= GET_MODE (x
);
6383 gcc_checking_assert (code
== MULT
);
/* For vectors, cost by the element mode.  */
6388 if (VECTOR_MODE_P (mode
))
6389 mode
= GET_MODE_INNER (mode
);
6391 /* Integer multiply/fma. */
6392 if (GET_MODE_CLASS (mode
) == MODE_INT
)
6394 /* The multiply will be canonicalized as a shift, cost it as such. */
6395 if (aarch64_shift_p (GET_CODE (x
))
6396 || (CONST_INT_P (op1
)
6397 && exact_log2 (INTVAL (op1
)) > 0))
6399 bool is_extend
= GET_CODE (op0
) == ZERO_EXTEND
6400 || GET_CODE (op0
) == SIGN_EXTEND
;
6405 /* If the shift is considered cheap,
6406 then don't add any cost. */
6407 if (aarch64_cheap_mult_shift_p (x
))
6409 else if (REG_P (op1
))
6410 /* ARITH + shift-by-register. */
6411 cost
+= extra_cost
->alu
.arith_shift_reg
;
6413 /* ARITH + extended register. We don't have a cost field
6414 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
6415 cost
+= extra_cost
->alu
.extend_arith
;
6417 /* ARITH + shift-by-immediate. */
6418 cost
+= extra_cost
->alu
.arith_shift
;
6421 /* LSL (immediate). */
6422 cost
+= extra_cost
->alu
.shift
;
6425 /* Strip extends as we will have costed them in the case above. */
6427 op0
= aarch64_strip_extend (op0
, true);
6429 cost
+= rtx_cost (op0
, VOIDmode
, code
, 0, speed
);
6434 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
6435 compound and let the below cases handle it. After all, MNEG is a
6436 special-case alias of MSUB. */
6437 if (GET_CODE (op0
) == NEG
)
6439 op0
= XEXP (op0
, 0);
6443 /* Integer multiplies or FMAs have zero/sign extending variants. */
6444 if ((GET_CODE (op0
) == ZERO_EXTEND
6445 && GET_CODE (op1
) == ZERO_EXTEND
)
6446 || (GET_CODE (op0
) == SIGN_EXTEND
6447 && GET_CODE (op1
) == SIGN_EXTEND
))
6449 cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, MULT
, 0, speed
);
6450 cost
+= rtx_cost (XEXP (op1
, 0), VOIDmode
, MULT
, 1, speed
);
6455 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
6456 cost
+= extra_cost
->mult
[0].extend_add
;
6458 /* MUL/SMULL/UMULL. */
6459 cost
+= extra_cost
->mult
[0].extend
;
6465 /* This is either an integer multiply or a MADD. In both cases
6466 we want to recurse and cost the operands. */
6467 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
6468 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
/* mult[] is indexed 0 for 32-bit, 1 for 64-bit (DImode).  */
6474 cost
+= extra_cost
->mult
[mode
== DImode
].add
;
6477 cost
+= extra_cost
->mult
[mode
== DImode
].simple
;
6486 /* Floating-point FMA/FMUL can also support negations of the
6487 operands, unless the rounding mode is upward or downward in
6488 which case FNMUL is different than FMUL with operand negation. */
6489 bool neg0
= GET_CODE (op0
) == NEG
;
6490 bool neg1
= GET_CODE (op1
) == NEG
;
6491 if (compound_p
|| !flag_rounding_math
|| (neg0
&& neg1
))
6494 op0
= XEXP (op0
, 0);
6496 op1
= XEXP (op1
, 0);
6500 /* FMADD/FNMADD/FNMSUB/FMSUB. */
6501 cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
6504 cost
+= extra_cost
->fp
[mode
== DFmode
].mult
;
6507 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
6508 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
/* Implement TARGET_ADDRESS_COST: cost address X in MODE by classifying it
   and summing the tuning-table cost of its addressing form, plus a
   scaling cost keyed to the access width.  */
6514 aarch64_address_cost (rtx x
,
6516 addr_space_t as ATTRIBUTE_UNUSED
,
6519 enum rtx_code c
= GET_CODE (x
);
6520 const struct cpu_addrcost_table
*addr_cost
= aarch64_tune_params
.addr_cost
;
6521 struct aarch64_address_info info
;
/* Unclassifiable addresses: symbols get costed through rtx_cost; anything
   else is assumed to be a jump table.  */
6525 if (!aarch64_classify_address (&info
, x
, mode
, c
, false))
6527 if (GET_CODE (x
) == CONST
|| GET_CODE (x
) == SYMBOL_REF
)
6529 /* This is a CONST or SYMBOL ref which will be split
6530 in a different way depending on the code model in use.
6531 Cost it through the generic infrastructure. */
6532 int cost_symbol_ref
= rtx_cost (x
, Pmode
, MEM
, 1, speed
);
6533 /* Divide through by the cost of one instruction to
6534 bring it to the same units as the address costs. */
6535 cost_symbol_ref
/= COSTS_N_INSNS (1);
6536 /* The cost is then the cost of preparing the address,
6537 followed by an immediate (possibly 0) offset. */
6538 return cost_symbol_ref
+ addr_cost
->imm_offset
;
6542 /* This is most likely a jump table from a case
6544 return addr_cost
->register_offset
;
6550 case ADDRESS_LO_SUM
:
6551 case ADDRESS_SYMBOLIC
:
6552 case ADDRESS_REG_IMM
:
6553 cost
+= addr_cost
->imm_offset
;
6556 case ADDRESS_REG_WB
:
6557 if (c
== PRE_INC
|| c
== PRE_DEC
|| c
== PRE_MODIFY
)
6558 cost
+= addr_cost
->pre_modify
;
6559 else if (c
== POST_INC
|| c
== POST_DEC
|| c
== POST_MODIFY
)
6560 cost
+= addr_cost
->post_modify
;
6566 case ADDRESS_REG_REG
:
6567 cost
+= addr_cost
->register_offset
;
6570 case ADDRESS_REG_SXTW
:
6571 cost
+= addr_cost
->register_sextend
;
6574 case ADDRESS_REG_UXTW
:
6575 cost
+= addr_cost
->register_zextend
;
6585 /* For the sake of calculating the cost of the shifted register
6586 component, we can treat same sized modes in the same way. */
6587 switch (GET_MODE_BITSIZE (mode
))
6590 cost
+= addr_cost
->addr_scale_costs
.hi
;
6594 cost
+= addr_cost
->addr_scale_costs
.si
;
6598 cost
+= addr_cost
->addr_scale_costs
.di
;
6601 /* We can't tell, or this is a 128-bit vector. */
6603 cost
+= addr_cost
->addr_scale_costs
.ti
;
6611 /* Return the cost of a branch. If SPEED_P is true then the compiler is
6612 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
/* Implement TARGET_BRANCH_COST via the tuning's branch-cost table.  */
6616 aarch64_branch_cost (bool speed_p
, bool predictable_p
)
6618 /* When optimizing for speed, use the cost of unpredictable branches. */
6619 const struct cpu_branch_cost
*branch_costs
=
6620 aarch64_tune_params
.branch_costs
;
6622 if (!speed_p
|| predictable_p
)
6623 return branch_costs
->predictable
;
6625 return branch_costs
->unpredictable
;
6628 /* Return true if the RTX X in mode MODE is a zero or sign extract
6629 usable in an ADD or SUB (extended register) instruction. */
6631 aarch64_rtx_arith_op_extract_p (rtx x
, scalar_int_mode mode
)
6633 /* Catch add with a sign extract.
6634 This is add_<optab><mode>_multp2. */
6635 if (GET_CODE (x
) == SIGN_EXTRACT
6636 || GET_CODE (x
) == ZERO_EXTRACT
)
6638 rtx op0
= XEXP (x
, 0);
6639 rtx op1
= XEXP (x
, 1);
6640 rtx op2
= XEXP (x
, 2);
/* (extract (mult r 2^n) width 0) is the canonical extended-register
   with shift form; validate it via the extract helper.  */
6642 if (GET_CODE (op0
) == MULT
6643 && CONST_INT_P (op1
)
6644 && op2
== const0_rtx
6645 && CONST_INT_P (XEXP (op0
, 1))
6646 && aarch64_is_extend_from_extract (mode
,
6653 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6655 else if (GET_CODE (x
) == SIGN_EXTEND
6656 || GET_CODE (x
) == ZERO_EXTEND
)
6657 return REG_P (XEXP (x
, 0));
/* NOTE(review): body not visible in this view — presumably returns whether
   unspec code U denotes one of the FRINT* rounding operations; confirm
   against the full file.  */
6663 aarch64_frint_unspec_p (unsigned int u
)
6681 /* Return true iff X is an rtx that will match an extr instruction
6682 i.e. as described in the *extr<mode>5_insn family of patterns.
6683 OP0 and OP1 will be set to the operands of the shifts involved
6684 on success and will be NULL_RTX otherwise. */
6687 aarch64_extr_rtx_p (rtx x
, rtx
*res_op0
, rtx
*res_op1
)
6690 scalar_int_mode mode
;
6691 if (!is_a
<scalar_int_mode
> (GET_MODE (x
), &mode
))
6694 *res_op0
= NULL_RTX
;
6695 *res_op1
= NULL_RTX
;
/* EXTR is an IOR of an ASHIFT and an LSHIFTRT whose constant shift
   amounts sum to the mode width.  */
6697 if (GET_CODE (x
) != IOR
)
6703 if ((GET_CODE (op0
) == ASHIFT
&& GET_CODE (op1
) == LSHIFTRT
)
6704 || (GET_CODE (op1
) == ASHIFT
&& GET_CODE (op0
) == LSHIFTRT
))
6706 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6707 if (GET_CODE (op1
) == ASHIFT
)
6708 std::swap (op0
, op1
);
6710 if (!CONST_INT_P (XEXP (op0
, 1)) || !CONST_INT_P (XEXP (op1
, 1)))
6713 unsigned HOST_WIDE_INT shft_amnt_0
= UINTVAL (XEXP (op0
, 1));
6714 unsigned HOST_WIDE_INT shft_amnt_1
= UINTVAL (XEXP (op1
, 1));
6716 if (shft_amnt_0
< GET_MODE_BITSIZE (mode
)
6717 && shft_amnt_0
+ shft_amnt_1
== GET_MODE_BITSIZE (mode
))
6719 *res_op0
= XEXP (op0
, 0);
6720 *res_op1
= XEXP (op1
, 0);
6728 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6729 storing it in *COST. Result is true if the total cost of the operation
6730 has now been calculated. */
6732 aarch64_if_then_else_costs (rtx op0
, rtx op1
, rtx op2
, int *cost
, bool speed
)
6736 enum rtx_code cmpcode
;
/* Split a comparison condition into operand, comparator and code.  */
6738 if (COMPARISON_P (op0
))
6740 inner
= XEXP (op0
, 0);
6741 comparator
= XEXP (op0
, 1);
6742 cmpcode
= GET_CODE (op0
);
6747 comparator
= const0_rtx
;
/* A PC arm means this IF_THEN_ELSE is a conditional branch.  */
6751 if (GET_CODE (op1
) == PC
|| GET_CODE (op2
) == PC
)
6753 /* Conditional branch. */
6754 if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
6758 if (cmpcode
== NE
|| cmpcode
== EQ
)
6760 if (comparator
== const0_rtx
)
6762 /* TBZ/TBNZ/CBZ/CBNZ. */
6763 if (GET_CODE (inner
) == ZERO_EXTRACT
)
6765 *cost
+= rtx_cost (XEXP (inner
, 0), VOIDmode
,
6766 ZERO_EXTRACT
, 0, speed
);
6769 *cost
+= rtx_cost (inner
, VOIDmode
, cmpcode
, 0, speed
);
6774 else if (cmpcode
== LT
|| cmpcode
== GE
)
6777 if (comparator
== const0_rtx
)
6782 else if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
6785 if (GET_CODE (op1
) == COMPARE
)
6787 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6788 if (XEXP (op1
, 1) == const0_rtx
)
6792 machine_mode mode
= GET_MODE (XEXP (op1
, 0));
6793 const struct cpu_cost_table
*extra_cost
6794 = aarch64_tune_params
.insn_extra_cost
;
6796 if (GET_MODE_CLASS (mode
) == MODE_INT
)
6797 *cost
+= extra_cost
->alu
.arith
;
6799 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
6804 /* It's a conditional operation based on the status flags,
6805 so it must be some flavor of CSEL. */
6807 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6808 if (GET_CODE (op1
) == NEG
6809 || GET_CODE (op1
) == NOT
6810 || (GET_CODE (op1
) == PLUS
&& XEXP (op1
, 1) == const1_rtx
))
6811 op1
= XEXP (op1
, 0);
6812 else if (GET_CODE (op1
) == ZERO_EXTEND
&& GET_CODE (op2
) == ZERO_EXTEND
)
6814 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6815 op1
= XEXP (op1
, 0);
6816 op2
= XEXP (op2
, 0);
6819 *cost
+= rtx_cost (op1
, VOIDmode
, IF_THEN_ELSE
, 1, speed
);
6820 *cost
+= rtx_cost (op2
, VOIDmode
, IF_THEN_ELSE
, 2, speed
);
6824 /* We don't know what this is, cost all operands. */
6828 /* Check whether X is a bitfield operation of the form shift + extend that
6829 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6830 operand to which the bitfield operation is applied. Otherwise return
6834 aarch64_extend_bitfield_pattern_p (rtx x
)
6836 rtx_code outer_code
= GET_CODE (x
);
6837 machine_mode outer_mode
= GET_MODE (x
);
6839 if (outer_code
!= ZERO_EXTEND
&& outer_code
!= SIGN_EXTEND
6840 && outer_mode
!= SImode
&& outer_mode
!= DImode
)
6843 rtx inner
= XEXP (x
, 0);
6844 rtx_code inner_code
= GET_CODE (inner
);
6845 machine_mode inner_mode
= GET_MODE (inner
);
/* The three recognized inner shapes all require a constant shift amount
   and a narrow (QI/HI) inner mode; matching on the outer extend kind
   distinguishes the unsigned and signed bitfield forms.  */
6851 if (CONST_INT_P (XEXP (inner
, 1))
6852 && (inner_mode
== QImode
|| inner_mode
== HImode
))
6853 op
= XEXP (inner
, 0);
6856 if (outer_code
== ZERO_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
6857 && (inner_mode
== QImode
|| inner_mode
== HImode
))
6858 op
= XEXP (inner
, 0);
6861 if (outer_code
== SIGN_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
6862 && (inner_mode
== QImode
|| inner_mode
== HImode
))
6863 op
= XEXP (inner
, 0);
6872 /* Return true if the mask and a shift amount from an RTX of the form
6873 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6874 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
/* NOTE(review): "(1 << INTVAL (shft_amnt))" below is plain int arithmetic;
   for shift amounts >= 31 this overflows — an unsigned HOST_WIDE_INT one
   (HOST_WIDE_INT_1U) would be safer.  Flagging only, as the shift is
   already bounded by GET_MODE_BITSIZE above.  */
6877 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode
, rtx mask
,
6880 return CONST_INT_P (mask
) && CONST_INT_P (shft_amnt
)
6881 && INTVAL (shft_amnt
) < GET_MODE_BITSIZE (mode
)
6882 && exact_log2 ((INTVAL (mask
) >> INTVAL (shft_amnt
)) + 1) >= 0
6883 && (INTVAL (mask
) & ((1 << INTVAL (shft_amnt
)) - 1)) == 0;
6886 /* Calculate the cost of calculating X, storing it in *COST. Result
6887 is true if the total cost of the operation has now been calculated. */
6889 aarch64_rtx_costs (rtx x
, machine_mode mode
, int outer ATTRIBUTE_UNUSED
,
6890 int param ATTRIBUTE_UNUSED
, int *cost
, bool speed
)
6893 const struct cpu_cost_table
*extra_cost
6894 = aarch64_tune_params
.insn_extra_cost
;
6895 int code
= GET_CODE (x
);
6896 scalar_int_mode int_mode
;
6898 /* By default, assume that everything has equivalent cost to the
6899 cheapest instruction. Any additional costs are applied as a delta
6900 above this default. */
6901 *cost
= COSTS_N_INSNS (1);
6906 /* The cost depends entirely on the operands to SET. */
6911 switch (GET_CODE (op0
))
6916 rtx address
= XEXP (op0
, 0);
6917 if (VECTOR_MODE_P (mode
))
6918 *cost
+= extra_cost
->ldst
.storev
;
6919 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
6920 *cost
+= extra_cost
->ldst
.store
;
6921 else if (mode
== SFmode
)
6922 *cost
+= extra_cost
->ldst
.storef
;
6923 else if (mode
== DFmode
)
6924 *cost
+= extra_cost
->ldst
.stored
;
6927 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
6931 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
6935 if (! REG_P (SUBREG_REG (op0
)))
6936 *cost
+= rtx_cost (SUBREG_REG (op0
), VOIDmode
, SET
, 0, speed
);
6940 /* The cost is one per vector-register copied. */
6941 if (VECTOR_MODE_P (GET_MODE (op0
)) && REG_P (op1
))
6943 int n_minus_1
= (GET_MODE_SIZE (GET_MODE (op0
)) - 1)
6944 / GET_MODE_SIZE (V4SImode
);
6945 *cost
= COSTS_N_INSNS (n_minus_1
+ 1);
6947 /* const0_rtx is in general free, but we will use an
6948 instruction to set a register to 0. */
6949 else if (REG_P (op1
) || op1
== const0_rtx
)
6951 /* The cost is 1 per register copied. */
6952 int n_minus_1
= (GET_MODE_SIZE (GET_MODE (op0
)) - 1)
6954 *cost
= COSTS_N_INSNS (n_minus_1
+ 1);
6957 /* Cost is just the cost of the RHS of the set. */
6958 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
6963 /* Bit-field insertion. Strip any redundant widening of
6964 the RHS to meet the width of the target. */
6965 if (GET_CODE (op1
) == SUBREG
)
6966 op1
= SUBREG_REG (op1
);
6967 if ((GET_CODE (op1
) == ZERO_EXTEND
6968 || GET_CODE (op1
) == SIGN_EXTEND
)
6969 && CONST_INT_P (XEXP (op0
, 1))
6970 && is_a
<scalar_int_mode
> (GET_MODE (XEXP (op1
, 0)), &int_mode
)
6971 && GET_MODE_BITSIZE (int_mode
) >= INTVAL (XEXP (op0
, 1)))
6972 op1
= XEXP (op1
, 0);
6974 if (CONST_INT_P (op1
))
6976 /* MOV immediate is assumed to always be cheap. */
6977 *cost
= COSTS_N_INSNS (1);
6983 *cost
+= extra_cost
->alu
.bfi
;
6984 *cost
+= rtx_cost (op1
, VOIDmode
, (enum rtx_code
) code
, 1, speed
);
6990 /* We can't make sense of this, assume default cost. */
6991 *cost
= COSTS_N_INSNS (1);
6997 /* If an instruction can incorporate a constant within the
6998 instruction, the instruction's expression avoids calling
6999 rtx_cost() on the constant. If rtx_cost() is called on a
7000 constant, then it is usually because the constant must be
7001 moved into a register by one or more instructions.
7003 The exception is constant 0, which can be expressed
7004 as XZR/WZR and is therefore free. The exception to this is
7005 if we have (set (reg) (const0_rtx)) in which case we must cost
7006 the move. However, we can catch that when we cost the SET, so
7007 we don't need to consider that here. */
7008 if (x
== const0_rtx
)
7012 /* To an approximation, building any other constant is
7013 proportionally expensive to the number of instructions
7014 required to build that constant. This is true whether we
7015 are compiling for SPEED or otherwise. */
7016 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
7017 int_mode
= word_mode
;
7018 *cost
= COSTS_N_INSNS (aarch64_internal_mov_immediate
7019 (NULL_RTX
, x
, false, int_mode
));
7025 /* First determine number of instructions to do the move
7026 as an integer constant. */
7027 if (!aarch64_float_const_representable_p (x
)
7028 && !aarch64_can_const_movi_rtx_p (x
, mode
)
7029 && aarch64_float_const_rtx_p (x
))
7031 unsigned HOST_WIDE_INT ival
;
7032 bool succeed
= aarch64_reinterpret_float_as_int (x
, &ival
);
7033 gcc_assert (succeed
);
7035 scalar_int_mode imode
= (mode
== HFmode
7037 : int_mode_for_mode (mode
).require ());
7038 int ncost
= aarch64_internal_mov_immediate
7039 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
7040 *cost
+= COSTS_N_INSNS (ncost
);
7046 /* mov[df,sf]_aarch64. */
7047 if (aarch64_float_const_representable_p (x
))
7048 /* FMOV (scalar immediate). */
7049 *cost
+= extra_cost
->fp
[mode
== DFmode
].fpconst
;
7050 else if (!aarch64_float_const_zero_rtx_p (x
))
7052 /* This will be a load from memory. */
7054 *cost
+= extra_cost
->ldst
.loadd
;
7056 *cost
+= extra_cost
->ldst
.loadf
;
7059 /* Otherwise this is +0.0. We get this using MOVI d0, #0
7060 or MOV v0.s[0], wzr - neither of which are modeled by the
7061 cost tables. Just use the default cost. */
7071 /* For loads we want the base cost of a load, plus an
7072 approximation for the additional cost of the addressing
7074 rtx address
= XEXP (x
, 0);
7075 if (VECTOR_MODE_P (mode
))
7076 *cost
+= extra_cost
->ldst
.loadv
;
7077 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
7078 *cost
+= extra_cost
->ldst
.load
;
7079 else if (mode
== SFmode
)
7080 *cost
+= extra_cost
->ldst
.loadf
;
7081 else if (mode
== DFmode
)
7082 *cost
+= extra_cost
->ldst
.loadd
;
7085 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
7094 if (VECTOR_MODE_P (mode
))
7099 *cost
+= extra_cost
->vect
.alu
;
7104 if (GET_MODE_CLASS (mode
) == MODE_INT
)
7106 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
7107 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
7110 *cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, NEG
, 0, speed
);
7114 /* Cost this as SUB wzr, X. */
7115 op0
= CONST0_RTX (mode
);
7120 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
7122 /* Support (neg(fma...)) as a single instruction only if
7123 sign of zeros is unimportant. This matches the decision
7124 making in aarch64.md. */
7125 if (GET_CODE (op0
) == FMA
&& !HONOR_SIGNED_ZEROS (GET_MODE (op0
)))
7128 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
7131 if (GET_CODE (op0
) == MULT
)
7134 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
7139 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
7149 if (VECTOR_MODE_P (mode
))
7150 *cost
+= extra_cost
->vect
.alu
;
7152 *cost
+= extra_cost
->alu
.clz
;
7161 if (op1
== const0_rtx
7162 && GET_CODE (op0
) == AND
)
7165 mode
= GET_MODE (op0
);
7169 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
)
7171 /* TODO: A write to the CC flags possibly costs extra, this
7172 needs encoding in the cost tables. */
7174 mode
= GET_MODE (op0
);
7176 if (GET_CODE (op0
) == AND
)
7182 if (GET_CODE (op0
) == PLUS
)
7184 /* ADDS (and CMN alias). */
7189 if (GET_CODE (op0
) == MINUS
)
7196 if (GET_CODE (op0
) == ZERO_EXTRACT
&& op1
== const0_rtx
7197 && GET_MODE (x
) == CC_NZmode
&& CONST_INT_P (XEXP (op0
, 1))
7198 && CONST_INT_P (XEXP (op0
, 2)))
7200 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
7201 Handle it here directly rather than going to cost_logic
7202 since we know the immediate generated for the TST is valid
7203 so we can avoid creating an intermediate rtx for it only
7204 for costing purposes. */
7206 *cost
+= extra_cost
->alu
.logical
;
7208 *cost
+= rtx_cost (XEXP (op0
, 0), GET_MODE (op0
),
7209 ZERO_EXTRACT
, 0, speed
);
7213 if (GET_CODE (op1
) == NEG
)
7217 *cost
+= extra_cost
->alu
.arith
;
7219 *cost
+= rtx_cost (op0
, mode
, COMPARE
, 0, speed
);
7220 *cost
+= rtx_cost (XEXP (op1
, 0), mode
, NEG
, 1, speed
);
7226 Compare can freely swap the order of operands, and
7227 canonicalization puts the more complex operation first.
7228 But the integer MINUS logic expects the shift/extend
7229 operation in op1. */
7231 || (GET_CODE (op0
) == SUBREG
&& REG_P (SUBREG_REG (op0
)))))
7239 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_FLOAT
)
7243 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
7245 if (CONST_DOUBLE_P (op1
) && aarch64_float_const_zero_rtx_p (op1
))
7247 *cost
+= rtx_cost (op0
, VOIDmode
, COMPARE
, 0, speed
);
7248 /* FCMP supports constant 0.0 for no extra cost. */
7254 if (VECTOR_MODE_P (mode
))
7256 /* Vector compare. */
7258 *cost
+= extra_cost
->vect
.alu
;
7260 if (aarch64_float_const_zero_rtx_p (op1
))
7262 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
7276 *cost
+= rtx_cost (op0
, mode
, MINUS
, 0, speed
);
7278 /* Detect valid immediates. */
7279 if ((GET_MODE_CLASS (mode
) == MODE_INT
7280 || (GET_MODE_CLASS (mode
) == MODE_CC
7281 && GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
))
7282 && CONST_INT_P (op1
)
7283 && aarch64_uimm12_shift (INTVAL (op1
)))
7286 /* SUB(S) (immediate). */
7287 *cost
+= extra_cost
->alu
.arith
;
7291 /* Look for SUB (extended register). */
7292 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
7293 && aarch64_rtx_arith_op_extract_p (op1
, int_mode
))
7296 *cost
+= extra_cost
->alu
.extend_arith
;
7298 op1
= aarch64_strip_extend (op1
, true);
7299 *cost
+= rtx_cost (op1
, VOIDmode
,
7300 (enum rtx_code
) GET_CODE (op1
), 0, speed
);
7304 rtx new_op1
= aarch64_strip_extend (op1
, false);
7306 /* Cost this as an FMA-alike operation. */
7307 if ((GET_CODE (new_op1
) == MULT
7308 || aarch64_shift_p (GET_CODE (new_op1
)))
7311 *cost
+= aarch64_rtx_mult_cost (new_op1
, MULT
,
7312 (enum rtx_code
) code
,
7317 *cost
+= rtx_cost (new_op1
, VOIDmode
, MINUS
, 1, speed
);
7321 if (VECTOR_MODE_P (mode
))
7324 *cost
+= extra_cost
->vect
.alu
;
7326 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
7329 *cost
+= extra_cost
->alu
.arith
;
7331 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
7334 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
7348 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
7349 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
7352 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, PLUS
, 0, speed
);
7353 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
7357 if (GET_MODE_CLASS (mode
) == MODE_INT
7358 && CONST_INT_P (op1
)
7359 && aarch64_uimm12_shift (INTVAL (op1
)))
7361 *cost
+= rtx_cost (op0
, mode
, PLUS
, 0, speed
);
7364 /* ADD (immediate). */
7365 *cost
+= extra_cost
->alu
.arith
;
7369 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
7371 /* Look for ADD (extended register). */
7372 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
7373 && aarch64_rtx_arith_op_extract_p (op0
, int_mode
))
7376 *cost
+= extra_cost
->alu
.extend_arith
;
7378 op0
= aarch64_strip_extend (op0
, true);
7379 *cost
+= rtx_cost (op0
, VOIDmode
,
7380 (enum rtx_code
) GET_CODE (op0
), 0, speed
);
7384 /* Strip any extend, leave shifts behind as we will
7385 cost them through mult_cost. */
7386 new_op0
= aarch64_strip_extend (op0
, false);
7388 if (GET_CODE (new_op0
) == MULT
7389 || aarch64_shift_p (GET_CODE (new_op0
)))
7391 *cost
+= aarch64_rtx_mult_cost (new_op0
, MULT
, PLUS
,
7396 *cost
+= rtx_cost (new_op0
, VOIDmode
, PLUS
, 0, speed
);
7400 if (VECTOR_MODE_P (mode
))
7403 *cost
+= extra_cost
->vect
.alu
;
7405 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
7408 *cost
+= extra_cost
->alu
.arith
;
7410 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
7413 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
7420 *cost
= COSTS_N_INSNS (1);
7424 if (VECTOR_MODE_P (mode
))
7425 *cost
+= extra_cost
->vect
.alu
;
7427 *cost
+= extra_cost
->alu
.rev
;
7432 if (aarch_rev16_p (x
))
7434 *cost
= COSTS_N_INSNS (1);
7438 if (VECTOR_MODE_P (mode
))
7439 *cost
+= extra_cost
->vect
.alu
;
7441 *cost
+= extra_cost
->alu
.rev
;
7446 if (aarch64_extr_rtx_p (x
, &op0
, &op1
))
7448 *cost
+= rtx_cost (op0
, mode
, IOR
, 0, speed
);
7449 *cost
+= rtx_cost (op1
, mode
, IOR
, 1, speed
);
7451 *cost
+= extra_cost
->alu
.shift
;
7462 if (VECTOR_MODE_P (mode
))
7465 *cost
+= extra_cost
->vect
.alu
;
7470 && GET_CODE (op0
) == MULT
7471 && CONST_INT_P (XEXP (op0
, 1))
7472 && CONST_INT_P (op1
)
7473 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0
, 1))),
7476 /* This is a UBFM/SBFM. */
7477 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, ZERO_EXTRACT
, 0, speed
);
7479 *cost
+= extra_cost
->alu
.bfx
;
7483 if (is_int_mode (mode
, &int_mode
))
7485 if (CONST_INT_P (op1
))
7487 /* We have a mask + shift version of a UBFIZ
7488 i.e. the *andim_ashift<mode>_bfiz pattern. */
7489 if (GET_CODE (op0
) == ASHIFT
7490 && aarch64_mask_and_shift_for_ubfiz_p (int_mode
, op1
,
7493 *cost
+= rtx_cost (XEXP (op0
, 0), int_mode
,
7494 (enum rtx_code
) code
, 0, speed
);
7496 *cost
+= extra_cost
->alu
.bfx
;
7500 else if (aarch64_bitmask_imm (INTVAL (op1
), int_mode
))
7502 /* We possibly get the immediate for free, this is not
7504 *cost
+= rtx_cost (op0
, int_mode
,
7505 (enum rtx_code
) code
, 0, speed
);
7507 *cost
+= extra_cost
->alu
.logical
;
7516 /* Handle ORN, EON, or BIC. */
7517 if (GET_CODE (op0
) == NOT
)
7518 op0
= XEXP (op0
, 0);
7520 new_op0
= aarch64_strip_shift (op0
);
7522 /* If we had a shift on op0 then this is a logical-shift-
7523 by-register/immediate operation. Otherwise, this is just
7524 a logical operation. */
7529 /* Shift by immediate. */
7530 if (CONST_INT_P (XEXP (op0
, 1)))
7531 *cost
+= extra_cost
->alu
.log_shift
;
7533 *cost
+= extra_cost
->alu
.log_shift_reg
;
7536 *cost
+= extra_cost
->alu
.logical
;
7539 /* In both cases we want to cost both operands. */
7540 *cost
+= rtx_cost (new_op0
, int_mode
, (enum rtx_code
) code
,
7542 *cost
+= rtx_cost (op1
, int_mode
, (enum rtx_code
) code
,
7552 op0
= aarch64_strip_shift (x
);
7554 if (VECTOR_MODE_P (mode
))
7557 *cost
+= extra_cost
->vect
.alu
;
7561 /* MVN-shifted-reg. */
7564 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
7567 *cost
+= extra_cost
->alu
.log_shift
;
7571 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7572 Handle the second form here taking care that 'a' in the above can
7574 else if (GET_CODE (op0
) == XOR
)
7576 rtx newop0
= XEXP (op0
, 0);
7577 rtx newop1
= XEXP (op0
, 1);
7578 rtx op0_stripped
= aarch64_strip_shift (newop0
);
7580 *cost
+= rtx_cost (newop1
, mode
, (enum rtx_code
) code
, 1, speed
);
7581 *cost
+= rtx_cost (op0_stripped
, mode
, XOR
, 0, speed
);
7585 if (op0_stripped
!= newop0
)
7586 *cost
+= extra_cost
->alu
.log_shift
;
7588 *cost
+= extra_cost
->alu
.logical
;
7595 *cost
+= extra_cost
->alu
.logical
;
7602 /* If a value is written in SI mode, then zero extended to DI
7603 mode, the operation will in general be free as a write to
7604 a 'w' register implicitly zeroes the upper bits of an 'x'
7605 register. However, if this is
7607 (set (reg) (zero_extend (reg)))
7609 we must cost the explicit register move. */
7611 && GET_MODE (op0
) == SImode
7614 int op_cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, 0, speed
);
7616 /* If OP_COST is non-zero, then the cost of the zero extend
7617 is effectively the cost of the inner operation. Otherwise
7618 we have a MOV instruction and we take the cost from the MOV
7619 itself. This is true independently of whether we are
7620 optimizing for space or time. */
7626 else if (MEM_P (op0
))
7628 /* All loads can zero extend to any size for free. */
7629 *cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, param
, speed
);
7633 op0
= aarch64_extend_bitfield_pattern_p (x
);
7636 *cost
+= rtx_cost (op0
, mode
, ZERO_EXTEND
, 0, speed
);
7638 *cost
+= extra_cost
->alu
.bfx
;
7644 if (VECTOR_MODE_P (mode
))
7647 *cost
+= extra_cost
->vect
.alu
;
7651 /* We generate an AND instead of UXTB/UXTH. */
7652 *cost
+= extra_cost
->alu
.logical
;
7658 if (MEM_P (XEXP (x
, 0)))
7663 rtx address
= XEXP (XEXP (x
, 0), 0);
7664 *cost
+= extra_cost
->ldst
.load_sign_extend
;
7667 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
7673 op0
= aarch64_extend_bitfield_pattern_p (x
);
7676 *cost
+= rtx_cost (op0
, mode
, SIGN_EXTEND
, 0, speed
);
7678 *cost
+= extra_cost
->alu
.bfx
;
7684 if (VECTOR_MODE_P (mode
))
7685 *cost
+= extra_cost
->vect
.alu
;
7687 *cost
+= extra_cost
->alu
.extend
;
7695 if (CONST_INT_P (op1
))
7699 if (VECTOR_MODE_P (mode
))
7701 /* Vector shift (immediate). */
7702 *cost
+= extra_cost
->vect
.alu
;
7706 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
7708 *cost
+= extra_cost
->alu
.shift
;
7712 /* We can incorporate zero/sign extend for free. */
7713 if (GET_CODE (op0
) == ZERO_EXTEND
7714 || GET_CODE (op0
) == SIGN_EXTEND
)
7715 op0
= XEXP (op0
, 0);
7717 *cost
+= rtx_cost (op0
, VOIDmode
, ASHIFT
, 0, speed
);
7722 if (VECTOR_MODE_P (mode
))
7725 /* Vector shift (register). */
7726 *cost
+= extra_cost
->vect
.alu
;
7732 *cost
+= extra_cost
->alu
.shift_reg
;
7734 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
7735 && CONST_INT_P (XEXP (op1
, 1))
7736 && INTVAL (XEXP (op1
, 1)) == GET_MODE_BITSIZE (mode
) - 1)
7738 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
7739 /* We already demanded XEXP (op1, 0) to be REG_P, so
7740 don't recurse into it. */
7744 return false; /* All arguments need to be in registers. */
7754 if (CONST_INT_P (op1
))
7756 /* ASR (immediate) and friends. */
7759 if (VECTOR_MODE_P (mode
))
7760 *cost
+= extra_cost
->vect
.alu
;
7762 *cost
+= extra_cost
->alu
.shift
;
7765 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
7770 if (VECTOR_MODE_P (mode
))
7773 /* Vector shift (register). */
7774 *cost
+= extra_cost
->vect
.alu
;
7779 /* ASR (register) and friends. */
7780 *cost
+= extra_cost
->alu
.shift_reg
;
7782 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
7783 && CONST_INT_P (XEXP (op1
, 1))
7784 && INTVAL (XEXP (op1
, 1)) == GET_MODE_BITSIZE (mode
) - 1)
7786 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
7787 /* We already demanded XEXP (op1, 0) to be REG_P, so
7788 don't recurse into it. */
7792 return false; /* All arguments need to be in registers. */
7797 if (aarch64_cmodel
== AARCH64_CMODEL_LARGE
7798 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
)
7802 *cost
+= extra_cost
->ldst
.load
;
7804 else if (aarch64_cmodel
== AARCH64_CMODEL_SMALL
7805 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_PIC
)
7807 /* ADRP, followed by ADD. */
7808 *cost
+= COSTS_N_INSNS (1);
7810 *cost
+= 2 * extra_cost
->alu
.arith
;
7812 else if (aarch64_cmodel
== AARCH64_CMODEL_TINY
7813 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
7817 *cost
+= extra_cost
->alu
.arith
;
7822 /* One extra load instruction, after accessing the GOT. */
7823 *cost
+= COSTS_N_INSNS (1);
7825 *cost
+= extra_cost
->ldst
.load
;
7831 /* ADRP/ADD (immediate). */
7833 *cost
+= extra_cost
->alu
.arith
;
7841 if (VECTOR_MODE_P (mode
))
7842 *cost
+= extra_cost
->vect
.alu
;
7844 *cost
+= extra_cost
->alu
.bfx
;
7847 /* We can trust that the immediates used will be correct (there
7848 are no by-register forms), so we need only cost op0. */
7849 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (enum rtx_code
) code
, 0, speed
);
7853 *cost
+= aarch64_rtx_mult_cost (x
, MULT
, 0, speed
);
7854 /* aarch64_rtx_mult_cost always handles recursion to its
7859 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7860 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
7861 an unconditional negate. This case should only ever be reached through
7862 the set_smod_pow2_cheap check in expmed.c. */
7863 if (CONST_INT_P (XEXP (x
, 1))
7864 && exact_log2 (INTVAL (XEXP (x
, 1))) > 0
7865 && (mode
== SImode
|| mode
== DImode
))
7867 /* We expand to 4 instructions. Reset the baseline. */
7868 *cost
= COSTS_N_INSNS (4);
7871 *cost
+= 2 * extra_cost
->alu
.logical
7872 + 2 * extra_cost
->alu
.arith
;
7881 /* Slighly prefer UMOD over SMOD. */
7882 if (VECTOR_MODE_P (mode
))
7883 *cost
+= extra_cost
->vect
.alu
;
7884 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
7885 *cost
+= (extra_cost
->mult
[mode
== DImode
].add
7886 + extra_cost
->mult
[mode
== DImode
].idiv
7887 + (code
== MOD
? 1 : 0));
7889 return false; /* All arguments need to be in registers. */
7896 if (VECTOR_MODE_P (mode
))
7897 *cost
+= extra_cost
->vect
.alu
;
7898 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
7899 /* There is no integer SQRT, so only DIV and UDIV can get
7901 *cost
+= (extra_cost
->mult
[mode
== DImode
].idiv
7902 /* Slighly prefer UDIV over SDIV. */
7903 + (code
== DIV
? 1 : 0));
7905 *cost
+= extra_cost
->fp
[mode
== DFmode
].div
;
7907 return false; /* All arguments need to be in registers. */
7910 return aarch64_if_then_else_costs (XEXP (x
, 0), XEXP (x
, 1),
7911 XEXP (x
, 2), cost
, speed
);
7924 return false; /* All arguments must be in registers. */
7933 if (VECTOR_MODE_P (mode
))
7934 *cost
+= extra_cost
->vect
.alu
;
7936 *cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
7939 /* FMSUB, FNMADD, and FNMSUB are free. */
7940 if (GET_CODE (op0
) == NEG
)
7941 op0
= XEXP (op0
, 0);
7943 if (GET_CODE (op2
) == NEG
)
7944 op2
= XEXP (op2
, 0);
7946 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7947 and the by-element operand as operand 0. */
7948 if (GET_CODE (op1
) == NEG
)
7949 op1
= XEXP (op1
, 0);
7951 /* Catch vector-by-element operations. The by-element operand can
7952 either be (vec_duplicate (vec_select (x))) or just
7953 (vec_select (x)), depending on whether we are multiplying by
7954 a vector or a scalar.
7956 Canonicalization is not very good in these cases, FMA4 will put the
7957 by-element operand as operand 0, FNMA4 will have it as operand 1. */
7958 if (GET_CODE (op0
) == VEC_DUPLICATE
)
7959 op0
= XEXP (op0
, 0);
7960 else if (GET_CODE (op1
) == VEC_DUPLICATE
)
7961 op1
= XEXP (op1
, 0);
7963 if (GET_CODE (op0
) == VEC_SELECT
)
7964 op0
= XEXP (op0
, 0);
7965 else if (GET_CODE (op1
) == VEC_SELECT
)
7966 op1
= XEXP (op1
, 0);
7968 /* If the remaining parameters are not registers,
7969 get the cost to put them into registers. */
7970 *cost
+= rtx_cost (op0
, mode
, FMA
, 0, speed
);
7971 *cost
+= rtx_cost (op1
, mode
, FMA
, 1, speed
);
7972 *cost
+= rtx_cost (op2
, mode
, FMA
, 2, speed
);
7976 case UNSIGNED_FLOAT
:
7978 *cost
+= extra_cost
->fp
[mode
== DFmode
].fromint
;
7984 if (VECTOR_MODE_P (mode
))
7986 /*Vector truncate. */
7987 *cost
+= extra_cost
->vect
.alu
;
7990 *cost
+= extra_cost
->fp
[mode
== DFmode
].widen
;
7994 case FLOAT_TRUNCATE
:
7997 if (VECTOR_MODE_P (mode
))
7999 /*Vector conversion. */
8000 *cost
+= extra_cost
->vect
.alu
;
8003 *cost
+= extra_cost
->fp
[mode
== DFmode
].narrow
;
8010 /* Strip the rounding part. They will all be implemented
8011 by the fcvt* family of instructions anyway. */
8012 if (GET_CODE (x
) == UNSPEC
)
8014 unsigned int uns_code
= XINT (x
, 1);
8016 if (uns_code
== UNSPEC_FRINTA
8017 || uns_code
== UNSPEC_FRINTM
8018 || uns_code
== UNSPEC_FRINTN
8019 || uns_code
== UNSPEC_FRINTP
8020 || uns_code
== UNSPEC_FRINTZ
)
8021 x
= XVECEXP (x
, 0, 0);
8026 if (VECTOR_MODE_P (mode
))
8027 *cost
+= extra_cost
->vect
.alu
;
8029 *cost
+= extra_cost
->fp
[GET_MODE (x
) == DFmode
].toint
;
8032 /* We can combine fmul by a power of 2 followed by a fcvt into a single
8033 fixed-point fcvt. */
8034 if (GET_CODE (x
) == MULT
8035 && ((VECTOR_MODE_P (mode
)
8036 && aarch64_vec_fpconst_pow_of_2 (XEXP (x
, 1)) > 0)
8037 || aarch64_fpconst_pow_of_2 (XEXP (x
, 1)) > 0))
8039 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (rtx_code
) code
,
8044 *cost
+= rtx_cost (x
, VOIDmode
, (enum rtx_code
) code
, 0, speed
);
8048 if (VECTOR_MODE_P (mode
))
8052 *cost
+= extra_cost
->vect
.alu
;
8054 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
8058 /* FABD, which is analogous to FADD. */
8059 if (GET_CODE (op0
) == MINUS
)
8061 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, MINUS
, 0, speed
);
8062 *cost
+= rtx_cost (XEXP (op0
, 1), mode
, MINUS
, 1, speed
);
8064 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
8068 /* Simple FABS is analogous to FNEG. */
8070 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
8074 /* Integer ABS will either be split to
8075 two arithmetic instructions, or will be an ABS
8076 (scalar), which we don't model. */
8077 *cost
= COSTS_N_INSNS (2);
8079 *cost
+= 2 * extra_cost
->alu
.arith
;
8087 if (VECTOR_MODE_P (mode
))
8088 *cost
+= extra_cost
->vect
.alu
;
8091 /* FMAXNM/FMINNM/FMAX/FMIN.
8092 TODO: This may not be accurate for all implementations, but
8093 we do not model this in the cost tables. */
8094 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
8100 /* The floating point round to integer frint* instructions. */
8101 if (aarch64_frint_unspec_p (XINT (x
, 1)))
8104 *cost
+= extra_cost
->fp
[mode
== DFmode
].roundint
;
8109 if (XINT (x
, 1) == UNSPEC_RBIT
)
8112 *cost
+= extra_cost
->alu
.rev
;
8120 /* Decompose <su>muldi3_highpart. */
8121 if (/* (truncate:DI */
8124 && GET_MODE (XEXP (x
, 0)) == TImode
8125 && GET_CODE (XEXP (x
, 0)) == LSHIFTRT
8127 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == MULT
8128 /* (ANY_EXTEND:TI (reg:DI))
8129 (ANY_EXTEND:TI (reg:DI))) */
8130 && ((GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == ZERO_EXTEND
8131 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == ZERO_EXTEND
)
8132 || (GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == SIGN_EXTEND
8133 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == SIGN_EXTEND
))
8134 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0)) == DImode
8135 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0)) == DImode
8136 /* (const_int 64) */
8137 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
8138 && UINTVAL (XEXP (XEXP (x
, 0), 1)) == 64)
8142 *cost
+= extra_cost
->mult
[mode
== DImode
].extend
;
8143 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0),
8144 mode
, MULT
, 0, speed
);
8145 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0),
8146 mode
, MULT
, 1, speed
);
8156 && flag_aarch64_verbose_cost
)
8158 "\nFailed to cost RTX. Assuming default cost.\n");
8163 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
8164 calculated for X. This cost is stored in *COST. Returns true
8165 if the total cost of X was calculated. */
8167 aarch64_rtx_costs_wrapper (rtx x
, machine_mode mode
, int outer
,
8168 int param
, int *cost
, bool speed
)
8170 bool result
= aarch64_rtx_costs (x
, mode
, outer
, param
, cost
, speed
);
8173 && flag_aarch64_verbose_cost
)
8175 print_rtl_single (dump_file
, x
);
8176 fprintf (dump_file
, "\n%s cost: %d (%s)\n",
8177 speed
? "Hot" : "Cold",
8178 *cost
, result
? "final" : "partial");
8185 aarch64_register_move_cost (machine_mode mode
,
8186 reg_class_t from_i
, reg_class_t to_i
)
8188 enum reg_class from
= (enum reg_class
) from_i
;
8189 enum reg_class to
= (enum reg_class
) to_i
;
8190 const struct cpu_regmove_cost
*regmove_cost
8191 = aarch64_tune_params
.regmove_cost
;
8193 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
8194 if (to
== CALLER_SAVE_REGS
|| to
== POINTER_REGS
)
8197 if (from
== CALLER_SAVE_REGS
|| from
== POINTER_REGS
)
8198 from
= GENERAL_REGS
;
8200 /* Moving between GPR and stack cost is the same as GP2GP. */
8201 if ((from
== GENERAL_REGS
&& to
== STACK_REG
)
8202 || (to
== GENERAL_REGS
&& from
== STACK_REG
))
8203 return regmove_cost
->GP2GP
;
8205 /* To/From the stack register, we move via the gprs. */
8206 if (to
== STACK_REG
|| from
== STACK_REG
)
8207 return aarch64_register_move_cost (mode
, from
, GENERAL_REGS
)
8208 + aarch64_register_move_cost (mode
, GENERAL_REGS
, to
);
8210 if (GET_MODE_SIZE (mode
) == 16)
8212 /* 128-bit operations on general registers require 2 instructions. */
8213 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
8214 return regmove_cost
->GP2GP
* 2;
8215 else if (from
== GENERAL_REGS
)
8216 return regmove_cost
->GP2FP
* 2;
8217 else if (to
== GENERAL_REGS
)
8218 return regmove_cost
->FP2GP
* 2;
8220 /* When AdvSIMD instructions are disabled it is not possible to move
8221 a 128-bit value directly between Q registers. This is handled in
8222 secondary reload. A general register is used as a scratch to move
8223 the upper DI value and the lower DI value is moved directly,
8224 hence the cost is the sum of three moves. */
8226 return regmove_cost
->GP2FP
+ regmove_cost
->FP2GP
+ regmove_cost
->FP2FP
;
8228 return regmove_cost
->FP2FP
;
8231 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
8232 return regmove_cost
->GP2GP
;
8233 else if (from
== GENERAL_REGS
)
8234 return regmove_cost
->GP2FP
;
8235 else if (to
== GENERAL_REGS
)
8236 return regmove_cost
->FP2GP
;
8238 return regmove_cost
->FP2FP
;
8242 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED
,
8243 reg_class_t rclass ATTRIBUTE_UNUSED
,
8244 bool in ATTRIBUTE_UNUSED
)
8246 return aarch64_tune_params
.memmov_cost
;
8249 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
8250 to optimize 1.0/sqrt. */
8253 use_rsqrt_p (machine_mode mode
)
8255 return (!flag_trapping_math
8256 && flag_unsafe_math_optimizations
8257 && ((aarch64_tune_params
.approx_modes
->recip_sqrt
8258 & AARCH64_APPROX_MODE (mode
))
8259 || flag_mrecip_low_precision_sqrt
));
8262 /* Function to decide when to use the approximate reciprocal square root
8266 aarch64_builtin_reciprocal (tree fndecl
)
8268 machine_mode mode
= TYPE_MODE (TREE_TYPE (fndecl
));
8270 if (!use_rsqrt_p (mode
))
8272 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl
));
8275 typedef rtx (*rsqrte_type
) (rtx
, rtx
);
8277 /* Select reciprocal square root initial estimate insn depending on machine
8281 get_rsqrte_type (machine_mode mode
)
8285 case E_DFmode
: return gen_aarch64_rsqrtedf
;
8286 case E_SFmode
: return gen_aarch64_rsqrtesf
;
8287 case E_V2DFmode
: return gen_aarch64_rsqrtev2df
;
8288 case E_V2SFmode
: return gen_aarch64_rsqrtev2sf
;
8289 case E_V4SFmode
: return gen_aarch64_rsqrtev4sf
;
8290 default: gcc_unreachable ();
8294 typedef rtx (*rsqrts_type
) (rtx
, rtx
, rtx
);
8296 /* Select reciprocal square root series step insn depending on machine mode. */
8299 get_rsqrts_type (machine_mode mode
)
8303 case E_DFmode
: return gen_aarch64_rsqrtsdf
;
8304 case E_SFmode
: return gen_aarch64_rsqrtssf
;
8305 case E_V2DFmode
: return gen_aarch64_rsqrtsv2df
;
8306 case E_V2SFmode
: return gen_aarch64_rsqrtsv2sf
;
8307 case E_V4SFmode
: return gen_aarch64_rsqrtsv4sf
;
8308 default: gcc_unreachable ();
8312 /* Emit instruction sequence to compute either the approximate square root
8313 or its approximate reciprocal, depending on the flag RECP, and return
8314 whether the sequence was emitted or not. */
8317 aarch64_emit_approx_sqrt (rtx dst
, rtx src
, bool recp
)
8319 machine_mode mode
= GET_MODE (dst
);
8321 if (GET_MODE_INNER (mode
) == HFmode
)
8329 if (!(flag_mlow_precision_sqrt
8330 || (aarch64_tune_params
.approx_modes
->sqrt
8331 & AARCH64_APPROX_MODE (mode
))))
8334 if (flag_finite_math_only
8335 || flag_trapping_math
8336 || !flag_unsafe_math_optimizations
8337 || optimize_function_for_size_p (cfun
))
8341 /* Caller assumes we cannot fail. */
8342 gcc_assert (use_rsqrt_p (mode
));
8344 machine_mode mmsk
= mode_for_int_vector (mode
).require ();
8345 rtx xmsk
= gen_reg_rtx (mmsk
);
8347 /* When calculating the approximate square root, compare the
8348 argument with 0.0 and create a mask. */
8349 emit_insn (gen_rtx_SET (xmsk
,
8351 gen_rtx_EQ (mmsk
, src
,
8352 CONST0_RTX (mode
)))));
8354 /* Estimate the approximate reciprocal square root. */
8355 rtx xdst
= gen_reg_rtx (mode
);
8356 emit_insn ((*get_rsqrte_type (mode
)) (xdst
, src
));
8358 /* Iterate over the series twice for SF and thrice for DF. */
8359 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
8361 /* Optionally iterate over the series once less for faster performance
8362 while sacrificing the accuracy. */
8363 if ((recp
&& flag_mrecip_low_precision_sqrt
)
8364 || (!recp
&& flag_mlow_precision_sqrt
))
8367 /* Iterate over the series to calculate the approximate reciprocal square
8369 rtx x1
= gen_reg_rtx (mode
);
8370 while (iterations
--)
8372 rtx x2
= gen_reg_rtx (mode
);
8373 emit_set_insn (x2
, gen_rtx_MULT (mode
, xdst
, xdst
));
8375 emit_insn ((*get_rsqrts_type (mode
)) (x1
, src
, x2
));
8378 emit_set_insn (xdst
, gen_rtx_MULT (mode
, xdst
, x1
));
8383 /* Qualify the approximate reciprocal square root when the argument is
8384 0.0 by squashing the intermediary result to 0.0. */
8385 rtx xtmp
= gen_reg_rtx (mmsk
);
8386 emit_set_insn (xtmp
, gen_rtx_AND (mmsk
, gen_rtx_NOT (mmsk
, xmsk
),
8387 gen_rtx_SUBREG (mmsk
, xdst
, 0)));
8388 emit_move_insn (xdst
, gen_rtx_SUBREG (mode
, xtmp
, 0));
8390 /* Calculate the approximate square root. */
8391 emit_set_insn (xdst
, gen_rtx_MULT (mode
, xdst
, src
));
8394 /* Finalize the approximation. */
8395 emit_set_insn (dst
, gen_rtx_MULT (mode
, xdst
, x1
));
8400 typedef rtx (*recpe_type
) (rtx
, rtx
);
8402 /* Select reciprocal initial estimate insn depending on machine mode. */
8405 get_recpe_type (machine_mode mode
)
8409 case E_SFmode
: return (gen_aarch64_frecpesf
);
8410 case E_V2SFmode
: return (gen_aarch64_frecpev2sf
);
8411 case E_V4SFmode
: return (gen_aarch64_frecpev4sf
);
8412 case E_DFmode
: return (gen_aarch64_frecpedf
);
8413 case E_V2DFmode
: return (gen_aarch64_frecpev2df
);
8414 default: gcc_unreachable ();
8418 typedef rtx (*recps_type
) (rtx
, rtx
, rtx
);
8420 /* Select reciprocal series step insn depending on machine mode. */
8423 get_recps_type (machine_mode mode
)
8427 case E_SFmode
: return (gen_aarch64_frecpssf
);
8428 case E_V2SFmode
: return (gen_aarch64_frecpsv2sf
);
8429 case E_V4SFmode
: return (gen_aarch64_frecpsv4sf
);
8430 case E_DFmode
: return (gen_aarch64_frecpsdf
);
8431 case E_V2DFmode
: return (gen_aarch64_frecpsv2df
);
8432 default: gcc_unreachable ();
8436 /* Emit the instruction sequence to compute the approximation for the division
8437 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
8440 aarch64_emit_approx_div (rtx quo
, rtx num
, rtx den
)
8442 machine_mode mode
= GET_MODE (quo
);
8444 if (GET_MODE_INNER (mode
) == HFmode
)
8447 bool use_approx_division_p
= (flag_mlow_precision_div
8448 || (aarch64_tune_params
.approx_modes
->division
8449 & AARCH64_APPROX_MODE (mode
)));
8451 if (!flag_finite_math_only
8452 || flag_trapping_math
8453 || !flag_unsafe_math_optimizations
8454 || optimize_function_for_size_p (cfun
)
8455 || !use_approx_division_p
)
8458 /* Estimate the approximate reciprocal. */
8459 rtx xrcp
= gen_reg_rtx (mode
);
8460 emit_insn ((*get_recpe_type (mode
)) (xrcp
, den
));
8462 /* Iterate over the series twice for SF and thrice for DF. */
8463 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
8465 /* Optionally iterate over the series once less for faster performance,
8466 while sacrificing the accuracy. */
8467 if (flag_mlow_precision_div
)
8470 /* Iterate over the series to calculate the approximate reciprocal. */
8471 rtx xtmp
= gen_reg_rtx (mode
);
8472 while (iterations
--)
8474 emit_insn ((*get_recps_type (mode
)) (xtmp
, xrcp
, den
));
8477 emit_set_insn (xrcp
, gen_rtx_MULT (mode
, xrcp
, xtmp
));
8480 if (num
!= CONST1_RTX (mode
))
8482 /* As the approximate reciprocal of DEN is already calculated, only
8483 calculate the approximate division when NUM is not 1.0. */
8484 rtx xnum
= force_reg (mode
, num
);
8485 emit_set_insn (xrcp
, gen_rtx_MULT (mode
, xrcp
, xnum
));
8488 /* Finalize the approximation. */
8489 emit_set_insn (quo
, gen_rtx_MULT (mode
, xrcp
, xtmp
));
8493 /* Return the number of instructions that can be issued per cycle. */
8495 aarch64_sched_issue_rate (void)
8497 return aarch64_tune_params
.issue_rate
;
8501 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8503 int issue_rate
= aarch64_sched_issue_rate ();
8505 return issue_rate
> 1 && !sched_fusion
? issue_rate
: 0;
8509 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8510 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
8511 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
8514 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn
*insn
,
8517 return autopref_multipass_dfa_lookahead_guard (insn
, ready_index
);
8521 /* Vectorizer cost model target hooks. */
8523 /* Implement targetm.vectorize.builtin_vectorization_cost. */
8525 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost
,
8527 int misalign ATTRIBUTE_UNUSED
)
8530 const cpu_vector_cost
*costs
= aarch64_tune_params
.vec_costs
;
8533 if (vectype
!= NULL
)
8534 fp
= FLOAT_TYPE_P (vectype
);
8536 switch (type_of_cost
)
8539 return fp
? costs
->scalar_fp_stmt_cost
: costs
->scalar_int_stmt_cost
;
8542 return costs
->scalar_load_cost
;
8545 return costs
->scalar_store_cost
;
8548 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
8551 return costs
->vec_align_load_cost
;
8554 return costs
->vec_store_cost
;
8557 return costs
->vec_to_scalar_cost
;
8560 return costs
->scalar_to_vec_cost
;
8562 case unaligned_load
:
8563 case vector_gather_load
:
8564 return costs
->vec_unalign_load_cost
;
8566 case unaligned_store
:
8567 case vector_scatter_store
:
8568 return costs
->vec_unalign_store_cost
;
8570 case cond_branch_taken
:
8571 return costs
->cond_taken_branch_cost
;
8573 case cond_branch_not_taken
:
8574 return costs
->cond_not_taken_branch_cost
;
8577 return costs
->vec_permute_cost
;
8579 case vec_promote_demote
:
8580 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
8583 elements
= TYPE_VECTOR_SUBPARTS (vectype
);
8584 return elements
/ 2 + 1;
8591 /* Implement targetm.vectorize.add_stmt_cost. */
8593 aarch64_add_stmt_cost (void *data
, int count
, enum vect_cost_for_stmt kind
,
8594 struct _stmt_vec_info
*stmt_info
, int misalign
,
8595 enum vect_cost_model_location where
)
8597 unsigned *cost
= (unsigned *) data
;
8598 unsigned retval
= 0;
8600 if (flag_vect_cost_model
)
8602 tree vectype
= stmt_info
? stmt_vectype (stmt_info
) : NULL_TREE
;
8604 aarch64_builtin_vectorization_cost (kind
, vectype
, misalign
);
8606 /* Statements in an inner loop relative to the loop being
8607 vectorized are weighted more heavily. The value here is
8608 arbitrary and could potentially be improved with analysis. */
8609 if (where
== vect_body
&& stmt_info
&& stmt_in_inner_loop_p (stmt_info
))
8610 count
*= 50; /* FIXME */
8612 retval
= (unsigned) (count
* stmt_cost
);
8613 cost
[where
] += retval
;
8619 static void initialize_aarch64_code_model (struct gcc_options
*);
8621 /* Parse the TO_PARSE string and put the architecture struct that it
8622 selects into RES and the architectural features into ISA_FLAGS.
8623 Return an aarch64_parse_opt_result describing the parse result.
8624 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
8626 static enum aarch64_parse_opt_result
8627 aarch64_parse_arch (const char *to_parse
, const struct processor
**res
,
8628 unsigned long *isa_flags
)
8631 const struct processor
*arch
;
8632 char *str
= (char *) alloca (strlen (to_parse
) + 1);
8635 strcpy (str
, to_parse
);
8637 ext
= strchr (str
, '+');
8645 return AARCH64_PARSE_MISSING_ARG
;
8648 /* Loop through the list of supported ARCHes to find a match. */
8649 for (arch
= all_architectures
; arch
->name
!= NULL
; arch
++)
8651 if (strlen (arch
->name
) == len
&& strncmp (arch
->name
, str
, len
) == 0)
8653 unsigned long isa_temp
= arch
->flags
;
8657 /* TO_PARSE string contains at least one extension. */
8658 enum aarch64_parse_opt_result ext_res
8659 = aarch64_parse_extension (ext
, &isa_temp
);
8661 if (ext_res
!= AARCH64_PARSE_OK
)
8664 /* Extension parsing was successful. Confirm the result
8665 arch and ISA flags. */
8667 *isa_flags
= isa_temp
;
8668 return AARCH64_PARSE_OK
;
8672 /* ARCH name not found in list. */
8673 return AARCH64_PARSE_INVALID_ARG
;
8676 /* Parse the TO_PARSE string and put the result tuning in RES and the
8677 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
8678 describing the parse result. If there is an error parsing, RES and
8679 ISA_FLAGS are left unchanged. */
8681 static enum aarch64_parse_opt_result
8682 aarch64_parse_cpu (const char *to_parse
, const struct processor
**res
,
8683 unsigned long *isa_flags
)
8686 const struct processor
*cpu
;
8687 char *str
= (char *) alloca (strlen (to_parse
) + 1);
8690 strcpy (str
, to_parse
);
8692 ext
= strchr (str
, '+');
8700 return AARCH64_PARSE_MISSING_ARG
;
8703 /* Loop through the list of supported CPUs to find a match. */
8704 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
8706 if (strlen (cpu
->name
) == len
&& strncmp (cpu
->name
, str
, len
) == 0)
8708 unsigned long isa_temp
= cpu
->flags
;
8713 /* TO_PARSE string contains at least one extension. */
8714 enum aarch64_parse_opt_result ext_res
8715 = aarch64_parse_extension (ext
, &isa_temp
);
8717 if (ext_res
!= AARCH64_PARSE_OK
)
8720 /* Extension parsing was successfull. Confirm the result
8721 cpu and ISA flags. */
8723 *isa_flags
= isa_temp
;
8724 return AARCH64_PARSE_OK
;
8728 /* CPU name not found in list. */
8729 return AARCH64_PARSE_INVALID_ARG
;
8732 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8733 Return an aarch64_parse_opt_result describing the parse result.
8734 If the parsing fails the RES does not change. */
8736 static enum aarch64_parse_opt_result
8737 aarch64_parse_tune (const char *to_parse
, const struct processor
**res
)
8739 const struct processor
*cpu
;
8740 char *str
= (char *) alloca (strlen (to_parse
) + 1);
8742 strcpy (str
, to_parse
);
8744 /* Loop through the list of supported CPUs to find a match. */
8745 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
8747 if (strcmp (cpu
->name
, str
) == 0)
8750 return AARCH64_PARSE_OK
;
8754 /* CPU name not found in list. */
8755 return AARCH64_PARSE_INVALID_ARG
;
8758 /* Parse TOKEN, which has length LENGTH to see if it is an option
8759 described in FLAG. If it is, return the index bit for that fusion type.
8760 If not, error (printing OPTION_NAME) and return zero. */
8763 aarch64_parse_one_option_token (const char *token
,
8765 const struct aarch64_flag_desc
*flag
,
8766 const char *option_name
)
8768 for (; flag
->name
!= NULL
; flag
++)
8770 if (length
== strlen (flag
->name
)
8771 && !strncmp (flag
->name
, token
, length
))
8775 error ("unknown flag passed in -moverride=%s (%s)", option_name
, token
);
8779 /* Parse OPTION which is a comma-separated list of flags to enable.
8780 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8781 default state we inherit from the CPU tuning structures. OPTION_NAME
8782 gives the top-level option we are parsing in the -moverride string,
8783 for use in error messages. */
8786 aarch64_parse_boolean_options (const char *option
,
8787 const struct aarch64_flag_desc
*flags
,
8788 unsigned int initial_state
,
8789 const char *option_name
)
8791 const char separator
= '.';
8792 const char* specs
= option
;
8793 const char* ntoken
= option
;
8794 unsigned int found_flags
= initial_state
;
8796 while ((ntoken
= strchr (specs
, separator
)))
8798 size_t token_length
= ntoken
- specs
;
8799 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
8803 /* If we find "none" (or, for simplicity's sake, an error) anywhere
8804 in the token stream, reset the supported operations. So:
8806 adrp+add.cmp+branch.none.adrp+add
8808 would have the result of turning on only adrp+add fusion. */
8812 found_flags
|= token_ops
;
8816 /* We ended with a comma, print something. */
8819 error ("%s string ill-formed\n", option_name
);
8823 /* We still have one more token to parse. */
8824 size_t token_length
= strlen (specs
);
8825 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
8832 found_flags
|= token_ops
;
8836 /* Support for overriding instruction fusion. */
8839 aarch64_parse_fuse_string (const char *fuse_string
,
8840 struct tune_params
*tune
)
8842 tune
->fusible_ops
= aarch64_parse_boolean_options (fuse_string
,
8843 aarch64_fusible_pairs
,
8848 /* Support for overriding other tuning flags. */
8851 aarch64_parse_tune_string (const char *tune_string
,
8852 struct tune_params
*tune
)
8854 tune
->extra_tuning_flags
8855 = aarch64_parse_boolean_options (tune_string
,
8856 aarch64_tuning_flags
,
8857 tune
->extra_tuning_flags
,
8861 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
8862 we understand. If it is, extract the option string and handoff to
8863 the appropriate function. */
8866 aarch64_parse_one_override_token (const char* token
,
8868 struct tune_params
*tune
)
8870 const struct aarch64_tuning_override_function
*fn
8871 = aarch64_tuning_override_functions
;
8873 const char *option_part
= strchr (token
, '=');
8876 error ("tuning string missing in option (%s)", token
);
8880 /* Get the length of the option name. */
8881 length
= option_part
- token
;
8882 /* Skip the '=' to get to the option string. */
8885 for (; fn
->name
!= NULL
; fn
++)
8887 if (!strncmp (fn
->name
, token
, length
))
8889 fn
->parse_override (option_part
, tune
);
8894 error ("unknown tuning option (%s)",token
);
8898 /* A checking mechanism for the implementation of the tls size. */
8901 initialize_aarch64_tls_size (struct gcc_options
*opts
)
8903 if (aarch64_tls_size
== 0)
8904 aarch64_tls_size
= 24;
8906 switch (opts
->x_aarch64_cmodel_var
)
8908 case AARCH64_CMODEL_TINY
:
8909 /* Both the default and maximum TLS size allowed under tiny is 1M which
8910 needs two instructions to address, so we clamp the size to 24. */
8911 if (aarch64_tls_size
> 24)
8912 aarch64_tls_size
= 24;
8914 case AARCH64_CMODEL_SMALL
:
8915 /* The maximum TLS size allowed under small is 4G. */
8916 if (aarch64_tls_size
> 32)
8917 aarch64_tls_size
= 32;
8919 case AARCH64_CMODEL_LARGE
:
8920 /* The maximum TLS size allowed under large is 16E.
8921 FIXME: 16E should be 64bit, we only support 48bit offset now. */
8922 if (aarch64_tls_size
> 48)
8923 aarch64_tls_size
= 48;
8932 /* Parse STRING looking for options in the format:
8933 string :: option:string
8934 option :: name=substring
8936 substring :: defined by option. */
8939 aarch64_parse_override_string (const char* input_string
,
8940 struct tune_params
* tune
)
8942 const char separator
= ':';
8943 size_t string_length
= strlen (input_string
) + 1;
8944 char *string_root
= (char *) xmalloc (sizeof (*string_root
) * string_length
);
8945 char *string
= string_root
;
8946 strncpy (string
, input_string
, string_length
);
8947 string
[string_length
- 1] = '\0';
8949 char* ntoken
= string
;
8951 while ((ntoken
= strchr (string
, separator
)))
8953 size_t token_length
= ntoken
- string
;
8954 /* Make this substring look like a string. */
8956 aarch64_parse_one_override_token (string
, token_length
, tune
);
8960 /* One last option to parse. */
8961 aarch64_parse_one_override_token (string
, strlen (string
), tune
);
8967 aarch64_override_options_after_change_1 (struct gcc_options
*opts
)
8969 /* PR 70044: We have to be careful about being called multiple times for the
8970 same function. This means all changes should be repeatable. */
8972 /* If the frame pointer is enabled, set it to a special value that behaves
8973 similar to frame pointer omission. If we don't do this all leaf functions
8974 will get a frame pointer even if flag_omit_leaf_frame_pointer is set.
8975 If flag_omit_frame_pointer has this special value, we must force the
8976 frame pointer if not in a leaf function. We also need to force it in a
8977 leaf function if flag_omit_frame_pointer is not set or if LR is used. */
8978 if (opts
->x_flag_omit_frame_pointer
== 0)
8979 opts
->x_flag_omit_frame_pointer
= 2;
8981 /* If not optimizing for size, set the default
8982 alignment to what the target wants. */
8983 if (!opts
->x_optimize_size
)
8985 if (opts
->x_align_loops
<= 0)
8986 opts
->x_align_loops
= aarch64_tune_params
.loop_align
;
8987 if (opts
->x_align_jumps
<= 0)
8988 opts
->x_align_jumps
= aarch64_tune_params
.jump_align
;
8989 if (opts
->x_align_functions
<= 0)
8990 opts
->x_align_functions
= aarch64_tune_params
.function_align
;
8993 /* We default to no pc-relative literal loads. */
8995 aarch64_pcrelative_literal_loads
= false;
8997 /* If -mpc-relative-literal-loads is set on the command line, this
8998 implies that the user asked for PC relative literal loads. */
8999 if (opts
->x_pcrelative_literal_loads
== 1)
9000 aarch64_pcrelative_literal_loads
= true;
9002 /* In the tiny memory model it makes no sense to disallow PC relative
9003 literal pool loads. */
9004 if (aarch64_cmodel
== AARCH64_CMODEL_TINY
9005 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
9006 aarch64_pcrelative_literal_loads
= true;
9008 /* When enabling the lower precision Newton series for the square root, also
9009 enable it for the reciprocal square root, since the latter is an
9010 intermediary step for the former. */
9011 if (flag_mlow_precision_sqrt
)
9012 flag_mrecip_low_precision_sqrt
= true;
9015 /* 'Unpack' up the internal tuning structs and update the options
9016 in OPTS. The caller must have set up selected_tune and selected_arch
9017 as all the other target-specific codegen decisions are
9018 derived from them. */
9021 aarch64_override_options_internal (struct gcc_options
*opts
)
9023 aarch64_tune_flags
= selected_tune
->flags
;
9024 aarch64_tune
= selected_tune
->sched_core
;
9025 /* Make a copy of the tuning parameters attached to the core, which
9026 we may later overwrite. */
9027 aarch64_tune_params
= *(selected_tune
->tune
);
9028 aarch64_architecture_version
= selected_arch
->architecture_version
;
9030 if (opts
->x_aarch64_override_tune_string
)
9031 aarch64_parse_override_string (opts
->x_aarch64_override_tune_string
,
9032 &aarch64_tune_params
);
9034 /* This target defaults to strict volatile bitfields. */
9035 if (opts
->x_flag_strict_volatile_bitfields
< 0 && abi_version_at_least (2))
9036 opts
->x_flag_strict_volatile_bitfields
= 1;
9038 initialize_aarch64_code_model (opts
);
9039 initialize_aarch64_tls_size (opts
);
9041 int queue_depth
= 0;
9042 switch (aarch64_tune_params
.autoprefetcher_model
)
9044 case tune_params::AUTOPREFETCHER_OFF
:
9047 case tune_params::AUTOPREFETCHER_WEAK
:
9050 case tune_params::AUTOPREFETCHER_STRONG
:
9051 queue_depth
= max_insn_queue_index
+ 1;
9057 /* We don't mind passing in global_options_set here as we don't use
9058 the *options_set structs anyway. */
9059 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH
,
9061 opts
->x_param_values
,
9062 global_options_set
.x_param_values
);
9064 /* Set up parameters to be used in prefetching algorithm. Do not
9065 override the defaults unless we are tuning for a core we have
9066 researched values for. */
9067 if (aarch64_tune_params
.prefetch
->num_slots
> 0)
9068 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES
,
9069 aarch64_tune_params
.prefetch
->num_slots
,
9070 opts
->x_param_values
,
9071 global_options_set
.x_param_values
);
9072 if (aarch64_tune_params
.prefetch
->l1_cache_size
>= 0)
9073 maybe_set_param_value (PARAM_L1_CACHE_SIZE
,
9074 aarch64_tune_params
.prefetch
->l1_cache_size
,
9075 opts
->x_param_values
,
9076 global_options_set
.x_param_values
);
9077 if (aarch64_tune_params
.prefetch
->l1_cache_line_size
>= 0)
9078 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE
,
9079 aarch64_tune_params
.prefetch
->l1_cache_line_size
,
9080 opts
->x_param_values
,
9081 global_options_set
.x_param_values
);
9082 if (aarch64_tune_params
.prefetch
->l2_cache_size
>= 0)
9083 maybe_set_param_value (PARAM_L2_CACHE_SIZE
,
9084 aarch64_tune_params
.prefetch
->l2_cache_size
,
9085 opts
->x_param_values
,
9086 global_options_set
.x_param_values
);
9088 /* Enable sw prefetching at specified optimization level for
9089 CPUS that have prefetch. Lower optimization level threshold by 1
9090 when profiling is enabled. */
9091 if (opts
->x_flag_prefetch_loop_arrays
< 0
9092 && !opts
->x_optimize_size
9093 && aarch64_tune_params
.prefetch
->default_opt_level
>= 0
9094 && opts
->x_optimize
>= aarch64_tune_params
.prefetch
->default_opt_level
)
9095 opts
->x_flag_prefetch_loop_arrays
= 1;
9097 aarch64_override_options_after_change_1 (opts
);
9100 /* Print a hint with a suggestion for a core or architecture name that
9101 most closely resembles what the user passed in STR. ARCH is true if
9102 the user is asking for an architecture name. ARCH is false if the user
9103 is asking for a core name. */
9106 aarch64_print_hint_for_core_or_arch (const char *str
, bool arch
)
9108 auto_vec
<const char *> candidates
;
9109 const struct processor
*entry
= arch
? all_architectures
: all_cores
;
9110 for (; entry
->name
!= NULL
; entry
++)
9111 candidates
.safe_push (entry
->name
);
9113 const char *hint
= candidates_list_and_hint (str
, s
, candidates
);
9115 inform (input_location
, "valid arguments are: %s;"
9116 " did you mean %qs?", s
, hint
);
9120 /* Print a hint with a suggestion for a core name that most closely resembles
9121 what the user passed in STR. */
9124 aarch64_print_hint_for_core (const char *str
)
9126 aarch64_print_hint_for_core_or_arch (str
, false);
9129 /* Print a hint with a suggestion for an architecture name that most closely
9130 resembles what the user passed in STR. */
9133 aarch64_print_hint_for_arch (const char *str
)
9135 aarch64_print_hint_for_core_or_arch (str
, true);
9138 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
9139 specified in STR and throw errors if appropriate. Put the results if
9140 they are valid in RES and ISA_FLAGS. Return whether the option is
9144 aarch64_validate_mcpu (const char *str
, const struct processor
**res
,
9145 unsigned long *isa_flags
)
9147 enum aarch64_parse_opt_result parse_res
9148 = aarch64_parse_cpu (str
, res
, isa_flags
);
9150 if (parse_res
== AARCH64_PARSE_OK
)
9155 case AARCH64_PARSE_MISSING_ARG
:
9156 error ("missing cpu name in %<-mcpu=%s%>", str
);
9158 case AARCH64_PARSE_INVALID_ARG
:
9159 error ("unknown value %qs for -mcpu", str
);
9160 aarch64_print_hint_for_core (str
);
9162 case AARCH64_PARSE_INVALID_FEATURE
:
9163 error ("invalid feature modifier in %<-mcpu=%s%>", str
);
9172 /* Validate a command-line -march option. Parse the arch and extensions
9173 (if any) specified in STR and throw errors if appropriate. Put the
9174 results, if they are valid, in RES and ISA_FLAGS. Return whether the
9178 aarch64_validate_march (const char *str
, const struct processor
**res
,
9179 unsigned long *isa_flags
)
9181 enum aarch64_parse_opt_result parse_res
9182 = aarch64_parse_arch (str
, res
, isa_flags
);
9184 if (parse_res
== AARCH64_PARSE_OK
)
9189 case AARCH64_PARSE_MISSING_ARG
:
9190 error ("missing arch name in %<-march=%s%>", str
);
9192 case AARCH64_PARSE_INVALID_ARG
:
9193 error ("unknown value %qs for -march", str
);
9194 aarch64_print_hint_for_arch (str
);
9196 case AARCH64_PARSE_INVALID_FEATURE
:
9197 error ("invalid feature modifier in %<-march=%s%>", str
);
9206 /* Validate a command-line -mtune option. Parse the cpu
9207 specified in STR and throw errors if appropriate. Put the
9208 result, if it is valid, in RES. Return whether the option is
9212 aarch64_validate_mtune (const char *str
, const struct processor
**res
)
9214 enum aarch64_parse_opt_result parse_res
9215 = aarch64_parse_tune (str
, res
);
9217 if (parse_res
== AARCH64_PARSE_OK
)
9222 case AARCH64_PARSE_MISSING_ARG
:
9223 error ("missing cpu name in %<-mtune=%s%>", str
);
9225 case AARCH64_PARSE_INVALID_ARG
:
9226 error ("unknown value %qs for -mtune", str
);
9227 aarch64_print_hint_for_core (str
);
9235 /* Return the CPU corresponding to the enum CPU.
9236 If it doesn't specify a cpu, return the default. */
9238 static const struct processor
*
9239 aarch64_get_tune_cpu (enum aarch64_processor cpu
)
9241 if (cpu
!= aarch64_none
)
9242 return &all_cores
[cpu
];
9244 /* The & 0x3f is to extract the bottom 6 bits that encode the
9245 default cpu as selected by the --with-cpu GCC configure option
9247 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
9248 flags mechanism should be reworked to make it more sane. */
9249 return &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
9252 /* Return the architecture corresponding to the enum ARCH.
9253 If it doesn't specify a valid architecture, return the default. */
9255 static const struct processor
*
9256 aarch64_get_arch (enum aarch64_arch arch
)
9258 if (arch
!= aarch64_no_arch
)
9259 return &all_architectures
[arch
];
9261 const struct processor
*cpu
= &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
9263 return &all_architectures
[cpu
->arch
];
9266 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
9267 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
9268 tuning structs. In particular it must set selected_tune and
9269 aarch64_isa_flags that define the available ISA features and tuning
9270 decisions. It must also set selected_arch as this will be used to
9271 output the .arch asm tags for each function. */
9274 aarch64_override_options (void)
9276 unsigned long cpu_isa
= 0;
9277 unsigned long arch_isa
= 0;
9278 aarch64_isa_flags
= 0;
9280 bool valid_cpu
= true;
9281 bool valid_tune
= true;
9282 bool valid_arch
= true;
9284 selected_cpu
= NULL
;
9285 selected_arch
= NULL
;
9286 selected_tune
= NULL
;
9288 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
9289 If either of -march or -mtune is given, they override their
9290 respective component of -mcpu. */
9291 if (aarch64_cpu_string
)
9292 valid_cpu
= aarch64_validate_mcpu (aarch64_cpu_string
, &selected_cpu
,
9295 if (aarch64_arch_string
)
9296 valid_arch
= aarch64_validate_march (aarch64_arch_string
, &selected_arch
,
9299 if (aarch64_tune_string
)
9300 valid_tune
= aarch64_validate_mtune (aarch64_tune_string
, &selected_tune
);
9302 /* If the user did not specify a processor, choose the default
9303 one for them. This will be the CPU set during configuration using
9304 --with-cpu, otherwise it is "generic". */
9309 selected_cpu
= &all_cores
[selected_arch
->ident
];
9310 aarch64_isa_flags
= arch_isa
;
9311 explicit_arch
= selected_arch
->arch
;
9315 /* Get default configure-time CPU. */
9316 selected_cpu
= aarch64_get_tune_cpu (aarch64_none
);
9317 aarch64_isa_flags
= TARGET_CPU_DEFAULT
>> 6;
9321 explicit_tune_core
= selected_tune
->ident
;
9323 /* If both -mcpu and -march are specified check that they are architecturally
9324 compatible, warn if they're not and prefer the -march ISA flags. */
9325 else if (selected_arch
)
9327 if (selected_arch
->arch
!= selected_cpu
->arch
)
9329 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
9330 all_architectures
[selected_cpu
->arch
].name
,
9331 selected_arch
->name
);
9333 aarch64_isa_flags
= arch_isa
;
9334 explicit_arch
= selected_arch
->arch
;
9335 explicit_tune_core
= selected_tune
? selected_tune
->ident
9336 : selected_cpu
->ident
;
9340 /* -mcpu but no -march. */
9341 aarch64_isa_flags
= cpu_isa
;
9342 explicit_tune_core
= selected_tune
? selected_tune
->ident
9343 : selected_cpu
->ident
;
9344 gcc_assert (selected_cpu
);
9345 selected_arch
= &all_architectures
[selected_cpu
->arch
];
9346 explicit_arch
= selected_arch
->arch
;
9349 /* Set the arch as well as we will need it when outputing
9350 the .arch directive in assembly. */
9353 gcc_assert (selected_cpu
);
9354 selected_arch
= &all_architectures
[selected_cpu
->arch
];
9358 selected_tune
= selected_cpu
;
9360 #ifndef HAVE_AS_MABI_OPTION
9361 /* The compiler may have been configured with 2.23.* binutils, which does
9362 not have support for ILP32. */
9364 error ("Assembler does not support -mabi=ilp32");
9367 if (aarch64_ra_sign_scope
!= AARCH64_FUNCTION_NONE
&& TARGET_ILP32
)
9368 sorry ("Return address signing is only supported for -mabi=lp64");
9370 /* Make sure we properly set up the explicit options. */
9371 if ((aarch64_cpu_string
&& valid_cpu
)
9372 || (aarch64_tune_string
&& valid_tune
))
9373 gcc_assert (explicit_tune_core
!= aarch64_none
);
9375 if ((aarch64_cpu_string
&& valid_cpu
)
9376 || (aarch64_arch_string
&& valid_arch
))
9377 gcc_assert (explicit_arch
!= aarch64_no_arch
);
9379 aarch64_override_options_internal (&global_options
);
9381 /* Save these options as the default ones in case we push and pop them later
9382 while processing functions with potential target attributes. */
9383 target_option_default_node
= target_option_current_node
9384 = build_target_option_node (&global_options
);
9387 /* Implement targetm.override_options_after_change. */
9390 aarch64_override_options_after_change (void)
9392 aarch64_override_options_after_change_1 (&global_options
);
9395 static struct machine_function
*
9396 aarch64_init_machine_status (void)
9398 struct machine_function
*machine
;
9399 machine
= ggc_cleared_alloc
<machine_function
> ();
9404 aarch64_init_expanders (void)
9406 init_machine_status
= aarch64_init_machine_status
;
9409 /* A checking mechanism for the implementation of the various code models. */
9411 initialize_aarch64_code_model (struct gcc_options
*opts
)
9413 if (opts
->x_flag_pic
)
9415 switch (opts
->x_aarch64_cmodel_var
)
9417 case AARCH64_CMODEL_TINY
:
9418 aarch64_cmodel
= AARCH64_CMODEL_TINY_PIC
;
9420 case AARCH64_CMODEL_SMALL
:
9421 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9422 aarch64_cmodel
= (flag_pic
== 2
9423 ? AARCH64_CMODEL_SMALL_PIC
9424 : AARCH64_CMODEL_SMALL_SPIC
);
9426 aarch64_cmodel
= AARCH64_CMODEL_SMALL_PIC
;
9429 case AARCH64_CMODEL_LARGE
:
9430 sorry ("code model %qs with -f%s", "large",
9431 opts
->x_flag_pic
> 1 ? "PIC" : "pic");
9438 aarch64_cmodel
= opts
->x_aarch64_cmodel_var
;
9441 /* Implement TARGET_OPTION_SAVE. */
9444 aarch64_option_save (struct cl_target_option
*ptr
, struct gcc_options
*opts
)
9446 ptr
->x_aarch64_override_tune_string
= opts
->x_aarch64_override_tune_string
;
9449 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
9450 using the information saved in PTR. */
9453 aarch64_option_restore (struct gcc_options
*opts
, struct cl_target_option
*ptr
)
9455 opts
->x_explicit_tune_core
= ptr
->x_explicit_tune_core
;
9456 selected_tune
= aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
9457 opts
->x_explicit_arch
= ptr
->x_explicit_arch
;
9458 selected_arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
9459 opts
->x_aarch64_override_tune_string
= ptr
->x_aarch64_override_tune_string
;
9461 aarch64_override_options_internal (opts
);
9464 /* Implement TARGET_OPTION_PRINT. */
9467 aarch64_option_print (FILE *file
, int indent
, struct cl_target_option
*ptr
)
9469 const struct processor
*cpu
9470 = aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
9471 unsigned long isa_flags
= ptr
->x_aarch64_isa_flags
;
9472 const struct processor
*arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
9473 std::string extension
9474 = aarch64_get_extension_string_for_isa_flags (isa_flags
, arch
->flags
);
9476 fprintf (file
, "%*sselected tune = %s\n", indent
, "", cpu
->name
);
9477 fprintf (file
, "%*sselected arch = %s%s\n", indent
, "",
9478 arch
->name
, extension
.c_str ());
/* The most recent fndecl processed by aarch64_set_current_function;
   used to skip redundant target-state switching for the same function.  */
9481 static GTY(()) tree aarch64_previous_fndecl
;
9484 aarch64_reset_previous_fndecl (void)
9486 aarch64_previous_fndecl
= NULL
;
9489 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9490 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9491 make sure optab availability predicates are recomputed when necessary. */
9494 aarch64_save_restore_target_globals (tree new_tree
)
9496 if (TREE_TARGET_GLOBALS (new_tree
))
9497 restore_target_globals (TREE_TARGET_GLOBALS (new_tree
));
9498 else if (new_tree
== target_option_default_node
)
9499 restore_target_globals (&default_target_globals
);
9501 TREE_TARGET_GLOBALS (new_tree
) = save_target_globals_default_opts ();
9504 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
9505 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9506 of the function, if such exists. This function may be called multiple
9507 times on a single function so use aarch64_previous_fndecl to avoid
9508 setting up identical state. */
9511 aarch64_set_current_function (tree fndecl
)
9513 if (!fndecl
|| fndecl
== aarch64_previous_fndecl
)
9516 tree old_tree
= (aarch64_previous_fndecl
9517 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl
)
9520 tree new_tree
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
9522 /* If current function has no attributes but the previous one did,
9523 use the default node. */
9524 if (!new_tree
&& old_tree
)
9525 new_tree
= target_option_default_node
;
9527 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
9528 the default have been handled by aarch64_save_restore_target_globals from
9529 aarch64_pragma_target_parse. */
9530 if (old_tree
== new_tree
)
9533 aarch64_previous_fndecl
= fndecl
;
9535 /* First set the target options. */
9536 cl_target_option_restore (&global_options
, TREE_TARGET_OPTION (new_tree
));
9538 aarch64_save_restore_target_globals (new_tree
);
/* Enum describing the various ways we can handle attributes.
   In many cases we can reuse the generic option handling machinery.  */

enum aarch64_attr_opt_type
{
  aarch64_attr_mask,	/* Attribute should set a bit in target_flags.  */
  aarch64_attr_bool,	/* Attribute sets or unsets a boolean variable.  */
  aarch64_attr_enum,	/* Attribute sets an enum variable.  */
  aarch64_attr_custom	/* Attribute requires a custom handling function.  */
};
9552 /* All the information needed to handle a target attribute.
9553 NAME is the name of the attribute.
9554 ATTR_TYPE specifies the type of behavior of the attribute as described
9555 in the definition of enum aarch64_attr_opt_type.
9556 ALLOW_NEG is true if the attribute supports a "no-" form.
9557 HANDLER is the function that takes the attribute string and whether
9558 it is a pragma or attribute and handles the option. It is needed only
9559 when the ATTR_TYPE is aarch64_attr_custom.
9560 OPT_NUM is the enum specifying the option that the attribute modifies.
9561 This is needed for attributes that mirror the behavior of a command-line
9562 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9563 aarch64_attr_enum. */
9565 struct aarch64_attribute_info
9568 enum aarch64_attr_opt_type attr_type
;
9570 bool (*handler
) (const char *, const char *);
9571 enum opt_code opt_num
;
9574 /* Handle the ARCH_STR argument to the arch= target attribute.
9575 PRAGMA_OR_ATTR is used in potential error messages. */
9578 aarch64_handle_attr_arch (const char *str
, const char *pragma_or_attr
)
9580 const struct processor
*tmp_arch
= NULL
;
9581 enum aarch64_parse_opt_result parse_res
9582 = aarch64_parse_arch (str
, &tmp_arch
, &aarch64_isa_flags
);
9584 if (parse_res
== AARCH64_PARSE_OK
)
9586 gcc_assert (tmp_arch
);
9587 selected_arch
= tmp_arch
;
9588 explicit_arch
= selected_arch
->arch
;
9594 case AARCH64_PARSE_MISSING_ARG
:
9595 error ("missing architecture name in 'arch' target %s", pragma_or_attr
);
9597 case AARCH64_PARSE_INVALID_ARG
:
9598 error ("unknown value %qs for 'arch' target %s", str
, pragma_or_attr
);
9599 aarch64_print_hint_for_arch (str
);
9601 case AARCH64_PARSE_INVALID_FEATURE
:
9602 error ("invalid feature modifier %qs for 'arch' target %s",
9603 str
, pragma_or_attr
);
9612 /* Handle the argument CPU_STR to the cpu= target attribute.
9613 PRAGMA_OR_ATTR is used in potential error messages. */
9616 aarch64_handle_attr_cpu (const char *str
, const char *pragma_or_attr
)
9618 const struct processor
*tmp_cpu
= NULL
;
9619 enum aarch64_parse_opt_result parse_res
9620 = aarch64_parse_cpu (str
, &tmp_cpu
, &aarch64_isa_flags
);
9622 if (parse_res
== AARCH64_PARSE_OK
)
9624 gcc_assert (tmp_cpu
);
9625 selected_tune
= tmp_cpu
;
9626 explicit_tune_core
= selected_tune
->ident
;
9628 selected_arch
= &all_architectures
[tmp_cpu
->arch
];
9629 explicit_arch
= selected_arch
->arch
;
9635 case AARCH64_PARSE_MISSING_ARG
:
9636 error ("missing cpu name in 'cpu' target %s", pragma_or_attr
);
9638 case AARCH64_PARSE_INVALID_ARG
:
9639 error ("unknown value %qs for 'cpu' target %s", str
, pragma_or_attr
);
9640 aarch64_print_hint_for_core (str
);
9642 case AARCH64_PARSE_INVALID_FEATURE
:
9643 error ("invalid feature modifier %qs for 'cpu' target %s",
9644 str
, pragma_or_attr
);
9653 /* Handle the argument STR to the tune= target attribute.
9654 PRAGMA_OR_ATTR is used in potential error messages. */
9657 aarch64_handle_attr_tune (const char *str
, const char *pragma_or_attr
)
9659 const struct processor
*tmp_tune
= NULL
;
9660 enum aarch64_parse_opt_result parse_res
9661 = aarch64_parse_tune (str
, &tmp_tune
);
9663 if (parse_res
== AARCH64_PARSE_OK
)
9665 gcc_assert (tmp_tune
);
9666 selected_tune
= tmp_tune
;
9667 explicit_tune_core
= selected_tune
->ident
;
9673 case AARCH64_PARSE_INVALID_ARG
:
9674 error ("unknown value %qs for 'tune' target %s", str
, pragma_or_attr
);
9675 aarch64_print_hint_for_core (str
);
9684 /* Parse an architecture extensions target attribute string specified in STR.
9685 For example "+fp+nosimd". Show any errors if needed. Return TRUE
9686 if successful. Update aarch64_isa_flags to reflect the ISA features
9688 PRAGMA_OR_ATTR is used in potential error messages. */
9691 aarch64_handle_attr_isa_flags (char *str
, const char *pragma_or_attr
)
9693 enum aarch64_parse_opt_result parse_res
;
9694 unsigned long isa_flags
= aarch64_isa_flags
;
9696 /* We allow "+nothing" in the beginning to clear out all architectural
9697 features if the user wants to handpick specific features. */
9698 if (strncmp ("+nothing", str
, 8) == 0)
9704 parse_res
= aarch64_parse_extension (str
, &isa_flags
);
9706 if (parse_res
== AARCH64_PARSE_OK
)
9708 aarch64_isa_flags
= isa_flags
;
9714 case AARCH64_PARSE_MISSING_ARG
:
9715 error ("missing feature modifier in target %s %qs",
9716 pragma_or_attr
, str
);
9719 case AARCH64_PARSE_INVALID_FEATURE
:
9720 error ("invalid feature modifier in target %s %qs",
9721 pragma_or_attr
, str
);
9731 /* The target attributes that we support. On top of these we also support just
9732 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
9733 handled explicitly in aarch64_process_one_target_attr. */
9735 static const struct aarch64_attribute_info aarch64_attributes
[] =
9737 { "general-regs-only", aarch64_attr_mask
, false, NULL
,
9738 OPT_mgeneral_regs_only
},
9739 { "fix-cortex-a53-835769", aarch64_attr_bool
, true, NULL
,
9740 OPT_mfix_cortex_a53_835769
},
9741 { "fix-cortex-a53-843419", aarch64_attr_bool
, true, NULL
,
9742 OPT_mfix_cortex_a53_843419
},
9743 { "cmodel", aarch64_attr_enum
, false, NULL
, OPT_mcmodel_
},
9744 { "strict-align", aarch64_attr_mask
, false, NULL
, OPT_mstrict_align
},
9745 { "omit-leaf-frame-pointer", aarch64_attr_bool
, true, NULL
,
9746 OPT_momit_leaf_frame_pointer
},
9747 { "tls-dialect", aarch64_attr_enum
, false, NULL
, OPT_mtls_dialect_
},
9748 { "arch", aarch64_attr_custom
, false, aarch64_handle_attr_arch
,
9750 { "cpu", aarch64_attr_custom
, false, aarch64_handle_attr_cpu
, OPT_mcpu_
},
9751 { "tune", aarch64_attr_custom
, false, aarch64_handle_attr_tune
,
9753 { "sign-return-address", aarch64_attr_enum
, false, NULL
,
9754 OPT_msign_return_address_
},
9755 { NULL
, aarch64_attr_custom
, false, NULL
, OPT____
}
9758 /* Parse ARG_STR which contains the definition of one target attribute.
9759 Show appropriate errors if any or return true if the attribute is valid.
9760 PRAGMA_OR_ATTR holds the string to use in error messages about whether
9761 we're processing a target attribute or pragma. */
9764 aarch64_process_one_target_attr (char *arg_str
, const char* pragma_or_attr
)
9766 bool invert
= false;
9768 size_t len
= strlen (arg_str
);
9772 error ("malformed target %s", pragma_or_attr
);
9776 char *str_to_check
= (char *) alloca (len
+ 1);
9777 strcpy (str_to_check
, arg_str
);
9779 /* Skip leading whitespace. */
9780 while (*str_to_check
== ' ' || *str_to_check
== '\t')
9783 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9784 It is easier to detect and handle it explicitly here rather than going
9785 through the machinery for the rest of the target attributes in this
9787 if (*str_to_check
== '+')
9788 return aarch64_handle_attr_isa_flags (str_to_check
, pragma_or_attr
);
9790 if (len
> 3 && strncmp (str_to_check
, "no-", 3) == 0)
9795 char *arg
= strchr (str_to_check
, '=');
9797 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9798 and point ARG to "foo". */
9804 const struct aarch64_attribute_info
*p_attr
;
9806 for (p_attr
= aarch64_attributes
; p_attr
->name
; p_attr
++)
9808 /* If the names don't match up, or the user has given an argument
9809 to an attribute that doesn't accept one, or didn't give an argument
9810 to an attribute that expects one, fail to match. */
9811 if (strcmp (str_to_check
, p_attr
->name
) != 0)
9815 bool attr_need_arg_p
= p_attr
->attr_type
== aarch64_attr_custom
9816 || p_attr
->attr_type
== aarch64_attr_enum
;
9818 if (attr_need_arg_p
^ (arg
!= NULL
))
9820 error ("target %s %qs does not accept an argument",
9821 pragma_or_attr
, str_to_check
);
9825 /* If the name matches but the attribute does not allow "no-" versions
9826 then we can't match. */
9827 if (invert
&& !p_attr
->allow_neg
)
9829 error ("target %s %qs does not allow a negated form",
9830 pragma_or_attr
, str_to_check
);
9834 switch (p_attr
->attr_type
)
9836 /* Has a custom handler registered.
9837 For example, cpu=, arch=, tune=. */
9838 case aarch64_attr_custom
:
9839 gcc_assert (p_attr
->handler
);
9840 if (!p_attr
->handler (arg
, pragma_or_attr
))
9844 /* Either set or unset a boolean option. */
9845 case aarch64_attr_bool
:
9847 struct cl_decoded_option decoded
;
9849 generate_option (p_attr
->opt_num
, NULL
, !invert
,
9850 CL_TARGET
, &decoded
);
9851 aarch64_handle_option (&global_options
, &global_options_set
,
9852 &decoded
, input_location
);
9855 /* Set or unset a bit in the target_flags. aarch64_handle_option
9856 should know what mask to apply given the option number. */
9857 case aarch64_attr_mask
:
9859 struct cl_decoded_option decoded
;
9860 /* We only need to specify the option number.
9861 aarch64_handle_option will know which mask to apply. */
9862 decoded
.opt_index
= p_attr
->opt_num
;
9863 decoded
.value
= !invert
;
9864 aarch64_handle_option (&global_options
, &global_options_set
,
9865 &decoded
, input_location
);
9868 /* Use the option setting machinery to set an option to an enum. */
9869 case aarch64_attr_enum
:
9874 valid
= opt_enum_arg_to_value (p_attr
->opt_num
, arg
,
9878 set_option (&global_options
, NULL
, p_attr
->opt_num
, value
,
9879 NULL
, DK_UNSPECIFIED
, input_location
,
9884 error ("target %s %s=%s is not valid",
9885 pragma_or_attr
, str_to_check
, arg
);
9894 /* If we reached here we either have found an attribute and validated
9895 it or didn't match any. If we matched an attribute but its arguments
9896 were malformed we will have returned false already. */
/* Count how many times the character C appears in
   NULL-terminated string STR.  */

static unsigned int
num_occurences_in_str (char c, char *str)
{
  unsigned int res = 0;
  for (char *p = str; *p != '\0'; p++)
    if (*p == c)
      res++;

  return res;
}
9918 /* Parse the tree in ARGS that contains the target attribute information
9919 and update the global target options space. PRAGMA_OR_ATTR is a string
9920 to be used in error messages, specifying whether this is processing
9921 a target attribute or a target pragma. */
9924 aarch64_process_target_attr (tree args
, const char* pragma_or_attr
)
9926 if (TREE_CODE (args
) == TREE_LIST
)
9930 tree head
= TREE_VALUE (args
);
9933 if (!aarch64_process_target_attr (head
, pragma_or_attr
))
9936 args
= TREE_CHAIN (args
);
9942 if (TREE_CODE (args
) != STRING_CST
)
9944 error ("attribute %<target%> argument not a string");
9948 size_t len
= strlen (TREE_STRING_POINTER (args
));
9949 char *str_to_check
= (char *) alloca (len
+ 1);
9950 strcpy (str_to_check
, TREE_STRING_POINTER (args
));
9954 error ("malformed target %s value", pragma_or_attr
);
9958 /* Used to catch empty spaces between commas i.e.
9959 attribute ((target ("attr1,,attr2"))). */
9960 unsigned int num_commas
= num_occurences_in_str (',', str_to_check
);
9962 /* Handle multiple target attributes separated by ','. */
9963 char *token
= strtok (str_to_check
, ",");
9965 unsigned int num_attrs
= 0;
9969 if (!aarch64_process_one_target_attr (token
, pragma_or_attr
))
9971 error ("target %s %qs is invalid", pragma_or_attr
, token
);
9975 token
= strtok (NULL
, ",");
9978 if (num_attrs
!= num_commas
+ 1)
9980 error ("malformed target %s list %qs",
9981 pragma_or_attr
, TREE_STRING_POINTER (args
));
9988 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9989 process attribute ((target ("..."))). */
9992 aarch64_option_valid_attribute_p (tree fndecl
, tree
, tree args
, int)
9994 struct cl_target_option cur_target
;
9997 tree new_target
, new_optimize
;
9998 tree existing_target
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
10000 /* If what we're processing is the current pragma string then the
10001 target option node is already stored in target_option_current_node
10002 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
10003 having to re-parse the string. This is especially useful to keep
10004 arm_neon.h compile times down since that header contains a lot
10005 of intrinsics enclosed in pragmas. */
10006 if (!existing_target
&& args
== current_target_pragma
)
10008 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = target_option_current_node
;
10011 tree func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
10013 old_optimize
= build_optimization_node (&global_options
);
10014 func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
10016 /* If the function changed the optimization levels as well as setting
10017 target options, start with the optimizations specified. */
10018 if (func_optimize
&& func_optimize
!= old_optimize
)
10019 cl_optimization_restore (&global_options
,
10020 TREE_OPTIMIZATION (func_optimize
));
10022 /* Save the current target options to restore at the end. */
10023 cl_target_option_save (&cur_target
, &global_options
);
10025 /* If fndecl already has some target attributes applied to it, unpack
10026 them so that we add this attribute on top of them, rather than
10027 overwriting them. */
10028 if (existing_target
)
10030 struct cl_target_option
*existing_options
10031 = TREE_TARGET_OPTION (existing_target
);
10033 if (existing_options
)
10034 cl_target_option_restore (&global_options
, existing_options
);
10037 cl_target_option_restore (&global_options
,
10038 TREE_TARGET_OPTION (target_option_current_node
));
10041 ret
= aarch64_process_target_attr (args
, "attribute");
10043 /* Set up any additional state. */
10046 aarch64_override_options_internal (&global_options
);
10047 /* Initialize SIMD builtins if we haven't already.
10048 Set current_target_pragma to NULL for the duration so that
10049 the builtin initialization code doesn't try to tag the functions
10050 being built with the attributes specified by any current pragma, thus
10051 going into an infinite recursion. */
10054 tree saved_current_target_pragma
= current_target_pragma
;
10055 current_target_pragma
= NULL
;
10056 aarch64_init_simd_builtins ();
10057 current_target_pragma
= saved_current_target_pragma
;
10059 new_target
= build_target_option_node (&global_options
);
10064 new_optimize
= build_optimization_node (&global_options
);
10068 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = new_target
;
10070 if (old_optimize
!= new_optimize
)
10071 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
) = new_optimize
;
10074 cl_target_option_restore (&global_options
, &cur_target
);
10076 if (old_optimize
!= new_optimize
)
10077 cl_optimization_restore (&global_options
,
10078 TREE_OPTIMIZATION (old_optimize
));
/* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
   tri-bool options (yes, no, don't care) and the default value is
   DEF, determine whether to reject inlining.  DONT_CARE is the encoding
   of the "don't care" state.  Returns true when inlining is allowed.  */

static bool
aarch64_tribools_ok_for_inlining_p (int caller, int callee,
				    int dont_care, int def)
{
  /* If the callee doesn't care, always allow inlining.  */
  if (callee == dont_care)
    return true;

  /* If the caller doesn't care, always allow inlining.  */
  if (caller == dont_care)
    return true;

  /* Otherwise, allow inlining if either the callee and caller values
     agree, or if the callee is using the default value.  */
  return (callee == caller || callee == def);
}
10103 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
10104 to inline CALLEE into CALLER based on target-specific info.
10105 Make sure that the caller and callee have compatible architectural
10106 features. Then go through the other possible target attributes
10107 and see if they can block inlining. Try not to reject always_inline
10108 callees unless they are incompatible architecturally. */
10111 aarch64_can_inline_p (tree caller
, tree callee
)
10113 tree caller_tree
= DECL_FUNCTION_SPECIFIC_TARGET (caller
);
10114 tree callee_tree
= DECL_FUNCTION_SPECIFIC_TARGET (callee
);
10116 /* If callee has no option attributes, then it is ok to inline. */
10120 struct cl_target_option
*caller_opts
10121 = TREE_TARGET_OPTION (caller_tree
? caller_tree
10122 : target_option_default_node
);
10124 struct cl_target_option
*callee_opts
= TREE_TARGET_OPTION (callee_tree
);
10127 /* Callee's ISA flags should be a subset of the caller's. */
10128 if ((caller_opts
->x_aarch64_isa_flags
& callee_opts
->x_aarch64_isa_flags
)
10129 != callee_opts
->x_aarch64_isa_flags
)
10132 /* Allow non-strict aligned functions inlining into strict
10134 if ((TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)
10135 != TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
))
10136 && !(!TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
)
10137 && TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)))
10140 bool always_inline
= lookup_attribute ("always_inline",
10141 DECL_ATTRIBUTES (callee
));
10143 /* If the architectural features match up and the callee is always_inline
10144 then the other attributes don't matter. */
10148 if (caller_opts
->x_aarch64_cmodel_var
10149 != callee_opts
->x_aarch64_cmodel_var
)
10152 if (caller_opts
->x_aarch64_tls_dialect
10153 != callee_opts
->x_aarch64_tls_dialect
)
10156 /* Honour explicit requests to workaround errata. */
10157 if (!aarch64_tribools_ok_for_inlining_p (
10158 caller_opts
->x_aarch64_fix_a53_err835769
,
10159 callee_opts
->x_aarch64_fix_a53_err835769
,
10160 2, TARGET_FIX_ERR_A53_835769_DEFAULT
))
10163 if (!aarch64_tribools_ok_for_inlining_p (
10164 caller_opts
->x_aarch64_fix_a53_err843419
,
10165 callee_opts
->x_aarch64_fix_a53_err843419
,
10166 2, TARGET_FIX_ERR_A53_843419
))
10169 /* If the user explicitly specified -momit-leaf-frame-pointer for the
10170 caller and calle and they don't match up, reject inlining. */
10171 if (!aarch64_tribools_ok_for_inlining_p (
10172 caller_opts
->x_flag_omit_leaf_frame_pointer
,
10173 callee_opts
->x_flag_omit_leaf_frame_pointer
,
10177 /* If the callee has specific tuning overrides, respect them. */
10178 if (callee_opts
->x_aarch64_override_tune_string
!= NULL
10179 && caller_opts
->x_aarch64_override_tune_string
== NULL
)
10182 /* If the user specified tuning override strings for the
10183 caller and callee and they don't match up, reject inlining.
10184 We just do a string compare here, we don't analyze the meaning
10185 of the string, as it would be too costly for little gain. */
10186 if (callee_opts
->x_aarch64_override_tune_string
10187 && caller_opts
->x_aarch64_override_tune_string
10188 && (strcmp (callee_opts
->x_aarch64_override_tune_string
,
10189 caller_opts
->x_aarch64_override_tune_string
) != 0))
10195 /* Return true if SYMBOL_REF X binds locally. */
10198 aarch64_symbol_binds_local_p (const_rtx x
)
10200 return (SYMBOL_REF_DECL (x
)
10201 ? targetm
.binds_local_p (SYMBOL_REF_DECL (x
))
10202 : SYMBOL_REF_LOCAL_P (x
));
10205 /* Return true if SYMBOL_REF X is thread local */
10207 aarch64_tls_symbol_p (rtx x
)
10209 if (! TARGET_HAVE_TLS
)
10212 if (GET_CODE (x
) != SYMBOL_REF
)
10215 return SYMBOL_REF_TLS_MODEL (x
) != 0;
10218 /* Classify a TLS symbol into one of the TLS kinds. */
10219 enum aarch64_symbol_type
10220 aarch64_classify_tls_symbol (rtx x
)
10222 enum tls_model tls_kind
= tls_symbolic_operand_type (x
);
10226 case TLS_MODEL_GLOBAL_DYNAMIC
:
10227 case TLS_MODEL_LOCAL_DYNAMIC
:
10228 return TARGET_TLS_DESC
? SYMBOL_SMALL_TLSDESC
: SYMBOL_SMALL_TLSGD
;
10230 case TLS_MODEL_INITIAL_EXEC
:
10231 switch (aarch64_cmodel
)
10233 case AARCH64_CMODEL_TINY
:
10234 case AARCH64_CMODEL_TINY_PIC
:
10235 return SYMBOL_TINY_TLSIE
;
10237 return SYMBOL_SMALL_TLSIE
;
10240 case TLS_MODEL_LOCAL_EXEC
:
10241 if (aarch64_tls_size
== 12)
10242 return SYMBOL_TLSLE12
;
10243 else if (aarch64_tls_size
== 24)
10244 return SYMBOL_TLSLE24
;
10245 else if (aarch64_tls_size
== 32)
10246 return SYMBOL_TLSLE32
;
10247 else if (aarch64_tls_size
== 48)
10248 return SYMBOL_TLSLE48
;
10250 gcc_unreachable ();
10252 case TLS_MODEL_EMULATED
:
10253 case TLS_MODEL_NONE
:
10254 return SYMBOL_FORCE_TO_MEM
;
10257 gcc_unreachable ();
10261 /* Return the method that should be used to access SYMBOL_REF or
10264 enum aarch64_symbol_type
10265 aarch64_classify_symbol (rtx x
, rtx offset
)
10267 if (GET_CODE (x
) == LABEL_REF
)
10269 switch (aarch64_cmodel
)
10271 case AARCH64_CMODEL_LARGE
:
10272 return SYMBOL_FORCE_TO_MEM
;
10274 case AARCH64_CMODEL_TINY_PIC
:
10275 case AARCH64_CMODEL_TINY
:
10276 return SYMBOL_TINY_ABSOLUTE
;
10278 case AARCH64_CMODEL_SMALL_SPIC
:
10279 case AARCH64_CMODEL_SMALL_PIC
:
10280 case AARCH64_CMODEL_SMALL
:
10281 return SYMBOL_SMALL_ABSOLUTE
;
10284 gcc_unreachable ();
10288 if (GET_CODE (x
) == SYMBOL_REF
)
10290 if (aarch64_tls_symbol_p (x
))
10291 return aarch64_classify_tls_symbol (x
);
10293 switch (aarch64_cmodel
)
10295 case AARCH64_CMODEL_TINY
:
10296 /* When we retrieve symbol + offset address, we have to make sure
10297 the offset does not cause overflow of the final address. But
10298 we have no way of knowing the address of symbol at compile time
10299 so we can't accurately say if the distance between the PC and
10300 symbol + offset is outside the addressible range of +/-1M in the
10301 TINY code model. So we rely on images not being greater than
10302 1M and cap the offset at 1M and anything beyond 1M will have to
10303 be loaded using an alternative mechanism. Furthermore if the
10304 symbol is a weak reference to something that isn't known to
10305 resolve to a symbol in this module, then force to memory. */
10306 if ((SYMBOL_REF_WEAK (x
)
10307 && !aarch64_symbol_binds_local_p (x
))
10308 || INTVAL (offset
) < -1048575 || INTVAL (offset
) > 1048575)
10309 return SYMBOL_FORCE_TO_MEM
;
10310 return SYMBOL_TINY_ABSOLUTE
;
10312 case AARCH64_CMODEL_SMALL
:
10313 /* Same reasoning as the tiny code model, but the offset cap here is
10315 if ((SYMBOL_REF_WEAK (x
)
10316 && !aarch64_symbol_binds_local_p (x
))
10317 || !IN_RANGE (INTVAL (offset
), HOST_WIDE_INT_C (-4294967263),
10318 HOST_WIDE_INT_C (4294967264)))
10319 return SYMBOL_FORCE_TO_MEM
;
10320 return SYMBOL_SMALL_ABSOLUTE
;
10322 case AARCH64_CMODEL_TINY_PIC
:
10323 if (!aarch64_symbol_binds_local_p (x
))
10324 return SYMBOL_TINY_GOT
;
10325 return SYMBOL_TINY_ABSOLUTE
;
10327 case AARCH64_CMODEL_SMALL_SPIC
:
10328 case AARCH64_CMODEL_SMALL_PIC
:
10329 if (!aarch64_symbol_binds_local_p (x
))
10330 return (aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
10331 ? SYMBOL_SMALL_GOT_28K
: SYMBOL_SMALL_GOT_4G
);
10332 return SYMBOL_SMALL_ABSOLUTE
;
10334 case AARCH64_CMODEL_LARGE
:
10335 /* This is alright even in PIC code as the constant
10336 pool reference is always PC relative and within
10337 the same translation unit. */
10338 if (!aarch64_pcrelative_literal_loads
&& CONSTANT_POOL_ADDRESS_P (x
))
10339 return SYMBOL_SMALL_ABSOLUTE
;
10341 return SYMBOL_FORCE_TO_MEM
;
10344 gcc_unreachable ();
10348 /* By default push everything into the constant pool. */
10349 return SYMBOL_FORCE_TO_MEM
;
10353 aarch64_constant_address_p (rtx x
)
10355 return (CONSTANT_P (x
) && memory_address_p (DImode
, x
));
10359 aarch64_legitimate_pic_operand_p (rtx x
)
10361 if (GET_CODE (x
) == SYMBOL_REF
10362 || (GET_CODE (x
) == CONST
10363 && GET_CODE (XEXP (x
, 0)) == PLUS
10364 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
))
10370 /* Return true if X holds either a quarter-precision or
10371 floating-point +0.0 constant. */
10373 aarch64_valid_floating_const (rtx x
)
10375 if (!CONST_DOUBLE_P (x
))
10378 /* This call determines which constants can be used in mov<mode>
10379 as integer moves instead of constant loads. */
10380 if (aarch64_float_const_rtx_p (x
))
10383 return aarch64_float_const_representable_p (x
);
10387 aarch64_legitimate_constant_p (machine_mode mode
, rtx x
)
10389 /* Do not allow vector struct mode constants. We could support
10390 0 and -1 easily, but they need support in aarch64-simd.md. */
10391 if (TARGET_SIMD
&& aarch64_vect_struct_mode_p (mode
))
10394 /* For these cases we never want to use a literal load.
10395 As such we have to prevent the compiler from forcing these
10397 if ((GET_CODE (x
) == CONST_VECTOR
10398 && aarch64_simd_valid_immediate (x
, mode
, false, NULL
))
10400 || aarch64_valid_floating_const (x
)
10401 || aarch64_can_const_movi_rtx_p (x
, mode
)
10402 || aarch64_float_const_rtx_p (x
))
10403 return !targetm
.cannot_force_const_mem (mode
, x
);
10405 if (GET_CODE (x
) == HIGH
10406 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
10409 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
10410 so spilling them is better than rematerialization. */
10411 if (SYMBOL_REF_P (x
) && !SYMBOL_REF_TLS_MODEL (x
))
10414 return aarch64_constant_address_p (x
);
10418 aarch64_load_tp (rtx target
)
10421 || GET_MODE (target
) != Pmode
10422 || !register_operand (target
, Pmode
))
10423 target
= gen_reg_rtx (Pmode
);
10425 /* Can return in any reg. */
10426 emit_insn (gen_aarch64_load_tp_hard (target
));
10430 /* On AAPCS systems, this is the "struct __va_list". */
10431 static GTY(()) tree va_list_type
;
10433 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10434 Return the type to use as __builtin_va_list.
10436 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
10448 aarch64_build_builtin_va_list (void)
10451 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
10453 /* Create the type. */
10454 va_list_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
10455 /* Give it the required name. */
10456 va_list_name
= build_decl (BUILTINS_LOCATION
,
10458 get_identifier ("__va_list"),
10460 DECL_ARTIFICIAL (va_list_name
) = 1;
10461 TYPE_NAME (va_list_type
) = va_list_name
;
10462 TYPE_STUB_DECL (va_list_type
) = va_list_name
;
10464 /* Create the fields. */
10465 f_stack
= build_decl (BUILTINS_LOCATION
,
10466 FIELD_DECL
, get_identifier ("__stack"),
10468 f_grtop
= build_decl (BUILTINS_LOCATION
,
10469 FIELD_DECL
, get_identifier ("__gr_top"),
10471 f_vrtop
= build_decl (BUILTINS_LOCATION
,
10472 FIELD_DECL
, get_identifier ("__vr_top"),
10474 f_groff
= build_decl (BUILTINS_LOCATION
,
10475 FIELD_DECL
, get_identifier ("__gr_offs"),
10476 integer_type_node
);
10477 f_vroff
= build_decl (BUILTINS_LOCATION
,
10478 FIELD_DECL
, get_identifier ("__vr_offs"),
10479 integer_type_node
);
10481 /* Tell tree-stdarg pass about our internal offset fields.
10482 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
10483 purpose to identify whether the code is updating va_list internal
10484 offset fields through irregular way. */
10485 va_list_gpr_counter_field
= f_groff
;
10486 va_list_fpr_counter_field
= f_vroff
;
10488 DECL_ARTIFICIAL (f_stack
) = 1;
10489 DECL_ARTIFICIAL (f_grtop
) = 1;
10490 DECL_ARTIFICIAL (f_vrtop
) = 1;
10491 DECL_ARTIFICIAL (f_groff
) = 1;
10492 DECL_ARTIFICIAL (f_vroff
) = 1;
10494 DECL_FIELD_CONTEXT (f_stack
) = va_list_type
;
10495 DECL_FIELD_CONTEXT (f_grtop
) = va_list_type
;
10496 DECL_FIELD_CONTEXT (f_vrtop
) = va_list_type
;
10497 DECL_FIELD_CONTEXT (f_groff
) = va_list_type
;
10498 DECL_FIELD_CONTEXT (f_vroff
) = va_list_type
;
10500 TYPE_FIELDS (va_list_type
) = f_stack
;
10501 DECL_CHAIN (f_stack
) = f_grtop
;
10502 DECL_CHAIN (f_grtop
) = f_vrtop
;
10503 DECL_CHAIN (f_vrtop
) = f_groff
;
10504 DECL_CHAIN (f_groff
) = f_vroff
;
10506 /* Compute its layout. */
10507 layout_type (va_list_type
);
10509 return va_list_type
;
10512 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
10514 aarch64_expand_builtin_va_start (tree valist
, rtx nextarg ATTRIBUTE_UNUSED
)
10516 const CUMULATIVE_ARGS
*cum
;
10517 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
10518 tree stack
, grtop
, vrtop
, groff
, vroff
;
10520 int gr_save_area_size
= cfun
->va_list_gpr_size
;
10521 int vr_save_area_size
= cfun
->va_list_fpr_size
;
10524 cum
= &crtl
->args
.info
;
10525 if (cfun
->va_list_gpr_size
)
10526 gr_save_area_size
= MIN ((NUM_ARG_REGS
- cum
->aapcs_ncrn
) * UNITS_PER_WORD
,
10527 cfun
->va_list_gpr_size
);
10528 if (cfun
->va_list_fpr_size
)
10529 vr_save_area_size
= MIN ((NUM_FP_ARG_REGS
- cum
->aapcs_nvrn
)
10530 * UNITS_PER_VREG
, cfun
->va_list_fpr_size
);
10534 gcc_assert (cum
->aapcs_nvrn
== 0);
10535 vr_save_area_size
= 0;
10538 f_stack
= TYPE_FIELDS (va_list_type_node
);
10539 f_grtop
= DECL_CHAIN (f_stack
);
10540 f_vrtop
= DECL_CHAIN (f_grtop
);
10541 f_groff
= DECL_CHAIN (f_vrtop
);
10542 f_vroff
= DECL_CHAIN (f_groff
);
10544 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), valist
, f_stack
,
10546 grtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
), valist
, f_grtop
,
10548 vrtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
), valist
, f_vrtop
,
10550 groff
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
), valist
, f_groff
,
10552 vroff
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
), valist
, f_vroff
,
10555 /* Emit code to initialize STACK, which points to the next varargs stack
10556 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
10557 by named arguments. STACK is 8-byte aligned. */
10558 t
= make_tree (TREE_TYPE (stack
), virtual_incoming_args_rtx
);
10559 if (cum
->aapcs_stack_size
> 0)
10560 t
= fold_build_pointer_plus_hwi (t
, cum
->aapcs_stack_size
* UNITS_PER_WORD
);
10561 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), stack
, t
);
10562 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
10564 /* Emit code to initialize GRTOP, the top of the GR save area.
10565 virtual_incoming_args_rtx should have been 16 byte aligned. */
10566 t
= make_tree (TREE_TYPE (grtop
), virtual_incoming_args_rtx
);
10567 t
= build2 (MODIFY_EXPR
, TREE_TYPE (grtop
), grtop
, t
);
10568 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
10570 /* Emit code to initialize VRTOP, the top of the VR save area.
10571 This address is gr_save_area_bytes below GRTOP, rounded
10572 down to the next 16-byte boundary. */
10573 t
= make_tree (TREE_TYPE (vrtop
), virtual_incoming_args_rtx
);
10574 vr_offset
= ROUND_UP (gr_save_area_size
,
10575 STACK_BOUNDARY
/ BITS_PER_UNIT
);
10578 t
= fold_build_pointer_plus_hwi (t
, -vr_offset
);
10579 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vrtop
), vrtop
, t
);
10580 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
10582 /* Emit code to initialize GROFF, the offset from GRTOP of the
10583 next GPR argument. */
10584 t
= build2 (MODIFY_EXPR
, TREE_TYPE (groff
), groff
,
10585 build_int_cst (TREE_TYPE (groff
), -gr_save_area_size
));
10586 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
10588 /* Likewise emit code to initialize VROFF, the offset from FTOP
10589 of the next VR argument. */
10590 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vroff
), vroff
,
10591 build_int_cst (TREE_TYPE (vroff
), -vr_save_area_size
));
10592 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
10595 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
10598 aarch64_gimplify_va_arg_expr (tree valist
, tree type
, gimple_seq
*pre_p
,
10599 gimple_seq
*post_p ATTRIBUTE_UNUSED
)
10603 bool is_ha
; /* is HFA or HVA. */
10604 bool dw_align
; /* double-word align. */
10605 machine_mode ag_mode
= VOIDmode
;
10609 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
10610 tree stack
, f_top
, f_off
, off
, arg
, roundup
, on_stack
;
10611 HOST_WIDE_INT size
, rsize
, adjust
, align
;
10612 tree t
, u
, cond1
, cond2
;
10614 indirect_p
= pass_by_reference (NULL
, TYPE_MODE (type
), type
, false);
10616 type
= build_pointer_type (type
);
10618 mode
= TYPE_MODE (type
);
10620 f_stack
= TYPE_FIELDS (va_list_type_node
);
10621 f_grtop
= DECL_CHAIN (f_stack
);
10622 f_vrtop
= DECL_CHAIN (f_grtop
);
10623 f_groff
= DECL_CHAIN (f_vrtop
);
10624 f_vroff
= DECL_CHAIN (f_groff
);
10626 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), unshare_expr (valist
),
10627 f_stack
, NULL_TREE
);
10628 size
= int_size_in_bytes (type
);
10629 align
= aarch64_function_arg_alignment (mode
, type
) / BITS_PER_UNIT
;
10633 if (aarch64_vfp_is_call_or_return_candidate (mode
,
10639 /* TYPE passed in fp/simd registers. */
10641 aarch64_err_no_fpadvsimd (mode
, "varargs");
10643 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
),
10644 unshare_expr (valist
), f_vrtop
, NULL_TREE
);
10645 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
),
10646 unshare_expr (valist
), f_vroff
, NULL_TREE
);
10648 rsize
= nregs
* UNITS_PER_VREG
;
10652 if (BYTES_BIG_ENDIAN
&& GET_MODE_SIZE (ag_mode
) < UNITS_PER_VREG
)
10653 adjust
= UNITS_PER_VREG
- GET_MODE_SIZE (ag_mode
);
10655 else if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
10656 && size
< UNITS_PER_VREG
)
10658 adjust
= UNITS_PER_VREG
- size
;
10663 /* TYPE passed in general registers. */
10664 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
),
10665 unshare_expr (valist
), f_grtop
, NULL_TREE
);
10666 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
),
10667 unshare_expr (valist
), f_groff
, NULL_TREE
);
10668 rsize
= ROUND_UP (size
, UNITS_PER_WORD
);
10669 nregs
= rsize
/ UNITS_PER_WORD
;
10674 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
10675 && size
< UNITS_PER_WORD
)
10677 adjust
= UNITS_PER_WORD
- size
;
10681 /* Get a local temporary for the field value. */
10682 off
= get_initialized_tmp_var (f_off
, pre_p
, NULL
);
10684 /* Emit code to branch if off >= 0. */
10685 t
= build2 (GE_EXPR
, boolean_type_node
, off
,
10686 build_int_cst (TREE_TYPE (off
), 0));
10687 cond1
= build3 (COND_EXPR
, ptr_type_node
, t
, NULL_TREE
, NULL_TREE
);
10691 /* Emit: offs = (offs + 15) & -16. */
10692 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
10693 build_int_cst (TREE_TYPE (off
), 15));
10694 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (off
), t
,
10695 build_int_cst (TREE_TYPE (off
), -16));
10696 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (off
), off
, t
);
10701 /* Update ap.__[g|v]r_offs */
10702 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
10703 build_int_cst (TREE_TYPE (off
), rsize
));
10704 t
= build2 (MODIFY_EXPR
, TREE_TYPE (f_off
), unshare_expr (f_off
), t
);
10708 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
10710 /* [cond2] if (ap.__[g|v]r_offs > 0) */
10711 u
= build2 (GT_EXPR
, boolean_type_node
, unshare_expr (f_off
),
10712 build_int_cst (TREE_TYPE (f_off
), 0));
10713 cond2
= build3 (COND_EXPR
, ptr_type_node
, u
, NULL_TREE
, NULL_TREE
);
10715 /* String up: make sure the assignment happens before the use. */
10716 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (cond2
), t
, cond2
);
10717 COND_EXPR_ELSE (cond1
) = t
;
10719 /* Prepare the trees handling the argument that is passed on the stack;
10720 the top level node will store in ON_STACK. */
10721 arg
= get_initialized_tmp_var (stack
, pre_p
, NULL
);
10724 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
10725 t
= fold_convert (intDI_type_node
, arg
);
10726 t
= build2 (PLUS_EXPR
, TREE_TYPE (t
), t
,
10727 build_int_cst (TREE_TYPE (t
), 15));
10728 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
10729 build_int_cst (TREE_TYPE (t
), -16));
10730 t
= fold_convert (TREE_TYPE (arg
), t
);
10731 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (arg
), arg
, t
);
10735 /* Advance ap.__stack */
10736 t
= fold_convert (intDI_type_node
, arg
);
10737 t
= build2 (PLUS_EXPR
, TREE_TYPE (t
), t
,
10738 build_int_cst (TREE_TYPE (t
), size
+ 7));
10739 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
10740 build_int_cst (TREE_TYPE (t
), -8));
10741 t
= fold_convert (TREE_TYPE (arg
), t
);
10742 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), unshare_expr (stack
), t
);
10743 /* String up roundup and advance. */
10745 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
10746 /* String up with arg */
10747 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), t
, arg
);
10748 /* Big-endianness related address adjustment. */
10749 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
10750 && size
< UNITS_PER_WORD
)
10752 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (arg
), arg
,
10753 size_int (UNITS_PER_WORD
- size
));
10754 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), on_stack
, t
);
10757 COND_EXPR_THEN (cond1
) = unshare_expr (on_stack
);
10758 COND_EXPR_THEN (cond2
) = unshare_expr (on_stack
);
10760 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
10763 t
= build2 (PREINCREMENT_EXPR
, TREE_TYPE (off
), off
,
10764 build_int_cst (TREE_TYPE (off
), adjust
));
10766 t
= fold_convert (sizetype
, t
);
10767 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (f_top
), f_top
, t
);
10771 /* type ha; // treat as "struct {ftype field[n];}"
10772 ... [computing offs]
10773 for (i = 0; i <nregs; ++i, offs += 16)
10774 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10777 tree tmp_ha
, field_t
, field_ptr_t
;
10779 /* Declare a local variable. */
10780 tmp_ha
= create_tmp_var_raw (type
, "ha");
10781 gimple_add_tmp_var (tmp_ha
);
10783 /* Establish the base type. */
10787 field_t
= float_type_node
;
10788 field_ptr_t
= float_ptr_type_node
;
10791 field_t
= double_type_node
;
10792 field_ptr_t
= double_ptr_type_node
;
10795 field_t
= long_double_type_node
;
10796 field_ptr_t
= long_double_ptr_type_node
;
10799 field_t
= aarch64_fp16_type_node
;
10800 field_ptr_t
= aarch64_fp16_ptr_type_node
;
10805 tree innertype
= make_signed_type (GET_MODE_PRECISION (SImode
));
10806 field_t
= build_vector_type_for_mode (innertype
, ag_mode
);
10807 field_ptr_t
= build_pointer_type (field_t
);
10814 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
10815 tmp_ha
= build1 (ADDR_EXPR
, field_ptr_t
, tmp_ha
);
10817 t
= fold_convert (field_ptr_t
, addr
);
10818 t
= build2 (MODIFY_EXPR
, field_t
,
10819 build1 (INDIRECT_REF
, field_t
, tmp_ha
),
10820 build1 (INDIRECT_REF
, field_t
, t
));
10822 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
10823 for (i
= 1; i
< nregs
; ++i
)
10825 addr
= fold_build_pointer_plus_hwi (addr
, UNITS_PER_VREG
);
10826 u
= fold_convert (field_ptr_t
, addr
);
10827 u
= build2 (MODIFY_EXPR
, field_t
,
10828 build2 (MEM_REF
, field_t
, tmp_ha
,
10829 build_int_cst (field_ptr_t
,
10831 int_size_in_bytes (field_t
)))),
10832 build1 (INDIRECT_REF
, field_t
, u
));
10833 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), t
, u
);
10836 u
= fold_convert (TREE_TYPE (f_top
), tmp_ha
);
10837 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (f_top
), t
, u
);
10840 COND_EXPR_ELSE (cond2
) = t
;
10841 addr
= fold_convert (build_pointer_type (type
), cond1
);
10842 addr
= build_va_arg_indirect_ref (addr
);
10845 addr
= build_va_arg_indirect_ref (addr
);
10850 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
10853 aarch64_setup_incoming_varargs (cumulative_args_t cum_v
, machine_mode mode
,
10854 tree type
, int *pretend_size ATTRIBUTE_UNUSED
,
10857 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
10858 CUMULATIVE_ARGS local_cum
;
10859 int gr_saved
= cfun
->va_list_gpr_size
;
10860 int vr_saved
= cfun
->va_list_fpr_size
;
10862 /* The caller has advanced CUM up to, but not beyond, the last named
10863 argument. Advance a local copy of CUM past the last "real" named
10864 argument, to find out how many registers are left over. */
10866 aarch64_function_arg_advance (pack_cumulative_args(&local_cum
), mode
, type
, true);
10868 /* Found out how many registers we need to save.
10869 Honor tree-stdvar analysis results. */
10870 if (cfun
->va_list_gpr_size
)
10871 gr_saved
= MIN (NUM_ARG_REGS
- local_cum
.aapcs_ncrn
,
10872 cfun
->va_list_gpr_size
/ UNITS_PER_WORD
);
10873 if (cfun
->va_list_fpr_size
)
10874 vr_saved
= MIN (NUM_FP_ARG_REGS
- local_cum
.aapcs_nvrn
,
10875 cfun
->va_list_fpr_size
/ UNITS_PER_VREG
);
10879 gcc_assert (local_cum
.aapcs_nvrn
== 0);
10889 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
10890 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
,
10891 - gr_saved
* UNITS_PER_WORD
);
10892 mem
= gen_frame_mem (BLKmode
, ptr
);
10893 set_mem_alias_set (mem
, get_varargs_alias_set ());
10895 move_block_from_reg (local_cum
.aapcs_ncrn
+ R0_REGNUM
,
10900 /* We can't use move_block_from_reg, because it will use
10901 the wrong mode, storing D regs only. */
10902 machine_mode mode
= TImode
;
10903 int off
, i
, vr_start
;
10905 /* Set OFF to the offset from virtual_incoming_args_rtx of
10906 the first vector register. The VR save area lies below
10907 the GR one, and is aligned to 16 bytes. */
10908 off
= -ROUND_UP (gr_saved
* UNITS_PER_WORD
,
10909 STACK_BOUNDARY
/ BITS_PER_UNIT
);
10910 off
-= vr_saved
* UNITS_PER_VREG
;
10912 vr_start
= V0_REGNUM
+ local_cum
.aapcs_nvrn
;
10913 for (i
= 0; i
< vr_saved
; ++i
)
10917 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
, off
);
10918 mem
= gen_frame_mem (mode
, ptr
);
10919 set_mem_alias_set (mem
, get_varargs_alias_set ());
10920 aarch64_emit_move (mem
, gen_rtx_REG (mode
, vr_start
+ i
));
10921 off
+= UNITS_PER_VREG
;
10926 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10927 any complication of having crtl->args.pretend_args_size changed. */
10928 cfun
->machine
->frame
.saved_varargs_size
10929 = (ROUND_UP (gr_saved
* UNITS_PER_WORD
,
10930 STACK_BOUNDARY
/ BITS_PER_UNIT
)
10931 + vr_saved
* UNITS_PER_VREG
);
10935 aarch64_conditional_register_usage (void)
10940 for (i
= V0_REGNUM
; i
<= V31_REGNUM
; i
++)
10943 call_used_regs
[i
] = 1;
10948 /* Walk down the type tree of TYPE counting consecutive base elements.
10949 If *MODEP is VOIDmode, then set it to the first valid floating point
10950 type. If a non-floating point type is found, or if a floating point
10951 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10952 otherwise return the count in the sub-tree. */
10954 aapcs_vfp_sub_candidate (const_tree type
, machine_mode
*modep
)
10957 HOST_WIDE_INT size
;
10959 switch (TREE_CODE (type
))
10962 mode
= TYPE_MODE (type
);
10963 if (mode
!= DFmode
&& mode
!= SFmode
10964 && mode
!= TFmode
&& mode
!= HFmode
)
10967 if (*modep
== VOIDmode
)
10970 if (*modep
== mode
)
10976 mode
= TYPE_MODE (TREE_TYPE (type
));
10977 if (mode
!= DFmode
&& mode
!= SFmode
10978 && mode
!= TFmode
&& mode
!= HFmode
)
10981 if (*modep
== VOIDmode
)
10984 if (*modep
== mode
)
10990 /* Use V2SImode and V4SImode as representatives of all 64-bit
10991 and 128-bit vector types. */
10992 size
= int_size_in_bytes (type
);
11005 if (*modep
== VOIDmode
)
11008 /* Vector modes are considered to be opaque: two vectors are
11009 equivalent for the purposes of being homogeneous aggregates
11010 if they are the same size. */
11011 if (*modep
== mode
)
11019 tree index
= TYPE_DOMAIN (type
);
11021 /* Can't handle incomplete types nor sizes that are not
11023 if (!COMPLETE_TYPE_P (type
)
11024 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
11027 count
= aapcs_vfp_sub_candidate (TREE_TYPE (type
), modep
);
11030 || !TYPE_MAX_VALUE (index
)
11031 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index
))
11032 || !TYPE_MIN_VALUE (index
)
11033 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index
))
11037 count
*= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index
))
11038 - tree_to_uhwi (TYPE_MIN_VALUE (index
)));
11040 /* There must be no padding. */
11041 if (wi::to_wide (TYPE_SIZE (type
))
11042 != count
* GET_MODE_BITSIZE (*modep
))
11054 /* Can't handle incomplete types nor sizes that are not
11056 if (!COMPLETE_TYPE_P (type
)
11057 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
11060 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
11062 if (TREE_CODE (field
) != FIELD_DECL
)
11065 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
11068 count
+= sub_count
;
11071 /* There must be no padding. */
11072 if (wi::to_wide (TYPE_SIZE (type
))
11073 != count
* GET_MODE_BITSIZE (*modep
))
11080 case QUAL_UNION_TYPE
:
11082 /* These aren't very interesting except in a degenerate case. */
11087 /* Can't handle incomplete types nor sizes that are not
11089 if (!COMPLETE_TYPE_P (type
)
11090 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
11093 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
11095 if (TREE_CODE (field
) != FIELD_DECL
)
11098 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
11101 count
= count
> sub_count
? count
: sub_count
;
11104 /* There must be no padding. */
11105 if (wi::to_wide (TYPE_SIZE (type
))
11106 != count
* GET_MODE_BITSIZE (*modep
))
11119 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
11120 type as described in AAPCS64 \S 4.1.2.
11122 See the comment above aarch64_composite_type_p for the notes on MODE. */
11125 aarch64_short_vector_p (const_tree type
,
11128 HOST_WIDE_INT size
= -1;
11130 if (type
&& TREE_CODE (type
) == VECTOR_TYPE
)
11131 size
= int_size_in_bytes (type
);
11132 else if (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
11133 || GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
)
11134 size
= GET_MODE_SIZE (mode
);
11136 return (size
== 8 || size
== 16);
11139 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
11140 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
11141 array types. The C99 floating-point complex types are also considered
11142 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
11143 types, which are GCC extensions and out of the scope of AAPCS64, are
11144 treated as composite types here as well.
11146 Note that MODE itself is not sufficient in determining whether a type
11147 is such a composite type or not. This is because
11148 stor-layout.c:compute_record_mode may have already changed the MODE
11149 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
11150 structure with only one field may have its MODE set to the mode of the
11151 field. Also an integer mode whose size matches the size of the
11152 RECORD_TYPE type may be used to substitute the original mode
11153 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
11154 solely relied on. */
11157 aarch64_composite_type_p (const_tree type
,
11160 if (aarch64_short_vector_p (type
, mode
))
11163 if (type
&& (AGGREGATE_TYPE_P (type
) || TREE_CODE (type
) == COMPLEX_TYPE
))
11166 if (mode
== BLKmode
11167 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
11168 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_INT
)
11174 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
11175 shall be passed or returned in simd/fp register(s) (providing these
11176 parameter passing registers are available).
11178 Upon successful return, *COUNT returns the number of needed registers,
11179 *BASE_MODE returns the mode of the individual register and when IS_HAF
11180 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
11181 floating-point aggregate or a homogeneous short-vector aggregate. */
11184 aarch64_vfp_is_call_or_return_candidate (machine_mode mode
,
11186 machine_mode
*base_mode
,
11190 machine_mode new_mode
= VOIDmode
;
11191 bool composite_p
= aarch64_composite_type_p (type
, mode
);
11193 if (is_ha
!= NULL
) *is_ha
= false;
11195 if ((!composite_p
&& GET_MODE_CLASS (mode
) == MODE_FLOAT
)
11196 || aarch64_short_vector_p (type
, mode
))
11201 else if (GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
)
11203 if (is_ha
!= NULL
) *is_ha
= true;
11205 new_mode
= GET_MODE_INNER (mode
);
11207 else if (type
&& composite_p
)
11209 int ag_count
= aapcs_vfp_sub_candidate (type
, &new_mode
);
11211 if (ag_count
> 0 && ag_count
<= HA_MAX_NUM_FLDS
)
11213 if (is_ha
!= NULL
) *is_ha
= true;
11222 *base_mode
= new_mode
;
11226 /* Implement TARGET_STRUCT_VALUE_RTX. */
11229 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED
,
11230 int incoming ATTRIBUTE_UNUSED
)
11232 return gen_rtx_REG (Pmode
, AARCH64_STRUCT_VALUE_REGNUM
);
11235 /* Implements target hook vector_mode_supported_p. */
11237 aarch64_vector_mode_supported_p (machine_mode mode
)
11240 && (mode
== V4SImode
|| mode
== V8HImode
11241 || mode
== V16QImode
|| mode
== V2DImode
11242 || mode
== V2SImode
|| mode
== V4HImode
11243 || mode
== V8QImode
|| mode
== V2SFmode
11244 || mode
== V4SFmode
|| mode
== V2DFmode
11245 || mode
== V4HFmode
|| mode
== V8HFmode
11246 || mode
== V1DFmode
))
11252 /* Return appropriate SIMD container
11253 for MODE within a vector of WIDTH bits. */
/* NOTE(review): AArch64 AdvSIMD registers are 64 or 128 bits wide, hence
   the assert below.  The switch that maps the scalar MODE to the matching
   Vn container mode is not visible in this extract — confirm against the
   full source before editing.  */
11254 static machine_mode
11255 aarch64_simd_container_mode (scalar_mode mode
, unsigned width
)
11257 gcc_assert (width
== 64 || width
== 128);
11300 /* Return 128-bit container as the preferred SIMD mode for MODE. */
11301 static machine_mode
11302 aarch64_preferred_simd_mode (scalar_mode mode
)
11304 return aarch64_simd_container_mode (mode
, 128);
11307 /* Return the bitmask of possible vector sizes for the vectorizer
11308 to iterate over. */
/* NOTE(review): the returned bitmask is not visible in this extract;
   presumably it encodes the 16-byte and 8-byte vector sizes — confirm
   against the full source.  */
11309 static unsigned int
11310 aarch64_autovectorize_vector_sizes (void)
11315 /* Implement TARGET_MANGLE_TYPE. */
11317 static const char *
11318 aarch64_mangle_type (const_tree type
)
11320 /* The AArch64 ABI documents say that "__va_list" has to be
11321 managled as if it is in the "std" namespace. */
11322 if (lang_hooks
.types_compatible_p (CONST_CAST_TREE (type
), va_list_type
))
11323 return "St9__va_list";
11325 /* Half-precision float. */
11326 if (TREE_CODE (type
) == REAL_TYPE
&& TYPE_PRECISION (type
) == 16)
11329 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
11331 if (TYPE_NAME (type
) != NULL
)
11332 return aarch64_mangle_builtin_type (type
);
11334 /* Use the default mangling. */
11338 /* Find the first rtx_insn before insn that will generate an assembly
11342 aarch64_prev_real_insn (rtx_insn
*insn
)
11349 insn
= prev_real_insn (insn
);
11351 while (insn
&& recog_memoized (insn
) < 0);
11357 is_madd_op (enum attr_type t1
)
11360 /* A number of these may be AArch32 only. */
11361 enum attr_type mlatypes
[] = {
11362 TYPE_MLA
, TYPE_MLAS
, TYPE_SMLAD
, TYPE_SMLADX
, TYPE_SMLAL
, TYPE_SMLALD
,
11363 TYPE_SMLALS
, TYPE_SMLALXY
, TYPE_SMLAWX
, TYPE_SMLAWY
, TYPE_SMLAXY
,
11364 TYPE_SMMLA
, TYPE_UMLAL
, TYPE_UMLALS
,TYPE_SMLSD
, TYPE_SMLSDX
, TYPE_SMLSLD
11367 for (i
= 0; i
< sizeof (mlatypes
) / sizeof (enum attr_type
); i
++)
11369 if (t1
== mlatypes
[i
])
11376 /* Check if there is a register dependency between a load and the insn
11377 for which we hold recog_data. */
11380 dep_between_memop_and_curr (rtx memop
)
11385 gcc_assert (GET_CODE (memop
) == SET
);
11387 if (!REG_P (SET_DEST (memop
)))
11390 load_reg
= SET_DEST (memop
);
11391 for (opno
= 1; opno
< recog_data
.n_operands
; opno
++)
11393 rtx operand
= recog_data
.operand
[opno
];
11394 if (REG_P (operand
)
11395 && reg_overlap_mentioned_p (load_reg
, operand
))
11403 /* When working around the Cortex-A53 erratum 835769,
11404 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
11405 instruction and has a preceding memory instruction such that a NOP
11406 should be inserted between them. */
11409 aarch64_madd_needs_nop (rtx_insn
* insn
)
11411 enum attr_type attr_type
;
11415 if (!TARGET_FIX_ERR_A53_835769
)
11418 if (!INSN_P (insn
) || recog_memoized (insn
) < 0)
11421 attr_type
= get_attr_type (insn
);
11422 if (!is_madd_op (attr_type
))
11425 prev
= aarch64_prev_real_insn (insn
);
11426 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
11427 Restore recog state to INSN to avoid state corruption. */
11428 extract_constrain_insn_cached (insn
);
11430 if (!prev
|| !contains_mem_rtx_p (PATTERN (prev
)))
11433 body
= single_set (prev
);
11435 /* If the previous insn is a memory op and there is no dependency between
11436 it and the DImode madd, emit a NOP between them. If body is NULL then we
11437 have a complex memory operation, probably a load/store pair.
11438 Be conservative for now and emit a NOP. */
11439 if (GET_MODE (recog_data
.operand
[0]) == DImode
11440 && (!body
|| !dep_between_memop_and_curr (body
)))
11448 /* Implement FINAL_PRESCAN_INSN. */
11451 aarch64_final_prescan_insn (rtx_insn
*insn
)
11453 if (aarch64_madd_needs_nop (insn
))
11454 fprintf (asm_out_file
, "\tnop // between mem op and mult-accumulate\n");
11458 /* Return the equivalent letter for size. */
11460 sizetochar (int size
)
11464 case 64: return 'd';
11465 case 32: return 's';
11466 case 16: return 'h';
11467 case 8 : return 'b';
11468 default: gcc_unreachable ();
11472 /* Return true iff x is a uniform vector of floating-point
11473 constants, and the constant can be represented in
11474 quarter-precision form. Note, as aarch64_float_const_representable
11475 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
11477 aarch64_vect_float_const_representable_p (rtx x
)
11480 return (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
11481 && const_vec_duplicate_p (x
, &elt
)
11482 && aarch64_float_const_representable_p (elt
));
11485 /* Return true for valid and false for invalid. */
/* NOTE(review): this is the AdvSIMD immediate matcher.  It splats the
   CONST_VECTOR OP into a byte array and runs a series of CHECK pattern
   tests (selected by WHICH: ORR / BIC / MOV classes) to find an encodable
   MOVI/MVNI/ORR/BIC immediate, recording the result in *INFO.  Several
   interior lines are missing from this extract — confirm control flow
   against the full source before editing.  */
11487 aarch64_simd_valid_immediate (rtx op
, machine_mode mode
, bool inverse
,
11488 struct simd_immediate_info
*info
,
11489 enum simd_immediate_check which
)
11491 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
11493 for (i = 0; i < idx; i += (STRIDE)) \
11498 immtype = (CLASS); \
11499 elsize = (ELSIZE); \
11500 eshift = (SHIFT); \
11505 unsigned int i
, elsize
= 0, idx
= 0, n_elts
= CONST_VECTOR_NUNITS (op
);
11506 unsigned int innersize
= GET_MODE_UNIT_SIZE (mode
);
11507 unsigned char bytes
[16];
11508 int immtype
= -1, matches
;
11509 unsigned int invmask
= inverse
? 0xff : 0;
/* Float vectors are only valid as zero or a representable quarter-precision
   duplicate; record the element info and return early.  */
11512 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
)
11514 if (! (aarch64_simd_imm_zero_p (op
, mode
)
11515 || aarch64_vect_float_const_representable_p (op
)))
11520 rtx elt
= CONST_VECTOR_ELT (op
, 0);
11521 scalar_float_mode elt_mode
11522 = as_a
<scalar_float_mode
> (GET_MODE (elt
));
11525 info
->element_width
= GET_MODE_BITSIZE (elt_mode
);
11533 /* Splat vector constant out into a byte vector. */
11534 for (i
= 0; i
< n_elts
; i
++)
11536 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
11537 it must be laid out in the vector register in reverse order. */
11538 rtx el
= CONST_VECTOR_ELT (op
, BYTES_BIG_ENDIAN
? (n_elts
- 1 - i
) : i
);
11539 unsigned HOST_WIDE_INT elpart
;
11541 gcc_assert (CONST_INT_P (el
));
11542 elpart
= INTVAL (el
);
11544 for (unsigned int byte
= 0; byte
< innersize
; byte
++)
11546 bytes
[idx
++] = (elpart
& 0xff) ^ invmask
;
11547 elpart
>>= BITS_PER_UNIT
;
11552 /* Sanity check. */
11553 gcc_assert (idx
== GET_MODE_SIZE (mode
));
/* Pattern tests: each CHECK sets immtype/elsize/eshift on the first
   match.  ORR-class immediates first.  */
11557 if (which
& AARCH64_CHECK_ORR
)
11559 CHECK (4, 32, 0, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0
11560 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == 0, 0, 0);
11562 CHECK (4, 32, 1, bytes
[i
] == 0 && bytes
[i
+ 1] == bytes
[1]
11563 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == 0, 8, 0);
11565 CHECK (4, 32, 2, bytes
[i
] == 0 && bytes
[i
+ 1] == 0
11566 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0, 16, 0);
11568 CHECK (4, 32, 3, bytes
[i
] == 0 && bytes
[i
+ 1] == 0
11569 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == bytes
[3], 24, 0);
11571 CHECK (2, 16, 4, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0, 0, 0);
11573 CHECK (2, 16, 5, bytes
[i
] == 0 && bytes
[i
+ 1] == bytes
[1], 8, 0);
/* BIC-class immediates (inverted forms).  */
11576 if (which
& AARCH64_CHECK_BIC
)
11578 CHECK (4, 32, 6, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0xff
11579 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == 0xff, 0, 1);
11581 CHECK (4, 32, 7, bytes
[i
] == 0xff && bytes
[i
+ 1] == bytes
[1]
11582 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == 0xff, 8, 1);
11584 CHECK (4, 32, 8, bytes
[i
] == 0xff && bytes
[i
+ 1] == 0xff
11585 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0xff, 16, 1);
11587 CHECK (4, 32, 9, bytes
[i
] == 0xff && bytes
[i
+ 1] == 0xff
11588 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == bytes
[3], 24, 1);
11590 CHECK (2, 16, 10, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0xff, 0, 1);
11592 CHECK (2, 16, 11, bytes
[i
] == 0xff && bytes
[i
+ 1] == bytes
[1], 8, 1);
11595 /* Shifting ones / 8-bit / 64-bit variants only checked
11596 for 'ALL' (MOVI/MVNI). */
11597 if (which
== AARCH64_CHECK_MOV
)
11599 CHECK (4, 32, 12, bytes
[i
] == 0xff && bytes
[i
+ 1] == bytes
[1]
11600 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == 0, 8, 0);
11602 CHECK (4, 32, 13, bytes
[i
] == 0 && bytes
[i
+ 1] == bytes
[1]
11603 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == 0xff, 8, 1);
11605 CHECK (4, 32, 14, bytes
[i
] == 0xff && bytes
[i
+ 1] == 0xff
11606 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0, 16, 0);
11608 CHECK (4, 32, 15, bytes
[i
] == 0 && bytes
[i
+ 1] == 0
11609 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0xff, 16, 1);
11611 CHECK (1, 8, 16, bytes
[i
] == bytes
[0], 0, 0);
11613 CHECK (1, 64, 17, (bytes
[i
] == 0 || bytes
[i
] == 0xff)
11614 && bytes
[i
] == bytes
[(i
+ 8) % idx
], 0, 0);
/* A pattern matched: fill in *INFO with the encoding details.  */
11624 info
->element_width
= elsize
;
11625 info
->mvn
= emvn
!= 0;
11626 info
->shift
= eshift
;
11628 unsigned HOST_WIDE_INT imm
= 0;
11630 if (immtype
>= 12 && immtype
<= 15)
11633 /* Un-invert bytes of recognized vector, if necessary. */
11635 for (i
= 0; i
< idx
; i
++)
11636 bytes
[i
] ^= invmask
;
11640 /* FIXME: Broken on 32-bit H_W_I hosts. */
11641 gcc_assert (sizeof (HOST_WIDE_INT
) == 8);
/* immtype 17: the 64-bit MOVI form, where each byte is all-zeros or
   all-ones; build the 8-bit 'abcdefgh' selector.  */
11643 for (i
= 0; i
< 8; i
++)
11644 imm
|= (unsigned HOST_WIDE_INT
) (bytes
[i
] ? 0xff : 0)
11645 << (i
* BITS_PER_UNIT
);
11648 info
->value
= GEN_INT (imm
);
11652 for (i
= 0; i
< elsize
/ BITS_PER_UNIT
; i
++)
11653 imm
|= (unsigned HOST_WIDE_INT
) bytes
[i
] << (i
* BITS_PER_UNIT
);
11655 /* Construct 'abcdefgh' because the assembler cannot handle
11656 generic constants. */
11659 imm
= (imm
>> info
->shift
) & 0xff;
11660 info
->value
= GEN_INT (imm
);
11668 /* Check of immediate shift constants are within range. */
11670 aarch64_simd_shift_imm_p (rtx x
, machine_mode mode
, bool left
)
11672 int bit_width
= GET_MODE_UNIT_SIZE (mode
) * BITS_PER_UNIT
;
11674 return aarch64_const_vec_all_same_in_range_p (x
, 0, bit_width
- 1);
11676 return aarch64_const_vec_all_same_in_range_p (x
, 1, bit_width
);
11679 /* Return true if X is a uniform vector where all elements
11680 are either the floating-point constant 0.0 or the
11681 integer constant 0. */
11683 aarch64_simd_imm_zero_p (rtx x
, machine_mode mode
)
11685 return x
== CONST0_RTX (mode
);
11689 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11690 operation of width WIDTH at bit position POS. */
11693 aarch64_mask_from_zextract_ops (rtx width
, rtx pos
)
11695 gcc_assert (CONST_INT_P (width
));
11696 gcc_assert (CONST_INT_P (pos
));
11698 unsigned HOST_WIDE_INT mask
11699 = ((unsigned HOST_WIDE_INT
) 1 << UINTVAL (width
)) - 1;
11700 return GEN_INT (mask
<< UINTVAL (pos
));
11704 aarch64_mov_operand_p (rtx x
, machine_mode mode
)
11706 if (GET_CODE (x
) == HIGH
11707 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
11710 if (CONST_INT_P (x
))
11713 if (GET_CODE (x
) == SYMBOL_REF
&& mode
== DImode
&& CONSTANT_ADDRESS_P (x
))
11716 return aarch64_classify_symbolic_expression (x
)
11717 == SYMBOL_TINY_ABSOLUTE
;
11720 /* Return a const_int vector of VAL. */
11722 aarch64_simd_gen_const_vector_dup (machine_mode mode
, HOST_WIDE_INT val
)
11724 int nunits
= GET_MODE_NUNITS (mode
);
11725 rtvec v
= rtvec_alloc (nunits
);
11728 rtx cache
= GEN_INT (val
);
11730 for (i
=0; i
< nunits
; i
++)
11731 RTVEC_ELT (v
, i
) = cache
;
11733 return gen_rtx_CONST_VECTOR (mode
, v
);
11736 /* Check OP is a legal scalar immediate for the MOVI instruction. */
11739 aarch64_simd_scalar_immediate_valid_for_move (rtx op
, scalar_int_mode mode
)
11741 machine_mode vmode
;
11743 vmode
= aarch64_preferred_simd_mode (mode
);
11744 rtx op_v
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (op
));
11745 return aarch64_simd_valid_immediate (op_v
, vmode
, false, NULL
);
11748 /* Construct and return a PARALLEL RTX vector with elements numbering the
11749 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11750 the vector - from the perspective of the architecture. This does not
11751 line up with GCC's perspective on lane numbers, so we end up with
11752 different masks depending on our target endian-ness. The diagram
11753 below may help. We must draw the distinction when building masks
11754 which select one half of the vector. An instruction selecting
11755 architectural low-lanes for a big-endian target, must be described using
11756 a mask selecting GCC high-lanes.
11758 Big-Endian Little-Endian
11760 GCC 0 1 2 3 3 2 1 0
11761 | x | x | x | x | | x | x | x | x |
11762 Architecture 3 2 1 0 3 2 1 0
11764 Low Mask: { 2, 3 } { 0, 1 }
11765 High Mask: { 0, 1 } { 2, 3 }
11769 aarch64_simd_vect_par_cnst_half (machine_mode mode
, bool high
)
11771 int nunits
= GET_MODE_NUNITS (mode
);
11772 rtvec v
= rtvec_alloc (nunits
/ 2);
11773 int high_base
= nunits
/ 2;
11779 if (BYTES_BIG_ENDIAN
)
11780 base
= high
? low_base
: high_base
;
11782 base
= high
? high_base
: low_base
;
11784 for (i
= 0; i
< nunits
/ 2; i
++)
11785 RTVEC_ELT (v
, i
) = GEN_INT (base
+ i
);
11787 t1
= gen_rtx_PARALLEL (mode
, v
);
11791 /* Check OP for validity as a PARALLEL RTX vector with elements
11792 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11793 from the perspective of the architecture. See the diagram above
11794 aarch64_simd_vect_par_cnst_half for more details. */
11797 aarch64_simd_check_vect_par_cnst_half (rtx op
, machine_mode mode
,
11800 rtx ideal
= aarch64_simd_vect_par_cnst_half (mode
, high
);
11801 HOST_WIDE_INT count_op
= XVECLEN (op
, 0);
11802 HOST_WIDE_INT count_ideal
= XVECLEN (ideal
, 0);
11805 if (!VECTOR_MODE_P (mode
))
11808 if (count_op
!= count_ideal
)
11811 for (i
= 0; i
< count_ideal
; i
++)
11813 rtx elt_op
= XVECEXP (op
, 0, i
);
11814 rtx elt_ideal
= XVECEXP (ideal
, 0, i
);
11816 if (!CONST_INT_P (elt_op
)
11817 || INTVAL (elt_ideal
) != INTVAL (elt_op
))
11823 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11824 HIGH (exclusive). */
11826 aarch64_simd_lane_bounds (rtx operand
, HOST_WIDE_INT low
, HOST_WIDE_INT high
,
11829 HOST_WIDE_INT lane
;
11830 gcc_assert (CONST_INT_P (operand
));
11831 lane
= INTVAL (operand
);
11833 if (lane
< low
|| lane
>= high
)
11836 error ("%Klane %wd out of range %wd - %wd", exp
, lane
, low
, high
- 1);
11838 error ("lane %wd out of range %wd - %wd", lane
, low
, high
- 1);
11842 /* Return TRUE if OP is a valid vector addressing mode. */
11844 aarch64_simd_mem_operand_p (rtx op
)
11846 return MEM_P (op
) && (GET_CODE (XEXP (op
, 0)) == POST_INC
11847 || REG_P (XEXP (op
, 0)));
11850 /* Emit a register copy from operand to operand, taking care not to
11851 early-clobber source registers in the process.
11853 COUNT is the number of components into which the copy needs to be
11856 aarch64_simd_emit_reg_reg_move (rtx
*operands
, machine_mode mode
,
11857 unsigned int count
)
11860 int rdest
= REGNO (operands
[0]);
11861 int rsrc
= REGNO (operands
[1]);
11863 if (!reg_overlap_mentioned_p (operands
[0], operands
[1])
11865 for (i
= 0; i
< count
; i
++)
11866 emit_move_insn (gen_rtx_REG (mode
, rdest
+ i
),
11867 gen_rtx_REG (mode
, rsrc
+ i
));
11869 for (i
= 0; i
< count
; i
++)
11870 emit_move_insn (gen_rtx_REG (mode
, rdest
+ count
- i
- 1),
11871 gen_rtx_REG (mode
, rsrc
+ count
- i
- 1));
11874 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11875 one of VSTRUCT modes: OI, CI, or XI. */
11877 aarch64_simd_attr_length_rglist (machine_mode mode
)
11879 return (GET_MODE_SIZE (mode
) / UNITS_PER_VREG
) * 4;
11882 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
11883 alignment of a vector to 128 bits. */
11884 static HOST_WIDE_INT
11885 aarch64_simd_vector_alignment (const_tree type
)
11887 HOST_WIDE_INT align
= tree_to_shwi (TYPE_SIZE (type
));
11888 return MIN (align
, 128);
11891 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11893 aarch64_simd_vector_alignment_reachable (const_tree type
, bool is_packed
)
11898 /* We guarantee alignment for vectors up to 128-bits. */
11899 if (tree_int_cst_compare (TYPE_SIZE (type
),
11900 bitsize_int (BIGGEST_ALIGNMENT
)) > 0)
11903 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11907 /* Return true if the vector misalignment factor is supported by the
11910 aarch64_builtin_support_vector_misalignment (machine_mode mode
,
11911 const_tree type
, int misalignment
,
11914 if (TARGET_SIMD
&& STRICT_ALIGNMENT
)
11916 /* Return if movmisalign pattern is not supported for this mode. */
11917 if (optab_handler (movmisalign_optab
, mode
) == CODE_FOR_nothing
)
11920 /* Misalignment factor is unknown at compile time. */
11921 if (misalignment
== -1)
11924 return default_builtin_support_vector_misalignment (mode
, type
, misalignment
,
11928 /* If VALS is a vector constant that can be loaded into a register
11929 using DUP, generate instructions to do so and return an RTX to
11930 assign to the register. Otherwise return NULL_RTX. */
11932 aarch64_simd_dup_constant (rtx vals
)
11934 machine_mode mode
= GET_MODE (vals
);
11935 machine_mode inner_mode
= GET_MODE_INNER (mode
);
11938 if (!const_vec_duplicate_p (vals
, &x
))
11941 /* We can load this constant by using DUP and a constant in a
11942 single ARM register. This will be cheaper than a vector
11944 x
= copy_to_mode_reg (inner_mode
, x
);
11945 return gen_rtx_VEC_DUPLICATE (mode
, x
);
11949 /* Generate code to load VALS, which is a PARALLEL containing only
11950 constants (for vec_init) or CONST_VECTOR, efficiently into a
11951 register. Returns an RTX to copy into the register, or NULL_RTX
11952 for a PARALLEL that can not be converted into a CONST_VECTOR. */
11954 aarch64_simd_make_constant (rtx vals
)
11956 machine_mode mode
= GET_MODE (vals
);
11958 rtx const_vec
= NULL_RTX
;
11959 int n_elts
= GET_MODE_NUNITS (mode
);
11963 if (GET_CODE (vals
) == CONST_VECTOR
)
11965 else if (GET_CODE (vals
) == PARALLEL
)
11967 /* A CONST_VECTOR must contain only CONST_INTs and
11968 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11969 Only store valid constants in a CONST_VECTOR. */
11970 for (i
= 0; i
< n_elts
; ++i
)
11972 rtx x
= XVECEXP (vals
, 0, i
);
11973 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
11976 if (n_const
== n_elts
)
11977 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0));
11980 gcc_unreachable ();
11982 if (const_vec
!= NULL_RTX
11983 && aarch64_simd_valid_immediate (const_vec
, mode
, false, NULL
))
11984 /* Load using MOVI/MVNI. */
11986 else if ((const_dup
= aarch64_simd_dup_constant (vals
)) != NULL_RTX
)
11987 /* Loaded using DUP. */
11989 else if (const_vec
!= NULL_RTX
)
11990 /* Load from constant pool. We can not take advantage of single-cycle
11991 LD1 because we need a PC-relative addressing mode. */
11994 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11995 We can not construct an initializer. */
11999 /* Expand a vector initialisation sequence, such that TARGET is
12000 initialised to contain VALS. */
12003 aarch64_expand_vector_init (rtx target
, rtx vals
)
12005 machine_mode mode
= GET_MODE (target
);
12006 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
12007 /* The number of vector elements. */
12008 int n_elts
= GET_MODE_NUNITS (mode
);
12009 /* The number of vector elements which are not constant. */
12011 rtx any_const
= NULL_RTX
;
12012 /* The first element of vals. */
12013 rtx v0
= XVECEXP (vals
, 0, 0);
12014 bool all_same
= true;
12016 /* Count the number of variable elements to initialise. */
12017 for (int i
= 0; i
< n_elts
; ++i
)
12019 rtx x
= XVECEXP (vals
, 0, i
);
12020 if (!(CONST_INT_P (x
) || CONST_DOUBLE_P (x
)))
12025 all_same
&= rtx_equal_p (x
, v0
);
12028 /* No variable elements, hand off to aarch64_simd_make_constant which knows
12029 how best to handle this. */
12032 rtx constant
= aarch64_simd_make_constant (vals
);
12033 if (constant
!= NULL_RTX
)
12035 emit_move_insn (target
, constant
);
12040 /* Splat a single non-constant element if we can. */
12043 rtx x
= copy_to_mode_reg (inner_mode
, v0
);
12044 aarch64_emit_move (target
, gen_rtx_VEC_DUPLICATE (mode
, x
));
12048 enum insn_code icode
= optab_handler (vec_set_optab
, mode
);
12049 gcc_assert (icode
!= CODE_FOR_nothing
);
12051 /* If there are only variable elements, try to optimize
12052 the insertion using dup for the most common element
12053 followed by insertions. */
12055 /* The algorithm will fill matches[*][0] with the earliest matching element,
12056 and matches[X][1] with the count of duplicate elements (if X is the
12057 earliest element which has duplicates). */
12059 if (n_var
== n_elts
&& n_elts
<= 16)
12061 int matches
[16][2] = {0};
12062 for (int i
= 0; i
< n_elts
; i
++)
12064 for (int j
= 0; j
<= i
; j
++)
12066 if (rtx_equal_p (XVECEXP (vals
, 0, i
), XVECEXP (vals
, 0, j
)))
12074 int maxelement
= 0;
12076 for (int i
= 0; i
< n_elts
; i
++)
12077 if (matches
[i
][1] > maxv
)
12080 maxv
= matches
[i
][1];
12083 /* Create a duplicate of the most common element. */
12084 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
12085 aarch64_emit_move (target
, gen_rtx_VEC_DUPLICATE (mode
, x
));
12087 /* Insert the rest. */
12088 for (int i
= 0; i
< n_elts
; i
++)
12090 rtx x
= XVECEXP (vals
, 0, i
);
12091 if (matches
[i
][0] == maxelement
)
12093 x
= copy_to_mode_reg (inner_mode
, x
);
12094 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
12099 /* Initialise a vector which is part-variable. We want to first try
12100 to build those lanes which are constant in the most efficient way we
12102 if (n_var
!= n_elts
)
12104 rtx copy
= copy_rtx (vals
);
12106 /* Load constant part of vector. We really don't care what goes into the
12107 parts we will overwrite, but we're more likely to be able to load the
12108 constant efficiently if it has fewer, larger, repeating parts
12109 (see aarch64_simd_valid_immediate). */
12110 for (int i
= 0; i
< n_elts
; i
++)
12112 rtx x
= XVECEXP (vals
, 0, i
);
12113 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
12115 rtx subst
= any_const
;
12116 for (int bit
= n_elts
/ 2; bit
> 0; bit
/= 2)
12118 /* Look in the copied vector, as more elements are const. */
12119 rtx test
= XVECEXP (copy
, 0, i
^ bit
);
12120 if (CONST_INT_P (test
) || CONST_DOUBLE_P (test
))
12126 XVECEXP (copy
, 0, i
) = subst
;
12128 aarch64_expand_vector_init (target
, copy
);
12131 /* Insert the variable lanes directly. */
12132 for (int i
= 0; i
< n_elts
; i
++)
12134 rtx x
= XVECEXP (vals
, 0, i
);
12135 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
12137 x
= copy_to_mode_reg (inner_mode
, x
);
12138 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
12142 static unsigned HOST_WIDE_INT
12143 aarch64_shift_truncation_mask (machine_mode mode
)
12146 (!SHIFT_COUNT_TRUNCATED
12147 || aarch64_vector_mode_supported_p (mode
)
12148 || aarch64_vect_struct_mode_p (mode
)) ? 0 : (GET_MODE_BITSIZE (mode
) - 1);
12151 /* Select a format to encode pointers in exception handling data. */
12153 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED
, int global
)
12156 switch (aarch64_cmodel
)
12158 case AARCH64_CMODEL_TINY
:
12159 case AARCH64_CMODEL_TINY_PIC
:
12160 case AARCH64_CMODEL_SMALL
:
12161 case AARCH64_CMODEL_SMALL_PIC
:
12162 case AARCH64_CMODEL_SMALL_SPIC
:
12163 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
12165 type
= DW_EH_PE_sdata4
;
12168 /* No assumptions here. 8-byte relocs required. */
12169 type
= DW_EH_PE_sdata8
;
12172 return (global
? DW_EH_PE_indirect
: 0) | DW_EH_PE_pcrel
| type
;
12175 /* The last .arch and .tune assembly strings that we printed. */
/* Cached so that aarch64_declare_function_name only emits a new .arch or
   .tune directive when the value actually changes between functions.  */
12176 static std::string aarch64_last_printed_arch_string
;
12177 static std::string aarch64_last_printed_tune_string
;
12179 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
12180 by the function fndecl. */
12183 aarch64_declare_function_name (FILE *stream
, const char* name
,
12186 tree target_parts
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
12188 struct cl_target_option
*targ_options
;
12190 targ_options
= TREE_TARGET_OPTION (target_parts
);
12192 targ_options
= TREE_TARGET_OPTION (target_option_current_node
);
12193 gcc_assert (targ_options
);
12195 const struct processor
*this_arch
12196 = aarch64_get_arch (targ_options
->x_explicit_arch
);
12198 unsigned long isa_flags
= targ_options
->x_aarch64_isa_flags
;
12199 std::string extension
12200 = aarch64_get_extension_string_for_isa_flags (isa_flags
,
12202 /* Only update the assembler .arch string if it is distinct from the last
12203 such string we printed. */
12204 std::string to_print
= this_arch
->name
+ extension
;
12205 if (to_print
!= aarch64_last_printed_arch_string
)
12207 asm_fprintf (asm_out_file
, "\t.arch %s\n", to_print
.c_str ());
12208 aarch64_last_printed_arch_string
= to_print
;
12211 /* Print the cpu name we're tuning for in the comments, might be
12212 useful to readers of the generated asm. Do it only when it changes
12213 from function to function and verbose assembly is requested. */
12214 const struct processor
*this_tune
12215 = aarch64_get_tune_cpu (targ_options
->x_explicit_tune_core
);
12217 if (flag_debug_asm
&& aarch64_last_printed_tune_string
!= this_tune
->name
)
12219 asm_fprintf (asm_out_file
, "\t" ASM_COMMENT_START
".tune %s\n",
12221 aarch64_last_printed_tune_string
= this_tune
->name
;
12224 /* Don't forget the type directive for ELF. */
12225 ASM_OUTPUT_TYPE_DIRECTIVE (stream
, name
, "function");
12226 ASM_OUTPUT_LABEL (stream
, name
);
12229 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
12232 aarch64_start_file (void)
12234 struct cl_target_option
*default_options
12235 = TREE_TARGET_OPTION (target_option_default_node
);
12237 const struct processor
*default_arch
12238 = aarch64_get_arch (default_options
->x_explicit_arch
);
12239 unsigned long default_isa_flags
= default_options
->x_aarch64_isa_flags
;
12240 std::string extension
12241 = aarch64_get_extension_string_for_isa_flags (default_isa_flags
,
12242 default_arch
->flags
);
12244 aarch64_last_printed_arch_string
= default_arch
->name
+ extension
;
12245 aarch64_last_printed_tune_string
= "";
12246 asm_fprintf (asm_out_file
, "\t.arch %s\n",
12247 aarch64_last_printed_arch_string
.c_str ());
12249 default_file_start ();
12252 /* Emit load exclusive. */
12255 aarch64_emit_load_exclusive (machine_mode mode
, rtx rval
,
12256 rtx mem
, rtx model_rtx
)
12258 rtx (*gen
) (rtx
, rtx
, rtx
);
12262 case E_QImode
: gen
= gen_aarch64_load_exclusiveqi
; break;
12263 case E_HImode
: gen
= gen_aarch64_load_exclusivehi
; break;
12264 case E_SImode
: gen
= gen_aarch64_load_exclusivesi
; break;
12265 case E_DImode
: gen
= gen_aarch64_load_exclusivedi
; break;
12267 gcc_unreachable ();
12270 emit_insn (gen (rval
, mem
, model_rtx
));
12273 /* Emit store exclusive. */
12276 aarch64_emit_store_exclusive (machine_mode mode
, rtx bval
,
12277 rtx rval
, rtx mem
, rtx model_rtx
)
12279 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
12283 case E_QImode
: gen
= gen_aarch64_store_exclusiveqi
; break;
12284 case E_HImode
: gen
= gen_aarch64_store_exclusivehi
; break;
12285 case E_SImode
: gen
= gen_aarch64_store_exclusivesi
; break;
12286 case E_DImode
: gen
= gen_aarch64_store_exclusivedi
; break;
12288 gcc_unreachable ();
12291 emit_insn (gen (bval
, rval
, mem
, model_rtx
));
12294 /* Mark the previous jump instruction as unlikely. */
12297 aarch64_emit_unlikely_jump (rtx insn
)
12299 rtx_insn
*jump
= emit_jump_insn (insn
);
12300 add_reg_br_prob_note (jump
, profile_probability::very_unlikely ());
12303 /* Expand a compare and swap pattern. */
12306 aarch64_expand_compare_and_swap (rtx operands
[])
12308 rtx bval
, rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
, x
;
12309 machine_mode mode
, cmp_mode
;
12310 typedef rtx (*gen_cas_fn
) (rtx
, rtx
, rtx
, rtx
, rtx
, rtx
, rtx
);
12313 const gen_cas_fn split_cas
[] =
12315 gen_aarch64_compare_and_swapqi
,
12316 gen_aarch64_compare_and_swaphi
,
12317 gen_aarch64_compare_and_swapsi
,
12318 gen_aarch64_compare_and_swapdi
12320 const gen_cas_fn atomic_cas
[] =
12322 gen_aarch64_compare_and_swapqi_lse
,
12323 gen_aarch64_compare_and_swaphi_lse
,
12324 gen_aarch64_compare_and_swapsi_lse
,
12325 gen_aarch64_compare_and_swapdi_lse
12328 bval
= operands
[0];
12329 rval
= operands
[1];
12331 oldval
= operands
[3];
12332 newval
= operands
[4];
12333 is_weak
= operands
[5];
12334 mod_s
= operands
[6];
12335 mod_f
= operands
[7];
12336 mode
= GET_MODE (mem
);
12339 /* Normally the succ memory model must be stronger than fail, but in the
12340 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
12341 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
12343 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f
)))
12344 && is_mm_release (memmodel_from_int (INTVAL (mod_s
))))
12345 mod_s
= GEN_INT (MEMMODEL_ACQ_REL
);
12351 /* For short modes, we're going to perform the comparison in SImode,
12352 so do the zero-extension now. */
12354 rval
= gen_reg_rtx (SImode
);
12355 oldval
= convert_modes (SImode
, mode
, oldval
, true);
12356 /* Fall through. */
12360 /* Force the value into a register if needed. */
12361 if (!aarch64_plus_operand (oldval
, mode
))
12362 oldval
= force_reg (cmp_mode
, oldval
);
12366 gcc_unreachable ();
12371 case E_QImode
: idx
= 0; break;
12372 case E_HImode
: idx
= 1; break;
12373 case E_SImode
: idx
= 2; break;
12374 case E_DImode
: idx
= 3; break;
12376 gcc_unreachable ();
12379 gen
= atomic_cas
[idx
];
12381 gen
= split_cas
[idx
];
12383 emit_insn (gen (rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
));
12385 if (mode
== QImode
|| mode
== HImode
)
12386 emit_move_insn (operands
[1], gen_lowpart (mode
, rval
));
12388 x
= gen_rtx_REG (CCmode
, CC_REGNUM
);
12389 x
= gen_rtx_EQ (SImode
, x
, const0_rtx
);
12390 emit_insn (gen_rtx_SET (bval
, x
));
12393 /* Test whether the target supports using a atomic load-operate instruction.
12394 CODE is the operation and AFTER is TRUE if the data in memory after the
12395 operation should be returned and FALSE if the data before the operation
12396 should be returned. Returns FALSE if the operation isn't supported by the
12400 aarch64_atomic_ldop_supported_p (enum rtx_code code
)
12419 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
12420 sequence implementing an atomic operation. */
12423 aarch64_emit_post_barrier (enum memmodel model
)
12425 const enum memmodel base_model
= memmodel_base (model
);
12427 if (is_mm_sync (model
)
12428 && (base_model
== MEMMODEL_ACQUIRE
12429 || base_model
== MEMMODEL_ACQ_REL
12430 || base_model
== MEMMODEL_SEQ_CST
))
12432 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST
)));
12436 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
12437 for the data in memory. EXPECTED is the value expected to be in memory.
12438 DESIRED is the value to store to memory. MEM is the memory location. MODEL
12439 is the memory ordering to use. */
12442 aarch64_gen_atomic_cas (rtx rval
, rtx mem
,
12443 rtx expected
, rtx desired
,
12446 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
12449 mode
= GET_MODE (mem
);
12453 case E_QImode
: gen
= gen_aarch64_atomic_casqi
; break;
12454 case E_HImode
: gen
= gen_aarch64_atomic_cashi
; break;
12455 case E_SImode
: gen
= gen_aarch64_atomic_cassi
; break;
12456 case E_DImode
: gen
= gen_aarch64_atomic_casdi
; break;
12458 gcc_unreachable ();
12461 /* Move the expected value into the CAS destination register. */
12462 emit_insn (gen_rtx_SET (rval
, expected
));
12464 /* Emit the CAS. */
12465 emit_insn (gen (rval
, mem
, desired
, model
));
12467 /* Compare the expected value with the value loaded by the CAS, to establish
12468 whether the swap was made. */
12469 aarch64_gen_compare_reg (EQ
, rval
, expected
);
12472 /* Split a compare and swap pattern. */
12475 aarch64_split_compare_and_swap (rtx operands
[])
12477 rtx rval
, mem
, oldval
, newval
, scratch
;
12480 rtx_code_label
*label1
, *label2
;
12482 enum memmodel model
;
12485 rval
= operands
[0];
12487 oldval
= operands
[2];
12488 newval
= operands
[3];
12489 is_weak
= (operands
[4] != const0_rtx
);
12490 model_rtx
= operands
[5];
12491 scratch
= operands
[7];
12492 mode
= GET_MODE (mem
);
12493 model
= memmodel_from_int (INTVAL (model_rtx
));
12495 /* When OLDVAL is zero and we want the strong version we can emit a tighter
12498 LD[A]XR rval, [mem]
12500 ST[L]XR scratch, newval, [mem]
12501 CBNZ scratch, .label1
12504 bool strong_zero_p
= !is_weak
&& oldval
== const0_rtx
;
12509 label1
= gen_label_rtx ();
12510 emit_label (label1
);
12512 label2
= gen_label_rtx ();
12514 /* The initial load can be relaxed for a __sync operation since a final
12515 barrier will be emitted to stop code hoisting. */
12516 if (is_mm_sync (model
))
12517 aarch64_emit_load_exclusive (mode
, rval
, mem
,
12518 GEN_INT (MEMMODEL_RELAXED
));
12520 aarch64_emit_load_exclusive (mode
, rval
, mem
, model_rtx
);
12524 x
= gen_rtx_NE (VOIDmode
, rval
, const0_rtx
);
12525 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
12526 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
12527 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
12531 cond
= aarch64_gen_compare_reg (NE
, rval
, oldval
);
12532 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
12533 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
12534 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
12535 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
12538 aarch64_emit_store_exclusive (mode
, scratch
, mem
, newval
, model_rtx
);
12542 x
= gen_rtx_NE (VOIDmode
, scratch
, const0_rtx
);
12543 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
12544 gen_rtx_LABEL_REF (Pmode
, label1
), pc_rtx
);
12545 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
12549 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
12550 x
= gen_rtx_COMPARE (CCmode
, scratch
, const0_rtx
);
12551 emit_insn (gen_rtx_SET (cond
, x
));
12554 emit_label (label2
);
12555 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
12556 to set the condition flags. If this is not used it will be removed by
12560 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
12561 x
= gen_rtx_COMPARE (CCmode
, rval
, const0_rtx
);
12562 emit_insn (gen_rtx_SET (cond
, x
));
12564 /* Emit any final barrier needed for a __sync operation. */
12565 if (is_mm_sync (model
))
12566 aarch64_emit_post_barrier (model
);
12569 /* Emit a BIC instruction. */
12572 aarch64_emit_bic (machine_mode mode
, rtx dst
, rtx s1
, rtx s2
, int shift
)
12574 rtx shift_rtx
= GEN_INT (shift
);
12575 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
12579 case E_SImode
: gen
= gen_and_one_cmpl_lshrsi3
; break;
12580 case E_DImode
: gen
= gen_and_one_cmpl_lshrdi3
; break;
12582 gcc_unreachable ();
12585 emit_insn (gen (dst
, s2
, shift_rtx
, s1
));
12588 /* Emit an atomic swap. */
12591 aarch64_emit_atomic_swap (machine_mode mode
, rtx dst
, rtx value
,
12592 rtx mem
, rtx model
)
12594 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
12598 case E_QImode
: gen
= gen_aarch64_atomic_swpqi
; break;
12599 case E_HImode
: gen
= gen_aarch64_atomic_swphi
; break;
12600 case E_SImode
: gen
= gen_aarch64_atomic_swpsi
; break;
12601 case E_DImode
: gen
= gen_aarch64_atomic_swpdi
; break;
12603 gcc_unreachable ();
12606 emit_insn (gen (dst
, mem
, value
, model
));
12609 /* Operations supported by aarch64_emit_atomic_load_op. */
12611 enum aarch64_atomic_load_op_code
12613 AARCH64_LDOP_PLUS
, /* A + B */
12614 AARCH64_LDOP_XOR
, /* A ^ B */
12615 AARCH64_LDOP_OR
, /* A | B */
12616 AARCH64_LDOP_BIC
/* A & ~B */
12619 /* Emit an atomic load-operate. */
12622 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code
,
12623 machine_mode mode
, rtx dst
, rtx src
,
12624 rtx mem
, rtx model
)
12626 typedef rtx (*aarch64_atomic_load_op_fn
) (rtx
, rtx
, rtx
, rtx
);
12627 const aarch64_atomic_load_op_fn plus
[] =
12629 gen_aarch64_atomic_loadaddqi
,
12630 gen_aarch64_atomic_loadaddhi
,
12631 gen_aarch64_atomic_loadaddsi
,
12632 gen_aarch64_atomic_loadadddi
12634 const aarch64_atomic_load_op_fn eor
[] =
12636 gen_aarch64_atomic_loadeorqi
,
12637 gen_aarch64_atomic_loadeorhi
,
12638 gen_aarch64_atomic_loadeorsi
,
12639 gen_aarch64_atomic_loadeordi
12641 const aarch64_atomic_load_op_fn ior
[] =
12643 gen_aarch64_atomic_loadsetqi
,
12644 gen_aarch64_atomic_loadsethi
,
12645 gen_aarch64_atomic_loadsetsi
,
12646 gen_aarch64_atomic_loadsetdi
12648 const aarch64_atomic_load_op_fn bic
[] =
12650 gen_aarch64_atomic_loadclrqi
,
12651 gen_aarch64_atomic_loadclrhi
,
12652 gen_aarch64_atomic_loadclrsi
,
12653 gen_aarch64_atomic_loadclrdi
12655 aarch64_atomic_load_op_fn gen
;
12660 case E_QImode
: idx
= 0; break;
12661 case E_HImode
: idx
= 1; break;
12662 case E_SImode
: idx
= 2; break;
12663 case E_DImode
: idx
= 3; break;
12665 gcc_unreachable ();
12670 case AARCH64_LDOP_PLUS
: gen
= plus
[idx
]; break;
12671 case AARCH64_LDOP_XOR
: gen
= eor
[idx
]; break;
12672 case AARCH64_LDOP_OR
: gen
= ior
[idx
]; break;
12673 case AARCH64_LDOP_BIC
: gen
= bic
[idx
]; break;
12675 gcc_unreachable ();
12678 emit_insn (gen (dst
, mem
, src
, model
));
12681 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
12682 location to store the data read from memory. OUT_RESULT is the location to
12683 store the result of the operation. MEM is the memory location to read and
12684 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
12685 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
12689 aarch64_gen_atomic_ldop (enum rtx_code code
, rtx out_data
, rtx out_result
,
12690 rtx mem
, rtx value
, rtx model_rtx
)
12692 machine_mode mode
= GET_MODE (mem
);
12693 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
12694 const bool short_mode
= (mode
< SImode
);
12695 aarch64_atomic_load_op_code ldop_code
;
12700 out_data
= gen_lowpart (mode
, out_data
);
12703 out_result
= gen_lowpart (mode
, out_result
);
12705 /* Make sure the value is in a register, putting it into a destination
12706 register if it needs to be manipulated. */
12707 if (!register_operand (value
, mode
)
12708 || code
== AND
|| code
== MINUS
)
12710 src
= out_result
? out_result
: out_data
;
12711 emit_move_insn (src
, gen_lowpart (mode
, value
));
12715 gcc_assert (register_operand (src
, mode
));
12717 /* Preprocess the data for the operation as necessary. If the operation is
12718 a SET then emit a swap instruction and finish. */
12722 aarch64_emit_atomic_swap (mode
, out_data
, src
, mem
, model_rtx
);
12726 /* Negate the value and treat it as a PLUS. */
12730 /* Resize the value if necessary. */
12732 src
= gen_lowpart (wmode
, src
);
12734 neg_src
= gen_rtx_NEG (wmode
, src
);
12735 emit_insn (gen_rtx_SET (src
, neg_src
));
12738 src
= gen_lowpart (mode
, src
);
12740 /* Fall-through. */
12742 ldop_code
= AARCH64_LDOP_PLUS
;
12746 ldop_code
= AARCH64_LDOP_OR
;
12750 ldop_code
= AARCH64_LDOP_XOR
;
12757 /* Resize the value if necessary. */
12759 src
= gen_lowpart (wmode
, src
);
12761 not_src
= gen_rtx_NOT (wmode
, src
);
12762 emit_insn (gen_rtx_SET (src
, not_src
));
12765 src
= gen_lowpart (mode
, src
);
12767 ldop_code
= AARCH64_LDOP_BIC
;
12771 /* The operation can't be done with atomic instructions. */
12772 gcc_unreachable ();
12775 aarch64_emit_atomic_load_op (ldop_code
, mode
, out_data
, src
, mem
, model_rtx
);
12777 /* If necessary, calculate the data in memory after the update by redoing the
12778 operation from values in registers. */
12784 src
= gen_lowpart (wmode
, src
);
12785 out_data
= gen_lowpart (wmode
, out_data
);
12786 out_result
= gen_lowpart (wmode
, out_result
);
12795 x
= gen_rtx_PLUS (wmode
, out_data
, src
);
12798 x
= gen_rtx_IOR (wmode
, out_data
, src
);
12801 x
= gen_rtx_XOR (wmode
, out_data
, src
);
12804 aarch64_emit_bic (wmode
, out_result
, out_data
, src
, 0);
12807 gcc_unreachable ();
12810 emit_set_insn (out_result
, x
);
12815 /* Split an atomic operation. */
12818 aarch64_split_atomic_op (enum rtx_code code
, rtx old_out
, rtx new_out
, rtx mem
,
12819 rtx value
, rtx model_rtx
, rtx cond
)
12821 machine_mode mode
= GET_MODE (mem
);
12822 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
12823 const enum memmodel model
= memmodel_from_int (INTVAL (model_rtx
));
12824 const bool is_sync
= is_mm_sync (model
);
12825 rtx_code_label
*label
;
12828 /* Split the atomic operation into a sequence. */
12829 label
= gen_label_rtx ();
12830 emit_label (label
);
12833 new_out
= gen_lowpart (wmode
, new_out
);
12835 old_out
= gen_lowpart (wmode
, old_out
);
12838 value
= simplify_gen_subreg (wmode
, value
, mode
, 0);
12840 /* The initial load can be relaxed for a __sync operation since a final
12841 barrier will be emitted to stop code hoisting. */
12843 aarch64_emit_load_exclusive (mode
, old_out
, mem
,
12844 GEN_INT (MEMMODEL_RELAXED
));
12846 aarch64_emit_load_exclusive (mode
, old_out
, mem
, model_rtx
);
12855 x
= gen_rtx_AND (wmode
, old_out
, value
);
12856 emit_insn (gen_rtx_SET (new_out
, x
));
12857 x
= gen_rtx_NOT (wmode
, new_out
);
12858 emit_insn (gen_rtx_SET (new_out
, x
));
12862 if (CONST_INT_P (value
))
12864 value
= GEN_INT (-INTVAL (value
));
12867 /* Fall through. */
12870 x
= gen_rtx_fmt_ee (code
, wmode
, old_out
, value
);
12871 emit_insn (gen_rtx_SET (new_out
, x
));
12875 aarch64_emit_store_exclusive (mode
, cond
, mem
,
12876 gen_lowpart (mode
, new_out
), model_rtx
);
12878 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
12879 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
12880 gen_rtx_LABEL_REF (Pmode
, label
), pc_rtx
);
12881 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
12883 /* Emit any final barrier needed for a __sync operation. */
12885 aarch64_emit_post_barrier (model
);
12889 aarch64_init_libfuncs (void)
12891 /* Half-precision float operations. The compiler handles all operations
12892 with NULL libfuncs by converting to SFmode. */
12895 set_conv_libfunc (trunc_optab
, HFmode
, SFmode
, "__gnu_f2h_ieee");
12896 set_conv_libfunc (sext_optab
, SFmode
, HFmode
, "__gnu_h2f_ieee");
12899 set_optab_libfunc (add_optab
, HFmode
, NULL
);
12900 set_optab_libfunc (sdiv_optab
, HFmode
, NULL
);
12901 set_optab_libfunc (smul_optab
, HFmode
, NULL
);
12902 set_optab_libfunc (neg_optab
, HFmode
, NULL
);
12903 set_optab_libfunc (sub_optab
, HFmode
, NULL
);
12906 set_optab_libfunc (eq_optab
, HFmode
, NULL
);
12907 set_optab_libfunc (ne_optab
, HFmode
, NULL
);
12908 set_optab_libfunc (lt_optab
, HFmode
, NULL
);
12909 set_optab_libfunc (le_optab
, HFmode
, NULL
);
12910 set_optab_libfunc (ge_optab
, HFmode
, NULL
);
12911 set_optab_libfunc (gt_optab
, HFmode
, NULL
);
12912 set_optab_libfunc (unord_optab
, HFmode
, NULL
);
12915 /* Target hook for c_mode_for_suffix. */
12916 static machine_mode
12917 aarch64_c_mode_for_suffix (char suffix
)
12925 /* We can only represent floating point constants which will fit in
12926 "quarter-precision" values. These values are characterised by
12927 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
12930 (-1)^s * (n/16) * 2^r
12933 's' is the sign bit.
12934 'n' is an integer in the range 16 <= n <= 31.
12935 'r' is an integer in the range -3 <= r <= 4. */
12937 /* Return true iff X can be represented by a quarter-precision
12938 floating point immediate operand X. Note, we cannot represent 0.0. */
12940 aarch64_float_const_representable_p (rtx x
)
12942 /* This represents our current view of how many bits
12943 make up the mantissa. */
12944 int point_pos
= 2 * HOST_BITS_PER_WIDE_INT
- 1;
12946 unsigned HOST_WIDE_INT mantissa
, mask
;
12947 REAL_VALUE_TYPE r
, m
;
12950 if (!CONST_DOUBLE_P (x
))
12953 /* We don't support HFmode constants yet. */
12954 if (GET_MODE (x
) == VOIDmode
|| GET_MODE (x
) == HFmode
)
12957 r
= *CONST_DOUBLE_REAL_VALUE (x
);
12959 /* We cannot represent infinities, NaNs or +/-zero. We won't
12960 know if we have +zero until we analyse the mantissa, but we
12961 can reject the other invalid values. */
12962 if (REAL_VALUE_ISINF (r
) || REAL_VALUE_ISNAN (r
)
12963 || REAL_VALUE_MINUS_ZERO (r
))
12966 /* Extract exponent. */
12967 r
= real_value_abs (&r
);
12968 exponent
= REAL_EXP (&r
);
12970 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12971 highest (sign) bit, with a fixed binary point at bit point_pos.
12972 m1 holds the low part of the mantissa, m2 the high part.
12973 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12974 bits for the mantissa, this can fail (low bits will be lost). */
12975 real_ldexp (&m
, &r
, point_pos
- exponent
);
12976 wide_int w
= real_to_integer (&m
, &fail
, HOST_BITS_PER_WIDE_INT
* 2);
12978 /* If the low part of the mantissa has bits set we cannot represent
12980 if (w
.ulow () != 0)
12982 /* We have rejected the lower HOST_WIDE_INT, so update our
12983 understanding of how many bits lie in the mantissa and
12984 look only at the high HOST_WIDE_INT. */
12985 mantissa
= w
.elt (1);
12986 point_pos
-= HOST_BITS_PER_WIDE_INT
;
12988 /* We can only represent values with a mantissa of the form 1.xxxx. */
12989 mask
= ((unsigned HOST_WIDE_INT
)1 << (point_pos
- 5)) - 1;
12990 if ((mantissa
& mask
) != 0)
12993 /* Having filtered unrepresentable values, we may now remove all
12994 but the highest 5 bits. */
12995 mantissa
>>= point_pos
- 5;
12997 /* We cannot represent the value 0.0, so reject it. This is handled
13002 /* Then, as bit 4 is always set, we can mask it off, leaving
13003 the mantissa in the range [0, 15]. */
13004 mantissa
&= ~(1 << 4);
13005 gcc_assert (mantissa
<= 15);
13007 /* GCC internally does not use IEEE754-like encoding (where normalized
13008 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
13009 Our mantissa values are shifted 4 places to the left relative to
13010 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
13011 by 5 places to correct for GCC's representation. */
13012 exponent
= 5 - exponent
;
13014 return (exponent
>= 0 && exponent
<= 7);
13017 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
13018 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
13019 output MOVI/MVNI, ORR or BIC immediate. */
13021 aarch64_output_simd_mov_immediate (rtx const_vector
,
13024 enum simd_immediate_check which
)
13027 static char templ
[40];
13028 const char *mnemonic
;
13029 const char *shift_op
;
13030 unsigned int lane_count
= 0;
13033 struct simd_immediate_info info
= { NULL_RTX
, 0, 0, false, false };
13035 /* This will return true to show const_vector is legal for use as either
13036 a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
13037 It will also update INFO to show how the immediate should be generated.
13038 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
13039 is_valid
= aarch64_simd_valid_immediate (const_vector
, mode
, false,
13041 gcc_assert (is_valid
);
13043 element_char
= sizetochar (info
.element_width
);
13044 lane_count
= width
/ info
.element_width
;
13046 mode
= GET_MODE_INNER (mode
);
13047 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
13049 gcc_assert (info
.shift
== 0 && ! info
.mvn
);
13050 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
13051 move immediate path. */
13052 if (aarch64_float_const_zero_rtx_p (info
.value
))
13053 info
.value
= GEN_INT (0);
13056 const unsigned int buf_size
= 20;
13057 char float_buf
[buf_size
] = {'\0'};
13058 real_to_decimal_for_mode (float_buf
,
13059 CONST_DOUBLE_REAL_VALUE (info
.value
),
13060 buf_size
, buf_size
, 1, mode
);
13062 if (lane_count
== 1)
13063 snprintf (templ
, sizeof (templ
), "fmov\t%%d0, %s", float_buf
);
13065 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%d%c, %s",
13066 lane_count
, element_char
, float_buf
);
13071 gcc_assert (CONST_INT_P (info
.value
));
13073 if (which
== AARCH64_CHECK_MOV
)
13075 mnemonic
= info
.mvn
? "mvni" : "movi";
13076 shift_op
= info
.msl
? "msl" : "lsl";
13077 if (lane_count
== 1)
13078 snprintf (templ
, sizeof (templ
), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX
,
13079 mnemonic
, UINTVAL (info
.value
));
13080 else if (info
.shift
)
13081 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
13082 HOST_WIDE_INT_PRINT_HEX
", %s %d", mnemonic
, lane_count
,
13083 element_char
, UINTVAL (info
.value
), shift_op
, info
.shift
);
13085 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
13086 HOST_WIDE_INT_PRINT_HEX
, mnemonic
, lane_count
,
13087 element_char
, UINTVAL (info
.value
));
13091 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
13092 mnemonic
= info
.mvn
? "bic" : "orr";
13094 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
13095 HOST_WIDE_INT_PRINT_DEC
", %s #%d", mnemonic
, lane_count
,
13096 element_char
, UINTVAL (info
.value
), "lsl", info
.shift
);
13098 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
13099 HOST_WIDE_INT_PRINT_DEC
, mnemonic
, lane_count
,
13100 element_char
, UINTVAL (info
.value
));
13106 aarch64_output_scalar_simd_mov_immediate (rtx immediate
, scalar_int_mode mode
)
13109 /* If a floating point number was passed and we desire to use it in an
13110 integer mode do the conversion to integer. */
13111 if (CONST_DOUBLE_P (immediate
) && GET_MODE_CLASS (mode
) == MODE_INT
)
13113 unsigned HOST_WIDE_INT ival
;
13114 if (!aarch64_reinterpret_float_as_int (immediate
, &ival
))
13115 gcc_unreachable ();
13116 immediate
= gen_int_mode (ival
, mode
);
13119 machine_mode vmode
;
13120 /* use a 64 bit mode for everything except for DI/DF mode, where we use
13121 a 128 bit vector mode. */
13122 int width
= GET_MODE_BITSIZE (mode
) == 64 ? 128 : 64;
13124 vmode
= aarch64_simd_container_mode (mode
, width
);
13125 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (immediate
));
13126 return aarch64_output_simd_mov_immediate (v_op
, vmode
, width
);
13129 /* Split operands into moves from op[1] + op[2] into op[0]. */
13132 aarch64_split_combinev16qi (rtx operands
[3])
13134 unsigned int dest
= REGNO (operands
[0]);
13135 unsigned int src1
= REGNO (operands
[1]);
13136 unsigned int src2
= REGNO (operands
[2]);
13137 machine_mode halfmode
= GET_MODE (operands
[1]);
13138 unsigned int halfregs
= REG_NREGS (operands
[1]);
13139 rtx destlo
, desthi
;
13141 gcc_assert (halfmode
== V16QImode
);
13143 if (src1
== dest
&& src2
== dest
+ halfregs
)
13145 /* No-op move. Can't split to nothing; emit something. */
13146 emit_note (NOTE_INSN_DELETED
);
13150 /* Preserve register attributes for variable tracking. */
13151 destlo
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
, 0);
13152 desthi
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
+ halfregs
,
13153 GET_MODE_SIZE (halfmode
));
13155 /* Special case of reversed high/low parts. */
13156 if (reg_overlap_mentioned_p (operands
[2], destlo
)
13157 && reg_overlap_mentioned_p (operands
[1], desthi
))
13159 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
13160 emit_insn (gen_xorv16qi3 (operands
[2], operands
[1], operands
[2]));
13161 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
13163 else if (!reg_overlap_mentioned_p (operands
[2], destlo
))
13165 /* Try to avoid unnecessary moves if part of the result
13166 is in the right place already. */
13168 emit_move_insn (destlo
, operands
[1]);
13169 if (src2
!= dest
+ halfregs
)
13170 emit_move_insn (desthi
, operands
[2]);
13174 if (src2
!= dest
+ halfregs
)
13175 emit_move_insn (desthi
, operands
[2]);
13177 emit_move_insn (destlo
, operands
[1]);
13181 /* vec_perm support. */
13183 #define MAX_VECT_LEN 16
13185 struct expand_vec_perm_d
13187 rtx target
, op0
, op1
;
13188 auto_vec_perm_indices perm
;
13189 machine_mode vmode
;
13194 /* Generate a variable permutation. */
13197 aarch64_expand_vec_perm_1 (rtx target
, rtx op0
, rtx op1
, rtx sel
)
13199 machine_mode vmode
= GET_MODE (target
);
13200 bool one_vector_p
= rtx_equal_p (op0
, op1
);
13202 gcc_checking_assert (vmode
== V8QImode
|| vmode
== V16QImode
);
13203 gcc_checking_assert (GET_MODE (op0
) == vmode
);
13204 gcc_checking_assert (GET_MODE (op1
) == vmode
);
13205 gcc_checking_assert (GET_MODE (sel
) == vmode
);
13206 gcc_checking_assert (TARGET_SIMD
);
13210 if (vmode
== V8QImode
)
13212 /* Expand the argument to a V16QI mode by duplicating it. */
13213 rtx pair
= gen_reg_rtx (V16QImode
);
13214 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op0
));
13215 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
13219 emit_insn (gen_aarch64_tbl1v16qi (target
, op0
, sel
));
13226 if (vmode
== V8QImode
)
13228 pair
= gen_reg_rtx (V16QImode
);
13229 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op1
));
13230 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
13234 pair
= gen_reg_rtx (OImode
);
13235 emit_insn (gen_aarch64_combinev16qi (pair
, op0
, op1
));
13236 emit_insn (gen_aarch64_tbl2v16qi (target
, pair
, sel
));
13242 aarch64_expand_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
)
13244 machine_mode vmode
= GET_MODE (target
);
13245 unsigned int nelt
= GET_MODE_NUNITS (vmode
);
13246 bool one_vector_p
= rtx_equal_p (op0
, op1
);
13249 /* The TBL instruction does not use a modulo index, so we must take care
13250 of that ourselves. */
13251 mask
= aarch64_simd_gen_const_vector_dup (vmode
,
13252 one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
13253 sel
= expand_simple_binop (vmode
, AND
, sel
, mask
, NULL
, 0, OPTAB_LIB_WIDEN
);
13255 /* For big-endian, we also need to reverse the index within the vector
13256 (but not which vector). */
13257 if (BYTES_BIG_ENDIAN
)
13259 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
13261 mask
= aarch64_simd_gen_const_vector_dup (vmode
, nelt
- 1);
13262 sel
= expand_simple_binop (vmode
, XOR
, sel
, mask
,
13263 NULL
, 0, OPTAB_LIB_WIDEN
);
13265 aarch64_expand_vec_perm_1 (target
, op0
, op1
, sel
);
13268 /* Recognize patterns suitable for the TRN instructions. */
13270 aarch64_evpc_trn (struct expand_vec_perm_d
*d
)
13272 unsigned int i
, odd
, mask
, nelt
= d
->perm
.length ();
13273 rtx out
, in0
, in1
, x
;
13274 rtx (*gen
) (rtx
, rtx
, rtx
);
13275 machine_mode vmode
= d
->vmode
;
13277 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
13280 /* Note that these are little-endian tests.
13281 We correct for big-endian later. */
13282 if (d
->perm
[0] == 0)
13284 else if (d
->perm
[0] == 1)
13288 mask
= (d
->one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
13290 for (i
= 0; i
< nelt
; i
+= 2)
13292 if (d
->perm
[i
] != i
+ odd
)
13294 if (d
->perm
[i
+ 1] != ((i
+ nelt
+ odd
) & mask
))
13304 if (BYTES_BIG_ENDIAN
)
13306 x
= in0
, in0
= in1
, in1
= x
;
13315 case E_V16QImode
: gen
= gen_aarch64_trn2v16qi
; break;
13316 case E_V8QImode
: gen
= gen_aarch64_trn2v8qi
; break;
13317 case E_V8HImode
: gen
= gen_aarch64_trn2v8hi
; break;
13318 case E_V4HImode
: gen
= gen_aarch64_trn2v4hi
; break;
13319 case E_V4SImode
: gen
= gen_aarch64_trn2v4si
; break;
13320 case E_V2SImode
: gen
= gen_aarch64_trn2v2si
; break;
13321 case E_V2DImode
: gen
= gen_aarch64_trn2v2di
; break;
13322 case E_V4HFmode
: gen
= gen_aarch64_trn2v4hf
; break;
13323 case E_V8HFmode
: gen
= gen_aarch64_trn2v8hf
; break;
13324 case E_V4SFmode
: gen
= gen_aarch64_trn2v4sf
; break;
13325 case E_V2SFmode
: gen
= gen_aarch64_trn2v2sf
; break;
13326 case E_V2DFmode
: gen
= gen_aarch64_trn2v2df
; break;
13335 case E_V16QImode
: gen
= gen_aarch64_trn1v16qi
; break;
13336 case E_V8QImode
: gen
= gen_aarch64_trn1v8qi
; break;
13337 case E_V8HImode
: gen
= gen_aarch64_trn1v8hi
; break;
13338 case E_V4HImode
: gen
= gen_aarch64_trn1v4hi
; break;
13339 case E_V4SImode
: gen
= gen_aarch64_trn1v4si
; break;
13340 case E_V2SImode
: gen
= gen_aarch64_trn1v2si
; break;
13341 case E_V2DImode
: gen
= gen_aarch64_trn1v2di
; break;
13342 case E_V4HFmode
: gen
= gen_aarch64_trn1v4hf
; break;
13343 case E_V8HFmode
: gen
= gen_aarch64_trn1v8hf
; break;
13344 case E_V4SFmode
: gen
= gen_aarch64_trn1v4sf
; break;
13345 case E_V2SFmode
: gen
= gen_aarch64_trn1v2sf
; break;
13346 case E_V2DFmode
: gen
= gen_aarch64_trn1v2df
; break;
13352 emit_insn (gen (out
, in0
, in1
));
13356 /* Recognize patterns suitable for the UZP instructions. */
13358 aarch64_evpc_uzp (struct expand_vec_perm_d
*d
)
13360 unsigned int i
, odd
, mask
, nelt
= d
->perm
.length ();
13361 rtx out
, in0
, in1
, x
;
13362 rtx (*gen
) (rtx
, rtx
, rtx
);
13363 machine_mode vmode
= d
->vmode
;
13365 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
13368 /* Note that these are little-endian tests.
13369 We correct for big-endian later. */
13370 if (d
->perm
[0] == 0)
13372 else if (d
->perm
[0] == 1)
13376 mask
= (d
->one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
13378 for (i
= 0; i
< nelt
; i
++)
13380 unsigned elt
= (i
* 2 + odd
) & mask
;
13381 if (d
->perm
[i
] != elt
)
13391 if (BYTES_BIG_ENDIAN
)
13393 x
= in0
, in0
= in1
, in1
= x
;
13402 case E_V16QImode
: gen
= gen_aarch64_uzp2v16qi
; break;
13403 case E_V8QImode
: gen
= gen_aarch64_uzp2v8qi
; break;
13404 case E_V8HImode
: gen
= gen_aarch64_uzp2v8hi
; break;
13405 case E_V4HImode
: gen
= gen_aarch64_uzp2v4hi
; break;
13406 case E_V4SImode
: gen
= gen_aarch64_uzp2v4si
; break;
13407 case E_V2SImode
: gen
= gen_aarch64_uzp2v2si
; break;
13408 case E_V2DImode
: gen
= gen_aarch64_uzp2v2di
; break;
13409 case E_V4HFmode
: gen
= gen_aarch64_uzp2v4hf
; break;
13410 case E_V8HFmode
: gen
= gen_aarch64_uzp2v8hf
; break;
13411 case E_V4SFmode
: gen
= gen_aarch64_uzp2v4sf
; break;
13412 case E_V2SFmode
: gen
= gen_aarch64_uzp2v2sf
; break;
13413 case E_V2DFmode
: gen
= gen_aarch64_uzp2v2df
; break;
13422 case E_V16QImode
: gen
= gen_aarch64_uzp1v16qi
; break;
13423 case E_V8QImode
: gen
= gen_aarch64_uzp1v8qi
; break;
13424 case E_V8HImode
: gen
= gen_aarch64_uzp1v8hi
; break;
13425 case E_V4HImode
: gen
= gen_aarch64_uzp1v4hi
; break;
13426 case E_V4SImode
: gen
= gen_aarch64_uzp1v4si
; break;
13427 case E_V2SImode
: gen
= gen_aarch64_uzp1v2si
; break;
13428 case E_V2DImode
: gen
= gen_aarch64_uzp1v2di
; break;
13429 case E_V4HFmode
: gen
= gen_aarch64_uzp1v4hf
; break;
13430 case E_V8HFmode
: gen
= gen_aarch64_uzp1v8hf
; break;
13431 case E_V4SFmode
: gen
= gen_aarch64_uzp1v4sf
; break;
13432 case E_V2SFmode
: gen
= gen_aarch64_uzp1v2sf
; break;
13433 case E_V2DFmode
: gen
= gen_aarch64_uzp1v2df
; break;
13439 emit_insn (gen (out
, in0
, in1
));
13443 /* Recognize patterns suitable for the ZIP instructions. */
13445 aarch64_evpc_zip (struct expand_vec_perm_d
*d
)
13447 unsigned int i
, high
, mask
, nelt
= d
->perm
.length ();
13448 rtx out
, in0
, in1
, x
;
13449 rtx (*gen
) (rtx
, rtx
, rtx
);
13450 machine_mode vmode
= d
->vmode
;
13452 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
13455 /* Note that these are little-endian tests.
13456 We correct for big-endian later. */
13458 if (d
->perm
[0] == high
)
13461 else if (d
->perm
[0] == 0)
13465 mask
= (d
->one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
13467 for (i
= 0; i
< nelt
/ 2; i
++)
13469 unsigned elt
= (i
+ high
) & mask
;
13470 if (d
->perm
[i
* 2] != elt
)
13472 elt
= (elt
+ nelt
) & mask
;
13473 if (d
->perm
[i
* 2 + 1] != elt
)
13483 if (BYTES_BIG_ENDIAN
)
13485 x
= in0
, in0
= in1
, in1
= x
;
13494 case E_V16QImode
: gen
= gen_aarch64_zip2v16qi
; break;
13495 case E_V8QImode
: gen
= gen_aarch64_zip2v8qi
; break;
13496 case E_V8HImode
: gen
= gen_aarch64_zip2v8hi
; break;
13497 case E_V4HImode
: gen
= gen_aarch64_zip2v4hi
; break;
13498 case E_V4SImode
: gen
= gen_aarch64_zip2v4si
; break;
13499 case E_V2SImode
: gen
= gen_aarch64_zip2v2si
; break;
13500 case E_V2DImode
: gen
= gen_aarch64_zip2v2di
; break;
13501 case E_V4HFmode
: gen
= gen_aarch64_zip2v4hf
; break;
13502 case E_V8HFmode
: gen
= gen_aarch64_zip2v8hf
; break;
13503 case E_V4SFmode
: gen
= gen_aarch64_zip2v4sf
; break;
13504 case E_V2SFmode
: gen
= gen_aarch64_zip2v2sf
; break;
13505 case E_V2DFmode
: gen
= gen_aarch64_zip2v2df
; break;
13514 case E_V16QImode
: gen
= gen_aarch64_zip1v16qi
; break;
13515 case E_V8QImode
: gen
= gen_aarch64_zip1v8qi
; break;
13516 case E_V8HImode
: gen
= gen_aarch64_zip1v8hi
; break;
13517 case E_V4HImode
: gen
= gen_aarch64_zip1v4hi
; break;
13518 case E_V4SImode
: gen
= gen_aarch64_zip1v4si
; break;
13519 case E_V2SImode
: gen
= gen_aarch64_zip1v2si
; break;
13520 case E_V2DImode
: gen
= gen_aarch64_zip1v2di
; break;
13521 case E_V4HFmode
: gen
= gen_aarch64_zip1v4hf
; break;
13522 case E_V8HFmode
: gen
= gen_aarch64_zip1v8hf
; break;
13523 case E_V4SFmode
: gen
= gen_aarch64_zip1v4sf
; break;
13524 case E_V2SFmode
: gen
= gen_aarch64_zip1v2sf
; break;
13525 case E_V2DFmode
: gen
= gen_aarch64_zip1v2df
; break;
13531 emit_insn (gen (out
, in0
, in1
));
13535 /* Recognize patterns for the EXT insn. */
13538 aarch64_evpc_ext (struct expand_vec_perm_d
*d
)
13540 unsigned int i
, nelt
= d
->perm
.length ();
13541 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
13544 unsigned int location
= d
->perm
[0]; /* Always < nelt. */
13546 /* Check if the extracted indices are increasing by one. */
13547 for (i
= 1; i
< nelt
; i
++)
13549 unsigned int required
= location
+ i
;
13550 if (d
->one_vector_p
)
13552 /* We'll pass the same vector in twice, so allow indices to wrap. */
13553 required
&= (nelt
- 1);
13555 if (d
->perm
[i
] != required
)
13561 case E_V16QImode
: gen
= gen_aarch64_extv16qi
; break;
13562 case E_V8QImode
: gen
= gen_aarch64_extv8qi
; break;
13563 case E_V4HImode
: gen
= gen_aarch64_extv4hi
; break;
13564 case E_V8HImode
: gen
= gen_aarch64_extv8hi
; break;
13565 case E_V2SImode
: gen
= gen_aarch64_extv2si
; break;
13566 case E_V4SImode
: gen
= gen_aarch64_extv4si
; break;
13567 case E_V4HFmode
: gen
= gen_aarch64_extv4hf
; break;
13568 case E_V8HFmode
: gen
= gen_aarch64_extv8hf
; break;
13569 case E_V2SFmode
: gen
= gen_aarch64_extv2sf
; break;
13570 case E_V4SFmode
: gen
= gen_aarch64_extv4sf
; break;
13571 case E_V2DImode
: gen
= gen_aarch64_extv2di
; break;
13572 case E_V2DFmode
: gen
= gen_aarch64_extv2df
; break;
13581 /* The case where (location == 0) is a no-op for both big- and little-endian,
13582 and is removed by the mid-end at optimization levels -O1 and higher. */
13584 if (BYTES_BIG_ENDIAN
&& (location
!= 0))
13586 /* After setup, we want the high elements of the first vector (stored
13587 at the LSB end of the register), and the low elements of the second
13588 vector (stored at the MSB end of the register). So swap. */
13589 std::swap (d
->op0
, d
->op1
);
13590 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
13591 location
= nelt
- location
;
13594 offset
= GEN_INT (location
);
13595 emit_insn (gen (d
->target
, d
->op0
, d
->op1
, offset
));
13599 /* Recognize patterns for the REV insns. */
13602 aarch64_evpc_rev (struct expand_vec_perm_d
*d
)
13604 unsigned int i
, j
, diff
, nelt
= d
->perm
.length ();
13605 rtx (*gen
) (rtx
, rtx
);
13607 if (!d
->one_vector_p
)
13616 case E_V16QImode
: gen
= gen_aarch64_rev64v16qi
; break;
13617 case E_V8QImode
: gen
= gen_aarch64_rev64v8qi
; break;
13625 case E_V16QImode
: gen
= gen_aarch64_rev32v16qi
; break;
13626 case E_V8QImode
: gen
= gen_aarch64_rev32v8qi
; break;
13627 case E_V8HImode
: gen
= gen_aarch64_rev64v8hi
; break;
13628 case E_V4HImode
: gen
= gen_aarch64_rev64v4hi
; break;
13636 case E_V16QImode
: gen
= gen_aarch64_rev16v16qi
; break;
13637 case E_V8QImode
: gen
= gen_aarch64_rev16v8qi
; break;
13638 case E_V8HImode
: gen
= gen_aarch64_rev32v8hi
; break;
13639 case E_V4HImode
: gen
= gen_aarch64_rev32v4hi
; break;
13640 case E_V4SImode
: gen
= gen_aarch64_rev64v4si
; break;
13641 case E_V2SImode
: gen
= gen_aarch64_rev64v2si
; break;
13642 case E_V4SFmode
: gen
= gen_aarch64_rev64v4sf
; break;
13643 case E_V2SFmode
: gen
= gen_aarch64_rev64v2sf
; break;
13644 case E_V8HFmode
: gen
= gen_aarch64_rev64v8hf
; break;
13645 case E_V4HFmode
: gen
= gen_aarch64_rev64v4hf
; break;
13654 for (i
= 0; i
< nelt
; i
+= diff
+ 1)
13655 for (j
= 0; j
<= diff
; j
+= 1)
13657 /* This is guaranteed to be true as the value of diff
13658 is 7, 3, 1 and we should have enough elements in the
13659 queue to generate this. Getting a vector mask with a
13660 value of diff other than these values implies that
13661 something is wrong by the time we get here. */
13662 gcc_assert (i
+ j
< nelt
);
13663 if (d
->perm
[i
+ j
] != i
+ diff
- j
)
13671 emit_insn (gen (d
->target
, d
->op0
));
13676 aarch64_evpc_dup (struct expand_vec_perm_d
*d
)
13678 rtx (*gen
) (rtx
, rtx
, rtx
);
13679 rtx out
= d
->target
;
13681 machine_mode vmode
= d
->vmode
;
13682 unsigned int i
, elt
, nelt
= d
->perm
.length ();
13686 for (i
= 1; i
< nelt
; i
++)
13688 if (elt
!= d
->perm
[i
])
13692 /* The generic preparation in aarch64_expand_vec_perm_const_1
13693 swaps the operand order and the permute indices if it finds
13694 d->perm[0] to be in the second operand. Thus, we can always
13695 use d->op0 and need not do any extra arithmetic to get the
13696 correct lane number. */
13698 lane
= GEN_INT (elt
); /* The pattern corrects for big-endian. */
13702 case E_V16QImode
: gen
= gen_aarch64_dup_lanev16qi
; break;
13703 case E_V8QImode
: gen
= gen_aarch64_dup_lanev8qi
; break;
13704 case E_V8HImode
: gen
= gen_aarch64_dup_lanev8hi
; break;
13705 case E_V4HImode
: gen
= gen_aarch64_dup_lanev4hi
; break;
13706 case E_V4SImode
: gen
= gen_aarch64_dup_lanev4si
; break;
13707 case E_V2SImode
: gen
= gen_aarch64_dup_lanev2si
; break;
13708 case E_V2DImode
: gen
= gen_aarch64_dup_lanev2di
; break;
13709 case E_V8HFmode
: gen
= gen_aarch64_dup_lanev8hf
; break;
13710 case E_V4HFmode
: gen
= gen_aarch64_dup_lanev4hf
; break;
13711 case E_V4SFmode
: gen
= gen_aarch64_dup_lanev4sf
; break;
13712 case E_V2SFmode
: gen
= gen_aarch64_dup_lanev2sf
; break;
13713 case E_V2DFmode
: gen
= gen_aarch64_dup_lanev2df
; break;
13718 emit_insn (gen (out
, in0
, lane
));
13723 aarch64_evpc_tbl (struct expand_vec_perm_d
*d
)
13725 rtx rperm
[MAX_VECT_LEN
], sel
;
13726 machine_mode vmode
= d
->vmode
;
13727 unsigned int i
, nelt
= d
->perm
.length ();
13732 /* Generic code will try constant permutation twice. Once with the
13733 original mode and again with the elements lowered to QImode.
13734 So wait and don't do the selector expansion ourselves. */
13735 if (vmode
!= V8QImode
&& vmode
!= V16QImode
)
13738 for (i
= 0; i
< nelt
; ++i
)
13740 int nunits
= GET_MODE_NUNITS (vmode
);
13742 /* If big-endian and two vectors we end up with a weird mixed-endian
13743 mode on NEON. Reverse the index within each word but not the word
13745 rperm
[i
] = GEN_INT (BYTES_BIG_ENDIAN
? d
->perm
[i
] ^ (nunits
- 1)
13748 sel
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
13749 sel
= force_reg (vmode
, sel
);
13751 aarch64_expand_vec_perm_1 (d
->target
, d
->op0
, d
->op1
, sel
);
13756 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d
*d
)
13758 /* The pattern matching functions above are written to look for a small
13759 number to begin the sequence (0, 1, N/2). If we begin with an index
13760 from the second operand, we can swap the operands. */
13761 unsigned int nelt
= d
->perm
.length ();
13762 if (d
->perm
[0] >= nelt
)
13764 gcc_assert (nelt
== (nelt
& -nelt
));
13765 for (unsigned int i
= 0; i
< nelt
; ++i
)
13766 d
->perm
[i
] ^= nelt
; /* Keep the same index, but in the other vector. */
13768 std::swap (d
->op0
, d
->op1
);
13773 if (aarch64_evpc_rev (d
))
13775 else if (aarch64_evpc_ext (d
))
13777 else if (aarch64_evpc_dup (d
))
13779 else if (aarch64_evpc_zip (d
))
13781 else if (aarch64_evpc_uzp (d
))
13783 else if (aarch64_evpc_trn (d
))
13785 return aarch64_evpc_tbl (d
);
13790 /* Expand a vec_perm_const pattern. */
13793 aarch64_expand_vec_perm_const (rtx target
, rtx op0
, rtx op1
, rtx sel
)
13795 struct expand_vec_perm_d d
;
13796 int i
, nelt
, which
;
13802 d
.vmode
= GET_MODE (target
);
13803 gcc_assert (VECTOR_MODE_P (d
.vmode
));
13804 d
.testing_p
= false;
13806 nelt
= GET_MODE_NUNITS (d
.vmode
);
13807 d
.perm
.reserve (nelt
);
13808 for (i
= which
= 0; i
< nelt
; ++i
)
13810 rtx e
= XVECEXP (sel
, 0, i
);
13811 int ei
= INTVAL (e
) & (2 * nelt
- 1);
13812 which
|= (ei
< nelt
? 1 : 2);
13813 d
.perm
.quick_push (ei
);
13819 gcc_unreachable ();
13822 d
.one_vector_p
= false;
13823 if (!rtx_equal_p (op0
, op1
))
13826 /* The elements of PERM do not suggest that only the first operand
13827 is used, but both operands are identical. Allow easier matching
13828 of the permutation by folding the permutation into the single
13830 /* Fall Through. */
13832 for (i
= 0; i
< nelt
; ++i
)
13833 d
.perm
[i
] &= nelt
- 1;
13835 d
.one_vector_p
= true;
13840 d
.one_vector_p
= true;
13844 return aarch64_expand_vec_perm_const_1 (&d
);
13848 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode
, vec_perm_indices sel
)
13850 struct expand_vec_perm_d d
;
13851 unsigned int i
, nelt
, which
;
13855 d
.testing_p
= true;
13856 d
.perm
.safe_splice (sel
);
13858 /* Calculate whether all elements are in one vector. */
13859 nelt
= sel
.length ();
13860 for (i
= which
= 0; i
< nelt
; ++i
)
13862 unsigned int e
= d
.perm
[i
];
13863 gcc_assert (e
< 2 * nelt
);
13864 which
|= (e
< nelt
? 1 : 2);
13867 /* If all elements are from the second vector, reindex as if from the
13870 for (i
= 0; i
< nelt
; ++i
)
13873 /* Check whether the mask can be applied to a single vector. */
13874 d
.one_vector_p
= (which
!= 3);
13876 d
.target
= gen_raw_REG (d
.vmode
, LAST_VIRTUAL_REGISTER
+ 1);
13877 d
.op1
= d
.op0
= gen_raw_REG (d
.vmode
, LAST_VIRTUAL_REGISTER
+ 2);
13878 if (!d
.one_vector_p
)
13879 d
.op1
= gen_raw_REG (d
.vmode
, LAST_VIRTUAL_REGISTER
+ 3);
13882 ret
= aarch64_expand_vec_perm_const_1 (&d
);
13889 aarch64_reverse_mask (machine_mode mode
)
13891 /* We have to reverse each vector because we dont have
13892 a permuted load that can reverse-load according to ABI rules. */
13894 rtvec v
= rtvec_alloc (16);
13896 int nunits
= GET_MODE_NUNITS (mode
);
13897 int usize
= GET_MODE_UNIT_SIZE (mode
);
13899 gcc_assert (BYTES_BIG_ENDIAN
);
13900 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode
));
13902 for (i
= 0; i
< nunits
; i
++)
13903 for (j
= 0; j
< usize
; j
++)
13904 RTVEC_ELT (v
, i
* usize
+ j
) = GEN_INT ((i
+ 1) * usize
- 1 - j
);
13905 mask
= gen_rtx_CONST_VECTOR (V16QImode
, v
);
13906 return force_reg (V16QImode
, mask
);
13909 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
13910 true. However due to issues with register allocation it is preferable
13911 to avoid tieing integer scalar and FP scalar modes. Executing integer
13912 operations in general registers is better than treating them as scalar
13913 vector operations. This reduces latency and avoids redundant int<->FP
13914 moves. So tie modes if they are either the same class, or vector modes
13915 with other vector modes, vector structs or any scalar mode. */
13918 aarch64_modes_tieable_p (machine_mode mode1
, machine_mode mode2
)
13920 if (GET_MODE_CLASS (mode1
) == GET_MODE_CLASS (mode2
))
13923 /* We specifically want to allow elements of "structure" modes to
13924 be tieable to the structure. This more general condition allows
13925 other rarer situations too. */
13926 if (aarch64_vector_mode_p (mode1
) && aarch64_vector_mode_p (mode2
))
13929 /* Also allow any scalar modes with vectors. */
13930 if (aarch64_vector_mode_supported_p (mode1
)
13931 || aarch64_vector_mode_supported_p (mode2
))
13937 /* Return a new RTX holding the result of moving POINTER forward by
13941 aarch64_move_pointer (rtx pointer
, int amount
)
13943 rtx next
= plus_constant (Pmode
, XEXP (pointer
, 0), amount
);
13945 return adjust_automodify_address (pointer
, GET_MODE (pointer
),
13949 /* Return a new RTX holding the result of moving POINTER forward by the
13950 size of the mode it points to. */
13953 aarch64_progress_pointer (rtx pointer
)
13955 HOST_WIDE_INT amount
= GET_MODE_SIZE (GET_MODE (pointer
));
13957 return aarch64_move_pointer (pointer
, amount
);
13960 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13964 aarch64_copy_one_block_and_progress_pointers (rtx
*src
, rtx
*dst
,
13967 rtx reg
= gen_reg_rtx (mode
);
13969 /* "Cast" the pointers to the correct mode. */
13970 *src
= adjust_address (*src
, mode
, 0);
13971 *dst
= adjust_address (*dst
, mode
, 0);
13972 /* Emit the memcpy. */
13973 emit_move_insn (reg
, *src
);
13974 emit_move_insn (*dst
, reg
);
13975 /* Move the pointers forward. */
13976 *src
= aarch64_progress_pointer (*src
);
13977 *dst
= aarch64_progress_pointer (*dst
);
13980 /* Expand movmem, as if from a __builtin_memcpy. Return true if
13981 we succeed, otherwise return false. */
13984 aarch64_expand_movmem (rtx
*operands
)
13987 rtx dst
= operands
[0];
13988 rtx src
= operands
[1];
13990 bool speed_p
= !optimize_function_for_size_p (cfun
);
13992 /* When optimizing for size, give a better estimate of the length of a
13993 memcpy call, but use the default otherwise. */
13994 unsigned int max_instructions
= (speed_p
? 15 : AARCH64_CALL_RATIO
) / 2;
13996 /* We can't do anything smart if the amount to copy is not constant. */
13997 if (!CONST_INT_P (operands
[2]))
14000 n
= UINTVAL (operands
[2]);
14002 /* Try to keep the number of instructions low. For cases below 16 bytes we
14003 need to make at most two moves. For cases above 16 bytes it will be one
14004 move for each 16 byte chunk, then at most two additional moves. */
14005 if (((n
/ 16) + (n
% 16 ? 2 : 0)) > max_instructions
)
14008 base
= copy_to_mode_reg (Pmode
, XEXP (dst
, 0));
14009 dst
= adjust_automodify_address (dst
, VOIDmode
, base
, 0);
14011 base
= copy_to_mode_reg (Pmode
, XEXP (src
, 0));
14012 src
= adjust_automodify_address (src
, VOIDmode
, base
, 0);
14014 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
14020 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, HImode
);
14025 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, QImode
);
14030 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
14031 4-byte chunk, partially overlapping with the previously copied chunk. */
14034 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
14040 src
= aarch64_move_pointer (src
, move
);
14041 dst
= aarch64_move_pointer (dst
, move
);
14042 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
14047 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
14048 them, then (if applicable) an 8-byte chunk. */
14053 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, TImode
);
14058 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, DImode
);
14063 /* Finish the final bytes of the copy. We can always do this in one
14064 instruction. We either copy the exact amount we need, or partially
14065 overlap with the previous chunk we copied and copy 8-bytes. */
14069 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, QImode
);
14071 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, HImode
);
14073 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
14078 src
= aarch64_move_pointer (src
, -1);
14079 dst
= aarch64_move_pointer (dst
, -1);
14080 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
14086 src
= aarch64_move_pointer (src
, move
);
14087 dst
= aarch64_move_pointer (dst
, move
);
14088 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, DImode
);
14095 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
14096 SImode stores. Handle the case when the constant has identical
14097 bottom and top halves. This is beneficial when the two stores can be
14098 merged into an STP and we avoid synthesising potentially expensive
14099 immediates twice. Return true if such a split is possible. */
14102 aarch64_split_dimode_const_store (rtx dst
, rtx src
)
14104 rtx lo
= gen_lowpart (SImode
, src
);
14105 rtx hi
= gen_highpart_mode (SImode
, DImode
, src
);
14107 bool size_p
= optimize_function_for_size_p (cfun
);
14109 if (!rtx_equal_p (lo
, hi
))
14112 unsigned int orig_cost
14113 = aarch64_internal_mov_immediate (NULL_RTX
, src
, false, DImode
);
14114 unsigned int lo_cost
14115 = aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, SImode
);
14117 /* We want to transform:
14119 MOVK x1, 0x140, lsl 16
14120 MOVK x1, 0xc0da, lsl 32
14121 MOVK x1, 0x140, lsl 48
14125 MOVK w1, 0x140, lsl 16
14127 So we want to perform this only when we save two instructions
14128 or more. When optimizing for size, however, accept any code size
14130 if (size_p
&& orig_cost
<= lo_cost
)
14134 && (orig_cost
<= lo_cost
+ 1))
14137 rtx mem_lo
= adjust_address (dst
, SImode
, 0);
14138 if (!aarch64_mem_pair_operand (mem_lo
, SImode
))
14141 rtx tmp_reg
= gen_reg_rtx (SImode
);
14142 aarch64_expand_mov_immediate (tmp_reg
, lo
);
14143 rtx mem_hi
= aarch64_move_pointer (mem_lo
, GET_MODE_SIZE (SImode
));
14144 /* Don't emit an explicit store pair as this may not be always profitable.
14145 Let the sched-fusion logic decide whether to merge them. */
14146 emit_move_insn (mem_lo
, tmp_reg
);
14147 emit_move_insn (mem_hi
, tmp_reg
);
14152 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
14154 static unsigned HOST_WIDE_INT
14155 aarch64_asan_shadow_offset (void)
14157 return (HOST_WIDE_INT_1
<< 36);
14161 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size
,
14162 unsigned int align
,
14163 enum by_pieces_operation op
,
14166 /* STORE_BY_PIECES can be used when copying a constant string, but
14167 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
14168 For now we always fail this and let the move_by_pieces code copy
14169 the string from read-only memory. */
14170 if (op
== STORE_BY_PIECES
)
14173 return default_use_by_pieces_infrastructure_p (size
, align
, op
, speed_p
);
14177 aarch64_gen_ccmp_first (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
,
14178 int code
, tree treeop0
, tree treeop1
)
14180 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
14182 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
14184 struct expand_operand ops
[4];
14187 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
14189 op_mode
= GET_MODE (op0
);
14190 if (op_mode
== VOIDmode
)
14191 op_mode
= GET_MODE (op1
);
14199 icode
= CODE_FOR_cmpsi
;
14204 icode
= CODE_FOR_cmpdi
;
14209 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
14210 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpesf
: CODE_FOR_fcmpsf
;
14215 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
14216 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpedf
: CODE_FOR_fcmpdf
;
14224 op0
= prepare_operand (icode
, op0
, 0, op_mode
, cmp_mode
, unsignedp
);
14225 op1
= prepare_operand (icode
, op1
, 1, op_mode
, cmp_mode
, unsignedp
);
14231 *prep_seq
= get_insns ();
14234 create_fixed_operand (&ops
[0], op0
);
14235 create_fixed_operand (&ops
[1], op1
);
14238 if (!maybe_expand_insn (icode
, 2, ops
))
14243 *gen_seq
= get_insns ();
14246 return gen_rtx_fmt_ee ((rtx_code
) code
, cc_mode
,
14247 gen_rtx_REG (cc_mode
, CC_REGNUM
), const0_rtx
);
14251 aarch64_gen_ccmp_next (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
, rtx prev
,
14252 int cmp_code
, tree treeop0
, tree treeop1
, int bit_code
)
14254 rtx op0
, op1
, target
;
14255 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
14256 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
14258 struct expand_operand ops
[6];
14261 push_to_sequence (*prep_seq
);
14262 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
14264 op_mode
= GET_MODE (op0
);
14265 if (op_mode
== VOIDmode
)
14266 op_mode
= GET_MODE (op1
);
14274 icode
= CODE_FOR_ccmpsi
;
14279 icode
= CODE_FOR_ccmpdi
;
14284 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
14285 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpesf
: CODE_FOR_fccmpsf
;
14290 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
14291 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpedf
: CODE_FOR_fccmpdf
;
14299 op0
= prepare_operand (icode
, op0
, 2, op_mode
, cmp_mode
, unsignedp
);
14300 op1
= prepare_operand (icode
, op1
, 3, op_mode
, cmp_mode
, unsignedp
);
14306 *prep_seq
= get_insns ();
14309 target
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
14310 aarch64_cond
= aarch64_get_condition_code_1 (cc_mode
, (rtx_code
) cmp_code
);
14312 if (bit_code
!= AND
)
14314 prev
= gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev
),
14315 GET_MODE (XEXP (prev
, 0))),
14316 VOIDmode
, XEXP (prev
, 0), const0_rtx
);
14317 aarch64_cond
= AARCH64_INVERSE_CONDITION_CODE (aarch64_cond
);
14320 create_fixed_operand (&ops
[0], XEXP (prev
, 0));
14321 create_fixed_operand (&ops
[1], target
);
14322 create_fixed_operand (&ops
[2], op0
);
14323 create_fixed_operand (&ops
[3], op1
);
14324 create_fixed_operand (&ops
[4], prev
);
14325 create_fixed_operand (&ops
[5], GEN_INT (aarch64_cond
));
14327 push_to_sequence (*gen_seq
);
14328 if (!maybe_expand_insn (icode
, 6, ops
))
14334 *gen_seq
= get_insns ();
14337 return gen_rtx_fmt_ee ((rtx_code
) cmp_code
, VOIDmode
, target
, const0_rtx
);
14340 #undef TARGET_GEN_CCMP_FIRST
14341 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
14343 #undef TARGET_GEN_CCMP_NEXT
14344 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
14346 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
14347 instruction fusion of some sort. */
14350 aarch64_macro_fusion_p (void)
14352 return aarch64_tune_params
.fusible_ops
!= AARCH64_FUSE_NOTHING
;
14356 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
14357 should be kept together during scheduling. */
14360 aarch_macro_fusion_pair_p (rtx_insn
*prev
, rtx_insn
*curr
)
14363 rtx prev_set
= single_set (prev
);
14364 rtx curr_set
= single_set (curr
);
14365 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
14366 bool simple_sets_p
= prev_set
&& curr_set
&& !any_condjump_p (curr
);
14368 if (!aarch64_macro_fusion_p ())
14371 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK
))
14373 /* We are trying to match:
14374 prev (mov) == (set (reg r0) (const_int imm16))
14375 curr (movk) == (set (zero_extract (reg r0)
14378 (const_int imm16_1)) */
14380 set_dest
= SET_DEST (curr_set
);
14382 if (GET_CODE (set_dest
) == ZERO_EXTRACT
14383 && CONST_INT_P (SET_SRC (curr_set
))
14384 && CONST_INT_P (SET_SRC (prev_set
))
14385 && CONST_INT_P (XEXP (set_dest
, 2))
14386 && INTVAL (XEXP (set_dest
, 2)) == 16
14387 && REG_P (XEXP (set_dest
, 0))
14388 && REG_P (SET_DEST (prev_set
))
14389 && REGNO (XEXP (set_dest
, 0)) == REGNO (SET_DEST (prev_set
)))
14395 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD
))
14398 /* We're trying to match:
14399 prev (adrp) == (set (reg r1)
14400 (high (symbol_ref ("SYM"))))
14401 curr (add) == (set (reg r0)
14403 (symbol_ref ("SYM"))))
14404 Note that r0 need not necessarily be the same as r1, especially
14405 during pre-regalloc scheduling. */
14407 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
14408 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
14410 if (GET_CODE (SET_SRC (curr_set
)) == LO_SUM
14411 && REG_P (XEXP (SET_SRC (curr_set
), 0))
14412 && REGNO (XEXP (SET_SRC (curr_set
), 0))
14413 == REGNO (SET_DEST (prev_set
))
14414 && rtx_equal_p (XEXP (SET_SRC (prev_set
), 0),
14415 XEXP (SET_SRC (curr_set
), 1)))
14420 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK
))
14423 /* We're trying to match:
14424 prev (movk) == (set (zero_extract (reg r0)
14427 (const_int imm16_1))
14428 curr (movk) == (set (zero_extract (reg r0)
14431 (const_int imm16_2)) */
14433 if (GET_CODE (SET_DEST (prev_set
)) == ZERO_EXTRACT
14434 && GET_CODE (SET_DEST (curr_set
)) == ZERO_EXTRACT
14435 && REG_P (XEXP (SET_DEST (prev_set
), 0))
14436 && REG_P (XEXP (SET_DEST (curr_set
), 0))
14437 && REGNO (XEXP (SET_DEST (prev_set
), 0))
14438 == REGNO (XEXP (SET_DEST (curr_set
), 0))
14439 && CONST_INT_P (XEXP (SET_DEST (prev_set
), 2))
14440 && CONST_INT_P (XEXP (SET_DEST (curr_set
), 2))
14441 && INTVAL (XEXP (SET_DEST (prev_set
), 2)) == 32
14442 && INTVAL (XEXP (SET_DEST (curr_set
), 2)) == 48
14443 && CONST_INT_P (SET_SRC (prev_set
))
14444 && CONST_INT_P (SET_SRC (curr_set
)))
14448 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR
))
14450 /* We're trying to match:
14451 prev (adrp) == (set (reg r0)
14452 (high (symbol_ref ("SYM"))))
14453 curr (ldr) == (set (reg r1)
14454 (mem (lo_sum (reg r0)
14455 (symbol_ref ("SYM")))))
14457 curr (ldr) == (set (reg r1)
14460 (symbol_ref ("SYM")))))) */
14461 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
14462 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
14464 rtx curr_src
= SET_SRC (curr_set
);
14466 if (GET_CODE (curr_src
) == ZERO_EXTEND
)
14467 curr_src
= XEXP (curr_src
, 0);
14469 if (MEM_P (curr_src
) && GET_CODE (XEXP (curr_src
, 0)) == LO_SUM
14470 && REG_P (XEXP (XEXP (curr_src
, 0), 0))
14471 && REGNO (XEXP (XEXP (curr_src
, 0), 0))
14472 == REGNO (SET_DEST (prev_set
))
14473 && rtx_equal_p (XEXP (XEXP (curr_src
, 0), 1),
14474 XEXP (SET_SRC (prev_set
), 0)))
14479 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC
)
14480 && aarch_crypto_can_dual_issue (prev
, curr
))
14483 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH
)
14484 && any_condjump_p (curr
))
14486 enum attr_type prev_type
= get_attr_type (prev
);
14488 unsigned int condreg1
, condreg2
;
14490 aarch64_fixed_condition_code_regs (&condreg1
, &condreg2
);
14491 cc_reg_1
= gen_rtx_REG (CCmode
, condreg1
);
14493 if (reg_referenced_p (cc_reg_1
, PATTERN (curr
))
14495 && modified_in_p (cc_reg_1
, prev
))
14497 /* FIXME: this misses some which is considered simple arthematic
14498 instructions for ThunderX. Simple shifts are missed here. */
14499 if (prev_type
== TYPE_ALUS_SREG
14500 || prev_type
== TYPE_ALUS_IMM
14501 || prev_type
== TYPE_LOGICS_REG
14502 || prev_type
== TYPE_LOGICS_IMM
)
14509 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH
)
14510 && any_condjump_p (curr
))
14512 /* We're trying to match:
14513 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
14514 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
14516 (label_ref ("SYM"))
14518 if (SET_DEST (curr_set
) == (pc_rtx
)
14519 && GET_CODE (SET_SRC (curr_set
)) == IF_THEN_ELSE
14520 && REG_P (XEXP (XEXP (SET_SRC (curr_set
), 0), 0))
14521 && REG_P (SET_DEST (prev_set
))
14522 && REGNO (SET_DEST (prev_set
))
14523 == REGNO (XEXP (XEXP (SET_SRC (curr_set
), 0), 0)))
14525 /* Fuse ALU operations followed by conditional branch instruction. */
14526 switch (get_attr_type (prev
))
14529 case TYPE_ALU_SREG
:
14532 case TYPE_ADCS_REG
:
14533 case TYPE_ADCS_IMM
:
14534 case TYPE_LOGIC_REG
:
14535 case TYPE_LOGIC_IMM
:
14539 case TYPE_SHIFT_REG
:
14540 case TYPE_SHIFT_IMM
:
14555 /* Return true iff the instruction fusion described by OP is enabled. */
14558 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op
)
14560 return (aarch64_tune_params
.fusible_ops
& op
) != 0;
14563 /* If MEM is in the form of [base+offset], extract the two parts
14564 of address and set to BASE and OFFSET, otherwise return false
14565 after clearing BASE and OFFSET. */
14568 extract_base_offset_in_addr (rtx mem
, rtx
*base
, rtx
*offset
)
14572 gcc_assert (MEM_P (mem
));
14574 addr
= XEXP (mem
, 0);
14579 *offset
= const0_rtx
;
14583 if (GET_CODE (addr
) == PLUS
14584 && REG_P (XEXP (addr
, 0)) && CONST_INT_P (XEXP (addr
, 1)))
14586 *base
= XEXP (addr
, 0);
14587 *offset
= XEXP (addr
, 1);
14592 *offset
= NULL_RTX
;
/* Types for scheduling fusion.  The LD/ST members are referenced by
   fusion_load_store below; they double as priority-class multipliers.  */
enum sched_fusion_type
{
  SCHED_FUSION_NONE = 0,
  SCHED_FUSION_LD_SIGN_EXTEND,
  SCHED_FUSION_LD_ZERO_EXTEND,
  SCHED_FUSION_LD,
  SCHED_FUSION_ST,
  SCHED_FUSION_NUM
};
14608 /* If INSN is a load or store of address in the form of [base+offset],
14609 extract the two parts and set to BASE and OFFSET. Return scheduling
14610 fusion type this INSN is. */
14612 static enum sched_fusion_type
14613 fusion_load_store (rtx_insn
*insn
, rtx
*base
, rtx
*offset
)
14616 enum sched_fusion_type fusion
= SCHED_FUSION_LD
;
14618 gcc_assert (INSN_P (insn
));
14619 x
= PATTERN (insn
);
14620 if (GET_CODE (x
) != SET
)
14621 return SCHED_FUSION_NONE
;
14624 dest
= SET_DEST (x
);
14626 machine_mode dest_mode
= GET_MODE (dest
);
14628 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode
))
14629 return SCHED_FUSION_NONE
;
14631 if (GET_CODE (src
) == SIGN_EXTEND
)
14633 fusion
= SCHED_FUSION_LD_SIGN_EXTEND
;
14634 src
= XEXP (src
, 0);
14635 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
14636 return SCHED_FUSION_NONE
;
14638 else if (GET_CODE (src
) == ZERO_EXTEND
)
14640 fusion
= SCHED_FUSION_LD_ZERO_EXTEND
;
14641 src
= XEXP (src
, 0);
14642 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
14643 return SCHED_FUSION_NONE
;
14646 if (GET_CODE (src
) == MEM
&& REG_P (dest
))
14647 extract_base_offset_in_addr (src
, base
, offset
);
14648 else if (GET_CODE (dest
) == MEM
&& (REG_P (src
) || src
== const0_rtx
))
14650 fusion
= SCHED_FUSION_ST
;
14651 extract_base_offset_in_addr (dest
, base
, offset
);
14654 return SCHED_FUSION_NONE
;
14656 if (*base
== NULL_RTX
|| *offset
== NULL_RTX
)
14657 fusion
= SCHED_FUSION_NONE
;
14662 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14664 Currently we only support to fuse ldr or str instructions, so FUSION_PRI
14665 and PRI are only calculated for these instructions. For other instruction,
14666 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
14667 type instruction fusion can be added by returning different priorities.
14669 It's important that irrelevant instructions get the largest FUSION_PRI. */
14672 aarch64_sched_fusion_priority (rtx_insn
*insn
, int max_pri
,
14673 int *fusion_pri
, int *pri
)
14677 enum sched_fusion_type fusion
;
14679 gcc_assert (INSN_P (insn
));
14682 fusion
= fusion_load_store (insn
, &base
, &offset
);
14683 if (fusion
== SCHED_FUSION_NONE
)
14690 /* Set FUSION_PRI according to fusion type and base register. */
14691 *fusion_pri
= tmp
- fusion
* FIRST_PSEUDO_REGISTER
- REGNO (base
);
14693 /* Calculate PRI. */
14696 /* INSN with smaller offset goes first. */
14697 off_val
= (int)(INTVAL (offset
));
14699 tmp
-= (off_val
& 0xfffff);
14701 tmp
+= ((- off_val
) & 0xfffff);
14707 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14708 Adjust priority of sha1h instructions so they are scheduled before
14709 other SHA1 instructions. */
14712 aarch64_sched_adjust_priority (rtx_insn
*insn
, int priority
)
14714 rtx x
= PATTERN (insn
);
14716 if (GET_CODE (x
) == SET
)
14720 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_SHA1H
)
14721 return priority
+ 10;
14727 /* Given OPERANDS of consecutive load/store, check if we can merge
14728 them into ldp/stp. LOAD is true if they are load instructions.
14729 MODE is the mode of memory operands. */
14732 aarch64_operands_ok_for_ldpstp (rtx
*operands
, bool load
,
14735 HOST_WIDE_INT offval_1
, offval_2
, msize
;
14736 enum reg_class rclass_1
, rclass_2
;
14737 rtx mem_1
, mem_2
, reg_1
, reg_2
, base_1
, base_2
, offset_1
, offset_2
;
14741 mem_1
= operands
[1];
14742 mem_2
= operands
[3];
14743 reg_1
= operands
[0];
14744 reg_2
= operands
[2];
14745 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
));
14746 if (REGNO (reg_1
) == REGNO (reg_2
))
14751 mem_1
= operands
[0];
14752 mem_2
= operands
[2];
14753 reg_1
= operands
[1];
14754 reg_2
= operands
[3];
14757 /* The mems cannot be volatile. */
14758 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
))
14761 /* If we have SImode and slow unaligned ldp,
14762 check the alignment to be at least 8 byte. */
14764 && (aarch64_tune_params
.extra_tuning_flags
14765 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
14767 && MEM_ALIGN (mem_1
) < 8 * BITS_PER_UNIT
)
14770 /* Check if the addresses are in the form of [base+offset]. */
14771 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
14772 if (base_1
== NULL_RTX
|| offset_1
== NULL_RTX
)
14774 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
14775 if (base_2
== NULL_RTX
|| offset_2
== NULL_RTX
)
14778 /* Check if the bases are same. */
14779 if (!rtx_equal_p (base_1
, base_2
))
14782 offval_1
= INTVAL (offset_1
);
14783 offval_2
= INTVAL (offset_2
);
14784 msize
= GET_MODE_SIZE (mode
);
14785 /* Check if the offsets are consecutive. */
14786 if (offval_1
!= (offval_2
+ msize
) && offval_2
!= (offval_1
+ msize
))
14789 /* Check if the addresses are clobbered by load. */
14792 if (reg_mentioned_p (reg_1
, mem_1
))
14795 /* In increasing order, the last load can clobber the address. */
14796 if (offval_1
> offval_2
&& reg_mentioned_p (reg_2
, mem_2
))
14800 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
14801 rclass_1
= FP_REGS
;
14803 rclass_1
= GENERAL_REGS
;
14805 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
14806 rclass_2
= FP_REGS
;
14808 rclass_2
= GENERAL_REGS
;
14810 /* Check if the registers are of same class. */
14811 if (rclass_1
!= rclass_2
)
14817 /* Given OPERANDS of consecutive load/store, check if we can merge
14818 them into ldp/stp by adjusting the offset. LOAD is true if they
14819 are load instructions. MODE is the mode of memory operands.
14821 Given below consecutive stores:
14823 str w1, [xb, 0x100]
14824 str w1, [xb, 0x104]
14825 str w1, [xb, 0x108]
14826 str w1, [xb, 0x10c]
14828 Though the offsets are out of the range supported by stp, we can
14829 still pair them after adjusting the offset, like:
14831 add scratch, xb, 0x100
14832 stp w1, w1, [scratch]
14833 stp w1, w1, [scratch, 0x8]
14835 The peephole patterns detecting this opportunity should guarantee
14836 the scratch register is avaliable. */
14839 aarch64_operands_adjust_ok_for_ldpstp (rtx
*operands
, bool load
,
14842 enum reg_class rclass_1
, rclass_2
, rclass_3
, rclass_4
;
14843 HOST_WIDE_INT offval_1
, offval_2
, offval_3
, offval_4
, msize
;
14844 rtx mem_1
, mem_2
, mem_3
, mem_4
, reg_1
, reg_2
, reg_3
, reg_4
;
14845 rtx base_1
, base_2
, base_3
, base_4
, offset_1
, offset_2
, offset_3
, offset_4
;
14849 reg_1
= operands
[0];
14850 mem_1
= operands
[1];
14851 reg_2
= operands
[2];
14852 mem_2
= operands
[3];
14853 reg_3
= operands
[4];
14854 mem_3
= operands
[5];
14855 reg_4
= operands
[6];
14856 mem_4
= operands
[7];
14857 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
)
14858 && REG_P (reg_3
) && REG_P (reg_4
));
14859 if (REGNO (reg_1
) == REGNO (reg_2
) || REGNO (reg_3
) == REGNO (reg_4
))
14864 mem_1
= operands
[0];
14865 reg_1
= operands
[1];
14866 mem_2
= operands
[2];
14867 reg_2
= operands
[3];
14868 mem_3
= operands
[4];
14869 reg_3
= operands
[5];
14870 mem_4
= operands
[6];
14871 reg_4
= operands
[7];
14873 /* Skip if memory operand is by itslef valid for ldp/stp. */
14874 if (!MEM_P (mem_1
) || aarch64_mem_pair_operand (mem_1
, mode
))
14877 /* The mems cannot be volatile. */
14878 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
)
14879 || MEM_VOLATILE_P (mem_3
) ||MEM_VOLATILE_P (mem_4
))
14882 /* Check if the addresses are in the form of [base+offset]. */
14883 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
14884 if (base_1
== NULL_RTX
|| offset_1
== NULL_RTX
)
14886 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
14887 if (base_2
== NULL_RTX
|| offset_2
== NULL_RTX
)
14889 extract_base_offset_in_addr (mem_3
, &base_3
, &offset_3
);
14890 if (base_3
== NULL_RTX
|| offset_3
== NULL_RTX
)
14892 extract_base_offset_in_addr (mem_4
, &base_4
, &offset_4
);
14893 if (base_4
== NULL_RTX
|| offset_4
== NULL_RTX
)
14896 /* Check if the bases are same. */
14897 if (!rtx_equal_p (base_1
, base_2
)
14898 || !rtx_equal_p (base_2
, base_3
)
14899 || !rtx_equal_p (base_3
, base_4
))
14902 offval_1
= INTVAL (offset_1
);
14903 offval_2
= INTVAL (offset_2
);
14904 offval_3
= INTVAL (offset_3
);
14905 offval_4
= INTVAL (offset_4
);
14906 msize
= GET_MODE_SIZE (mode
);
14907 /* Check if the offsets are consecutive. */
14908 if ((offval_1
!= (offval_2
+ msize
)
14909 || offval_1
!= (offval_3
+ msize
* 2)
14910 || offval_1
!= (offval_4
+ msize
* 3))
14911 && (offval_4
!= (offval_3
+ msize
)
14912 || offval_4
!= (offval_2
+ msize
* 2)
14913 || offval_4
!= (offval_1
+ msize
* 3)))
14916 /* Check if the addresses are clobbered by load. */
14919 if (reg_mentioned_p (reg_1
, mem_1
)
14920 || reg_mentioned_p (reg_2
, mem_2
)
14921 || reg_mentioned_p (reg_3
, mem_3
))
14924 /* In increasing order, the last load can clobber the address. */
14925 if (offval_1
> offval_2
&& reg_mentioned_p (reg_4
, mem_4
))
14929 /* If we have SImode and slow unaligned ldp,
14930 check the alignment to be at least 8 byte. */
14932 && (aarch64_tune_params
.extra_tuning_flags
14933 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
14935 && MEM_ALIGN (mem_1
) < 8 * BITS_PER_UNIT
)
14938 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
14939 rclass_1
= FP_REGS
;
14941 rclass_1
= GENERAL_REGS
;
14943 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
14944 rclass_2
= FP_REGS
;
14946 rclass_2
= GENERAL_REGS
;
14948 if (REG_P (reg_3
) && FP_REGNUM_P (REGNO (reg_3
)))
14949 rclass_3
= FP_REGS
;
14951 rclass_3
= GENERAL_REGS
;
14953 if (REG_P (reg_4
) && FP_REGNUM_P (REGNO (reg_4
)))
14954 rclass_4
= FP_REGS
;
14956 rclass_4
= GENERAL_REGS
;
14958 /* Check if the registers are of same class. */
14959 if (rclass_1
!= rclass_2
|| rclass_2
!= rclass_3
|| rclass_3
!= rclass_4
)
14965 /* Given OPERANDS of consecutive load/store, this function pairs them
14966 into ldp/stp after adjusting the offset. It depends on the fact
14967 that addresses of load/store instructions are in increasing order.
14968 MODE is the mode of memory operands. CODE is the rtl operator
14969 which should be applied to all memory operands, it's SIGN_EXTEND,
14970 ZERO_EXTEND or UNKNOWN. */
14973 aarch64_gen_adjusted_ldpstp (rtx
*operands
, bool load
,
14974 scalar_mode mode
, RTX_CODE code
)
14976 rtx base
, offset
, t1
, t2
;
14977 rtx mem_1
, mem_2
, mem_3
, mem_4
;
14978 HOST_WIDE_INT off_val
, abs_off
, adj_off
, new_off
, stp_off_limit
, msize
;
14982 mem_1
= operands
[1];
14983 mem_2
= operands
[3];
14984 mem_3
= operands
[5];
14985 mem_4
= operands
[7];
14989 mem_1
= operands
[0];
14990 mem_2
= operands
[2];
14991 mem_3
= operands
[4];
14992 mem_4
= operands
[6];
14993 gcc_assert (code
== UNKNOWN
);
14996 extract_base_offset_in_addr (mem_1
, &base
, &offset
);
14997 gcc_assert (base
!= NULL_RTX
&& offset
!= NULL_RTX
);
14999 /* Adjust offset thus it can fit in ldp/stp instruction. */
15000 msize
= GET_MODE_SIZE (mode
);
15001 stp_off_limit
= msize
* 0x40;
15002 off_val
= INTVAL (offset
);
15003 abs_off
= (off_val
< 0) ? -off_val
: off_val
;
15004 new_off
= abs_off
% stp_off_limit
;
15005 adj_off
= abs_off
- new_off
;
15007 /* Further adjust to make sure all offsets are OK. */
15008 if ((new_off
+ msize
* 2) >= stp_off_limit
)
15010 adj_off
+= stp_off_limit
;
15011 new_off
-= stp_off_limit
;
15014 /* Make sure the adjustment can be done with ADD/SUB instructions. */
15015 if (adj_off
>= 0x1000)
15020 adj_off
= -adj_off
;
15021 new_off
= -new_off
;
15024 /* Create new memory references. */
15025 mem_1
= change_address (mem_1
, VOIDmode
,
15026 plus_constant (DImode
, operands
[8], new_off
));
15028 /* Check if the adjusted address is OK for ldp/stp. */
15029 if (!aarch64_mem_pair_operand (mem_1
, mode
))
15032 msize
= GET_MODE_SIZE (mode
);
15033 mem_2
= change_address (mem_2
, VOIDmode
,
15034 plus_constant (DImode
,
15037 mem_3
= change_address (mem_3
, VOIDmode
,
15038 plus_constant (DImode
,
15040 new_off
+ msize
* 2));
15041 mem_4
= change_address (mem_4
, VOIDmode
,
15042 plus_constant (DImode
,
15044 new_off
+ msize
* 3));
15046 if (code
== ZERO_EXTEND
)
15048 mem_1
= gen_rtx_ZERO_EXTEND (DImode
, mem_1
);
15049 mem_2
= gen_rtx_ZERO_EXTEND (DImode
, mem_2
);
15050 mem_3
= gen_rtx_ZERO_EXTEND (DImode
, mem_3
);
15051 mem_4
= gen_rtx_ZERO_EXTEND (DImode
, mem_4
);
15053 else if (code
== SIGN_EXTEND
)
15055 mem_1
= gen_rtx_SIGN_EXTEND (DImode
, mem_1
);
15056 mem_2
= gen_rtx_SIGN_EXTEND (DImode
, mem_2
);
15057 mem_3
= gen_rtx_SIGN_EXTEND (DImode
, mem_3
);
15058 mem_4
= gen_rtx_SIGN_EXTEND (DImode
, mem_4
);
15063 operands
[1] = mem_1
;
15064 operands
[3] = mem_2
;
15065 operands
[5] = mem_3
;
15066 operands
[7] = mem_4
;
15070 operands
[0] = mem_1
;
15071 operands
[2] = mem_2
;
15072 operands
[4] = mem_3
;
15073 operands
[6] = mem_4
;
15076 /* Emit adjusting instruction. */
15077 emit_insn (gen_rtx_SET (operands
[8], plus_constant (DImode
, base
, adj_off
)));
15078 /* Emit ldp/stp instructions. */
15079 t1
= gen_rtx_SET (operands
[0], operands
[1]);
15080 t2
= gen_rtx_SET (operands
[2], operands
[3]);
15081 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
15082 t1
= gen_rtx_SET (operands
[4], operands
[5]);
15083 t2
= gen_rtx_SET (operands
[6], operands
[7]);
15084 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
15088 /* Return 1 if pseudo register should be created and used to hold
15089 GOT address for PIC code. */
15092 aarch64_use_pseudo_pic_reg (void)
15094 return aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
;
15097 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
15100 aarch64_unspec_may_trap_p (const_rtx x
, unsigned flags
)
15102 switch (XINT (x
, 1))
15104 case UNSPEC_GOTSMALLPIC
:
15105 case UNSPEC_GOTSMALLPIC28K
:
15106 case UNSPEC_GOTTINYPIC
:
15112 return default_unspec_may_trap_p (x
, flags
);
15116 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
15117 return the log2 of that value. Otherwise return -1. */
15120 aarch64_fpconst_pow_of_2 (rtx x
)
15122 const REAL_VALUE_TYPE
*r
;
15124 if (!CONST_DOUBLE_P (x
))
15127 r
= CONST_DOUBLE_REAL_VALUE (x
);
15129 if (REAL_VALUE_NEGATIVE (*r
)
15130 || REAL_VALUE_ISNAN (*r
)
15131 || REAL_VALUE_ISINF (*r
)
15132 || !real_isinteger (r
, DFmode
))
15135 return exact_log2 (real_to_integer (r
));
15138 /* If X is a vector of equal CONST_DOUBLE values and that value is
15139 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
15142 aarch64_vec_fpconst_pow_of_2 (rtx x
)
15144 if (GET_CODE (x
) != CONST_VECTOR
)
15147 if (GET_MODE_CLASS (GET_MODE (x
)) != MODE_VECTOR_FLOAT
)
15150 int firstval
= aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, 0));
15154 for (int i
= 1; i
< CONST_VECTOR_NUNITS (x
); i
++)
15155 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, i
)) != firstval
)
15161 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
15164 __fp16 always promotes through this hook.
15165 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
15166 through the generic excess precision logic rather than here. */
15169 aarch64_promoted_type (const_tree t
)
15171 if (SCALAR_FLOAT_TYPE_P (t
)
15172 && TYPE_MAIN_VARIANT (t
) == aarch64_fp16_type_node
)
15173 return float_type_node
;
15178 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
15181 aarch64_optab_supported_p (int op
, machine_mode mode1
, machine_mode
,
15182 optimization_type opt_type
)
15187 return opt_type
== OPTIMIZE_FOR_SPEED
&& use_rsqrt_p (mode1
);
15194 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
15195 if MODE is HFmode, and punt to the generic implementation otherwise. */
15198 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode
)
15200 return (mode
== HFmode
15202 : default_libgcc_floating_mode_supported_p (mode
));
15205 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
15206 if MODE is HFmode, and punt to the generic implementation otherwise. */
15209 aarch64_scalar_mode_supported_p (scalar_mode mode
)
15211 return (mode
== HFmode
15213 : default_scalar_mode_supported_p (mode
));
15216 /* Set the value of FLT_EVAL_METHOD.
15217 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
15219 0: evaluate all operations and constants, whose semantic type has at
15220 most the range and precision of type float, to the range and
15221 precision of float; evaluate all other operations and constants to
15222 the range and precision of the semantic type;
15224 N, where _FloatN is a supported interchange floating type
15225 evaluate all operations and constants, whose semantic type has at
15226 most the range and precision of _FloatN type, to the range and
15227 precision of the _FloatN type; evaluate all other operations and
15228 constants to the range and precision of the semantic type;
15230 If we have the ARMv8.2-A extensions then we support _Float16 in native
15231 precision, so we should set this to 16. Otherwise, we support the type,
15232 but want to evaluate expressions in float precision, so set this to
15235 static enum flt_eval_method
15236 aarch64_excess_precision (enum excess_precision_type type
)
15240 case EXCESS_PRECISION_TYPE_FAST
:
15241 case EXCESS_PRECISION_TYPE_STANDARD
:
15242 /* We can calculate either in 16-bit range and precision or
15243 32-bit range and precision. Make that decision based on whether
15244 we have native support for the ARMv8.2-A 16-bit floating-point
15245 instructions or not. */
15246 return (TARGET_FP_F16INST
15247 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
15248 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
);
15249 case EXCESS_PRECISION_TYPE_IMPLICIT
:
15250 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
;
15252 gcc_unreachable ();
15254 return FLT_EVAL_METHOD_UNPREDICTABLE
;
15257 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
15258 scheduled for speculative execution. Reject the long-running division
15259 and square-root instructions. */
15262 aarch64_sched_can_speculate_insn (rtx_insn
*insn
)
15264 switch (get_attr_type (insn
))
15272 case TYPE_NEON_FP_SQRT_S
:
15273 case TYPE_NEON_FP_SQRT_D
:
15274 case TYPE_NEON_FP_SQRT_S_Q
:
15275 case TYPE_NEON_FP_SQRT_D_Q
:
15276 case TYPE_NEON_FP_DIV_S
:
15277 case TYPE_NEON_FP_DIV_D
:
15278 case TYPE_NEON_FP_DIV_S_Q
:
15279 case TYPE_NEON_FP_DIV_D_Q
:
/* Target-specific selftests.  */

#if CHECKING_P

namespace selftest {

/* Selftest for the RTL loader.
   Verify that the RTL loader copes with a dump from
   print_rtx_function.  This is essentially just a test that class
   function_reader can handle a real dump, but it also verifies
   that lookup_reg_by_dump_name correctly handles hard regs.
   The presence of hard reg names in the dump means that the test is
   target-specific, hence it is in this file.  */

static void
aarch64_test_loading_full_dump ()
{
  rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));

  ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));

  rtx_insn *insn_1 = get_insn_by_uid (1);
  ASSERT_EQ (NOTE, GET_CODE (insn_1));

  rtx_insn *insn_15 = get_insn_by_uid (15);
  ASSERT_EQ (INSN, GET_CODE (insn_15));
  ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));

  /* Verify crtl->return_rtx.  */
  ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
  ASSERT_EQ (0, REGNO (crtl->return_rtx));
  ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
}

/* Run all target-specific selftests.  */

static void
aarch64_run_selftests (void)
{
  aarch64_test_loading_full_dump ();
}

} // namespace selftest

#endif /* #if CHECKING_P */
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST aarch64_address_cost

/* This hook determines whether unnamed bitfields affect the alignment
   of the containing structure.  The hook returns true if the structure
   should inherit the alignment requirements of an unnamed bitfield's
   type.  */
#undef TARGET_ALIGN_ANON_BITFIELD
#define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true

#undef TARGET_ASM_ALIGNED_DI_OP
#define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"

#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"

#undef TARGET_ASM_ALIGNED_SI_OP
#define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"

#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
  hook_bool_const_tree_hwi_hwi_const_tree_true

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START aarch64_start_file

#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk

#undef TARGET_ASM_SELECT_RTX_SECTION
#define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section

#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
#define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list

#undef TARGET_CALLEE_COPIES
#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE aarch64_can_eliminate

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P aarch64_can_inline_p

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem

#undef TARGET_CASE_VALUES_THRESHOLD
#define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage

/* Only the least significant bit is used for initialization guard
   variables.  */
#undef TARGET_CXX_GUARD_MASK_BIT
#define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix

#ifdef TARGET_BIG_ENDIAN_DEFAULT
#undef TARGET_DEFAULT_TARGET_FLAGS
#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
#endif

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs

#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL aarch64_builtin_decl

#undef TARGET_BUILTIN_RECIPROCAL
#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal

#undef TARGET_C_EXCESS_PRECISION
#define TARGET_C_EXCESS_PRECISION aarch64_excess_precision

#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN aarch64_fold_builtin

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG aarch64_function_arg

#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance

#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary

#undef TARGET_FUNCTION_ARG_PADDING
#define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE aarch64_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p

#undef TARGET_FRAME_POINTER_REQUIRED
#define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS aarch64_init_builtins

#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
  aarch64_ira_change_pseudo_allocno_class

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p

#undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
#define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
  aarch64_legitimize_address_displacement

#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode

#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
  aarch64_libgcc_floating_mode_supported_p

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE aarch64_mangle_type

#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost

#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul

#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size

/* This target hook should return true if accesses to volatile bitfields
   should use the narrowest mode possible.  It should return false if these
   accesses should use the bitfield container type.  */
#undef TARGET_NARROW_VOLATILE_BITFIELD
#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE aarch64_override_options

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
  aarch64_override_options_after_change

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE aarch64_option_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE aarch64_option_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT aarch64_option_print

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE aarch64_promoted_type

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
  aarch64_get_separate_components

#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
  aarch64_components_for_bb

#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
  aarch64_disqualify_components

#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
  aarch64_emit_prologue_components

#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
  aarch64_emit_epilogue_components

#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
  aarch64_set_handled_components

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
  aarch64_builtin_support_vector_misalignment

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  aarch64_autovectorize_vector_sizes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv

/* Section anchor support.  */

#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
  aarch64_vectorize_vec_perm_const_ok

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
#define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
  aarch64_use_by_pieces_infrastructure_p

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */
15737 struct gcc_target targetm
= TARGET_INITIALIZER
;
15739 #include "gt-aarch64.h"