/* Machine description for AArch64 architecture.
   Copyright (C) 2009-2018 Free Software Foundation, Inc.
   Contributed by ARM Ltd.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#define INCLUDE_STRING
#include "coretypes.h"
#include "stringpool.h"
#include "diagnostic.h"
#include "insn-attr.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "langhooks.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "aarch64-cost-tables.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "target-globals.h"
#include "common/common-target.h"
#include "selftest-rtl.h"
#include "rtx-vector-builder.h"

/* This file should be included last.  */
#include "target-def.h"

/* Defined for convenience.  */
#define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
/* Classifies an address.

   ADDRESS_REG_IMM
       A simple base register plus immediate offset.

   ADDRESS_REG_WB
       A base register indexed by immediate offset with writeback.

   ADDRESS_REG_REG
       A base register indexed by (optionally scaled) register.

   ADDRESS_REG_UXTW
       A base register indexed by (optionally scaled) zero-extended register.

   ADDRESS_REG_SXTW
       A base register indexed by (optionally scaled) sign-extended register.

   ADDRESS_LO_SUM
       A LO_SUM rtx with a base register and "LO12" symbol relocation.

   ADDRESS_SYMBOLIC
       A constant symbolic address, in pc-relative literal pool.  */

enum aarch64_address_type
{
  ADDRESS_REG_IMM,
  ADDRESS_REG_WB,
  ADDRESS_REG_REG,
  ADDRESS_REG_UXTW,
  ADDRESS_REG_SXTW,
  ADDRESS_LO_SUM,
  ADDRESS_SYMBOLIC
};

struct aarch64_address_info
{
  enum aarch64_address_type type;
  rtx base;
  rtx offset;
  poly_int64 const_offset;
  int shift;
  enum aarch64_symbol_type symbol_type;
};
/* Information about a legitimate vector immediate operand.  */
struct simd_immediate_info
{
  enum insn_type { MOV, MVN };
  enum modifier_type { LSL, MSL };

  simd_immediate_info () {}
  simd_immediate_info (scalar_float_mode, rtx);
  simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
		       insn_type = MOV, modifier_type = LSL,
		       unsigned int = 0);
  simd_immediate_info (scalar_mode, rtx, rtx);

  /* The mode of the elements.  */
  scalar_mode elt_mode;

  /* The value of each element if all elements are the same, or the
     first value if the constant is a series.  */
  rtx value;

  /* The value of the step if the constant is a series, null otherwise.  */
  rtx step;

  /* The instruction to use to move the immediate into a vector.  */
  insn_type insn;

  /* The kind of shift modifier to use, and the number of bits to shift.
     This is (LSL, 0) if no shift is needed.  */
  modifier_type modifier;
  unsigned int shift;
};
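
/* For example, the Advanced SIMD immediate behind "movi v0.4s, #255, lsl #8"
   corresponds roughly to elt_mode SImode, value 255, insn MOV, modifier LSL
   and shift 8, while a constant that needs no shift uses the default
   (LSL, 0).  */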
/* Construct a floating-point immediate in which each element has mode
   ELT_MODE_IN and value VALUE_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
  : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
    modifier (LSL), shift (0)
{}
/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and value VALUE_IN.  The other parameters are as for the structure
   fields.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
		       unsigned HOST_WIDE_INT value_in,
		       insn_type insn_in, modifier_type modifier_in,
		       unsigned int shift_in)
  : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
    step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
{}
/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and where element I is equal to VALUE_IN + I * STEP_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
  : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
    modifier (LSL), shift (0)
{}
/* The current code model.  */
enum aarch64_code_model aarch64_cmodel;

/* The number of 64-bit elements in an SVE vector.  */
poly_uint16 aarch64_sve_vg;

#ifdef HAVE_AS_TLS
#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS 1
#endif
static bool aarch64_composite_type_p (const_tree, machine_mode);
static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
						     const_tree,
						     machine_mode *, int *,
						     bool *);
static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_override_options_after_change (void);
static bool aarch64_vector_mode_supported_p (machine_mode);
static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
							 const_tree type,
							 int misalignment,
							 bool is_packed);
static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
static bool aarch64_print_ldpstp_address (FILE *, machine_mode, rtx);
/* Major revision number of the ARM Architecture implemented by the target.  */
unsigned aarch64_architecture_version;

/* The processor for which instructions should be scheduled.  */
enum aarch64_processor aarch64_tune = cortexa53;

/* Mask to specify which instruction scheduling options should be used.  */
unsigned long aarch64_tune_flags = 0;

/* Global flag for PC relative loads.  */
bool aarch64_pcrelative_literal_loads;
/* Support for command line parsing of boolean flags in the tuning
   structures.  */
struct aarch64_flag_desc
{
  const char* name;
  unsigned int flag;
};

#define AARCH64_FUSION_PAIR(name, internal_name) \
  { name, AARCH64_FUSE_##internal_name },
static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
{
  { "none", AARCH64_FUSE_NOTHING },
#include "aarch64-fusion-pairs.def"
  { "all", AARCH64_FUSE_ALL },
  { NULL, AARCH64_FUSE_NOTHING }
};

#define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
  { name, AARCH64_EXTRA_TUNE_##internal_name },
static const struct aarch64_flag_desc aarch64_tuning_flags[] =
{
  { "none", AARCH64_EXTRA_TUNE_NONE },
#include "aarch64-tuning-flags.def"
  { "all", AARCH64_EXTRA_TUNE_ALL },
  { NULL, AARCH64_EXTRA_TUNE_NONE }
};
/* Tuning parameters.  */

static const struct cpu_addrcost_table generic_addrcost_table =
{
  0, /* register_offset  */
  0, /* register_sextend  */
  0, /* register_zextend  */
};

static const struct cpu_addrcost_table exynosm1_addrcost_table =
{
  1, /* register_offset  */
  1, /* register_sextend  */
  2, /* register_zextend  */
};

static const struct cpu_addrcost_table xgene1_addrcost_table =
{
  0, /* register_offset  */
  1, /* register_sextend  */
  1, /* register_zextend  */
};

static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
{
  2, /* register_offset  */
  3, /* register_sextend  */
  3, /* register_zextend  */
};
static const struct cpu_regmove_cost generic_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost cortexa57_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost cortexa53_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost exynosm1_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost (actual, 4 and 9).  */
};

static const struct cpu_regmove_cost thunderx_regmove_cost =
{
};

static const struct cpu_regmove_cost xgene1_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost qdf24xx_regmove_cost =
{
  /* Avoid the use of int<->fp moves for spilling.  */
};

static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
{
  /* Avoid the use of int<->fp moves for spilling.  */
};
/* Generic costs for vector insn classes.  */
static const struct cpu_vector_cost generic_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  1, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  1, /* vec_int_stmt_cost  */
  1, /* vec_fp_stmt_cost  */
  2, /* vec_permute_cost  */
  1, /* vec_to_scalar_cost  */
  1, /* scalar_to_vec_cost  */
  1, /* vec_align_load_cost  */
  1, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  3, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};

/* ThunderX costs for vector insn classes.  */
static const struct cpu_vector_cost thunderx_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  3, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  4, /* vec_int_stmt_cost  */
  1, /* vec_fp_stmt_cost  */
  4, /* vec_permute_cost  */
  2, /* vec_to_scalar_cost  */
  2, /* scalar_to_vec_cost  */
  3, /* vec_align_load_cost  */
  5, /* vec_unalign_load_cost  */
  5, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  3, /* cond_taken_branch_cost  */
  3 /* cond_not_taken_branch_cost  */
};

/* Cortex-A57 costs for vector insn classes.  */
static const struct cpu_vector_cost cortexa57_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  2, /* vec_int_stmt_cost  */
  2, /* vec_fp_stmt_cost  */
  3, /* vec_permute_cost  */
  8, /* vec_to_scalar_cost  */
  8, /* scalar_to_vec_cost  */
  4, /* vec_align_load_cost  */
  4, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  1, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};

static const struct cpu_vector_cost exynosm1_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  5, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  3, /* vec_int_stmt_cost  */
  3, /* vec_fp_stmt_cost  */
  3, /* vec_permute_cost  */
  3, /* vec_to_scalar_cost  */
  3, /* scalar_to_vec_cost  */
  5, /* vec_align_load_cost  */
  5, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  1, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};

/* X-Gene 1 costs for vector insn classes.  */
static const struct cpu_vector_cost xgene1_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  5, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  2, /* vec_int_stmt_cost  */
  2, /* vec_fp_stmt_cost  */
  2, /* vec_permute_cost  */
  4, /* vec_to_scalar_cost  */
  4, /* scalar_to_vec_cost  */
  10, /* vec_align_load_cost  */
  10, /* vec_unalign_load_cost  */
  2, /* vec_unalign_store_cost  */
  2, /* vec_store_cost  */
  2, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};

/* Costs for vector insn classes for Vulcan.  */
static const struct cpu_vector_cost thunderx2t99_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  6, /* scalar_fp_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  5, /* vec_int_stmt_cost  */
  6, /* vec_fp_stmt_cost  */
  3, /* vec_permute_cost  */
  6, /* vec_to_scalar_cost  */
  5, /* scalar_to_vec_cost  */
  8, /* vec_align_load_cost  */
  8, /* vec_unalign_load_cost  */
  4, /* vec_unalign_store_cost  */
  4, /* vec_store_cost  */
  2, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};
/* Generic costs for branch instructions.  */
static const struct cpu_branch_cost generic_branch_cost =
{
  1,  /* Predictable.  */
  3   /* Unpredictable.  */
};
/* Generic approximation modes.  */
static const cpu_approx_modes generic_approx_modes =
{
  AARCH64_APPROX_NONE,	/* division  */
  AARCH64_APPROX_NONE,	/* sqrt  */
  AARCH64_APPROX_NONE	/* recip_sqrt  */
};

/* Approximation modes for Exynos M1.  */
static const cpu_approx_modes exynosm1_approx_modes =
{
  AARCH64_APPROX_NONE,	/* division  */
  AARCH64_APPROX_ALL,	/* sqrt  */
  AARCH64_APPROX_ALL	/* recip_sqrt  */
};

/* Approximation modes for X-Gene 1.  */
static const cpu_approx_modes xgene1_approx_modes =
{
  AARCH64_APPROX_NONE,	/* division  */
  AARCH64_APPROX_NONE,	/* sqrt  */
  AARCH64_APPROX_ALL	/* recip_sqrt  */
};
/* Generic prefetch settings (which disable prefetch).  */
static const cpu_prefetch_tune generic_prefetch_tune =
{
  -1,			/* l1_cache_size  */
  -1,			/* l1_cache_line_size  */
  -1,			/* l2_cache_size  */
  -1			/* default_opt_level  */
};

static const cpu_prefetch_tune exynosm1_prefetch_tune =
{
  -1,			/* l1_cache_size  */
  64,			/* l1_cache_line_size  */
  -1,			/* l2_cache_size  */
  -1			/* default_opt_level  */
};

static const cpu_prefetch_tune qdf24xx_prefetch_tune =
{
  32,			/* l1_cache_size  */
  64,			/* l1_cache_line_size  */
  1024,			/* l2_cache_size  */
  -1			/* default_opt_level  */
};

static const cpu_prefetch_tune thunderxt88_prefetch_tune =
{
  32,			/* l1_cache_size  */
  128,			/* l1_cache_line_size  */
  16*1024,		/* l2_cache_size  */
  3			/* default_opt_level  */
};

static const cpu_prefetch_tune thunderx_prefetch_tune =
{
  32,			/* l1_cache_size  */
  128,			/* l1_cache_line_size  */
  -1,			/* l2_cache_size  */
  -1			/* default_opt_level  */
};

static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
{
  32,			/* l1_cache_size  */
  64,			/* l1_cache_line_size  */
  256,			/* l2_cache_size  */
  -1			/* default_opt_level  */
};
static const struct tune_params generic_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
  8,	/* function_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa35_tunings =
{
  &cortexa53_extra_costs,
  &generic_addrcost_table,
  &cortexa53_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
  16,	/* function_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa53_tunings =
{
  &cortexa53_extra_costs,
  &generic_addrcost_table,
  &cortexa53_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
  16,	/* function_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};
static const struct tune_params cortexa57_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  16,	/* function_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa72_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  16,	/* function_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa73_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost.  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
  16,	/* function_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};
static const struct tune_params exynosm1_tunings =
{
  &exynosm1_extra_costs,
  &exynosm1_addrcost_table,
  &exynosm1_regmove_cost,
  &exynosm1_vector_cost,
  &generic_branch_cost,
  &exynosm1_approx_modes,
  (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
  4,	/* function_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  48,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &exynosm1_prefetch_tune
};

static const struct tune_params thunderxt88_tunings =
{
  &thunderx_extra_costs,
  &generic_addrcost_table,
  &thunderx_regmove_cost,
  &thunderx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
  8,	/* function_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW),	/* tune_flags.  */
  &thunderxt88_prefetch_tune
};

static const struct tune_params thunderx_tunings =
{
  &thunderx_extra_costs,
  &generic_addrcost_table,
  &thunderx_regmove_cost,
  &thunderx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
  8,	/* function_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
   | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
  &thunderx_prefetch_tune
};
static const struct tune_params xgene1_tunings =
{
  &xgene1_extra_costs,
  &xgene1_addrcost_table,
  &xgene1_regmove_cost,
  &xgene1_vector_cost,
  &generic_branch_cost,
  &xgene1_approx_modes,
  AARCH64_FUSE_NOTHING, /* fusible_ops  */
  16,	/* function_align.  */
  16,	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params qdf24xx_tunings =
{
  &qdf24xx_extra_costs,
  &generic_addrcost_table,
  &qdf24xx_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  16,	/* function_align.  */
  16,	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &qdf24xx_prefetch_tune
};

/* Tuning structure for the Qualcomm Saphira core.  Default to falkor values
   for now.  */
static const struct tune_params saphira_tunings =
{
  &generic_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  16,	/* function_align.  */
  16,	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};
static const struct tune_params thunderx2t99_tunings =
{
  &thunderx2t99_extra_costs,
  &thunderx2t99_addrcost_table,
  &thunderx2t99_regmove_cost,
  &thunderx2t99_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost.  */
  (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
   | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
  16,	/* function_align.  */
  16,	/* loop_align.  */
  3,	/* int_reassoc_width.  */
  2,	/* fp_reassoc_width.  */
  2,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &thunderx2t99_prefetch_tune
};
/* Support for fine-grained override of the tuning structures.  */
struct aarch64_tuning_override_function
{
  const char* name;
  void (*parse_override)(const char*, struct tune_params*);
};

static void aarch64_parse_fuse_string (const char*, struct tune_params*);
static void aarch64_parse_tune_string (const char*, struct tune_params*);

static const struct aarch64_tuning_override_function
aarch64_tuning_override_functions[] =
{
  { "fuse", aarch64_parse_fuse_string },
  { "tune", aarch64_parse_tune_string },
  { NULL, NULL }
};
/* A processor implementing AArch64.  */
struct processor
{
  const char *const name;
  enum aarch64_processor ident;
  enum aarch64_processor sched_core;
  enum aarch64_arch arch;
  unsigned architecture_version;
  const unsigned long flags;
  const struct tune_params *const tune;
};
/* Architectures implementing AArch64.  */
static const struct processor all_architectures[] =
{
#define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
  {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
#include "aarch64-arches.def"
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
};

/* Processor cores implementing AArch64.  */
static const struct processor all_cores[] =
{
#define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
  {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH,				\
   all_architectures[AARCH64_ARCH_##ARCH].architecture_version,	\
   FLAGS, &COSTS##_tunings},
#include "aarch64-cores.def"
  {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
   AARCH64_FL_FOR_ARCH8, &generic_tunings},
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
};
/* Target specification.  These are populated by the -march, -mtune, -mcpu
   handling code or by target attributes.  */
static const struct processor *selected_arch;
static const struct processor *selected_cpu;
static const struct processor *selected_tune;

/* The current tuning set.  */
struct tune_params aarch64_tune_params = generic_tunings;

#define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)

/* An ISA extension in the co-processor and main instruction set space.  */
struct aarch64_option_extension
{
  const char *const name;
  const unsigned long flags_on;
  const unsigned long flags_off;
};
typedef enum aarch64_cond_code
{
  AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
  AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
  AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
}
aarch64_cc;

#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))

/* The condition codes of the processor, and the inverse function.  */
static const char * const aarch64_condition_codes[] =
{
  "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
  "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
};
/* Generate code to enable conditional branches in functions over 1 MiB.  */
const char *
aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
			const char * branch_format)
{
  rtx_code_label * tmp_label = gen_label_rtx ();
  char label_buf[256];
  char buffer[128];
  ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
			       CODE_LABEL_NUMBER (tmp_label));
  const char *label_ptr = targetm.strip_name_encoding (label_buf);
  rtx dest_label = operands[pos_label];
  operands[pos_label] = tmp_label;

  snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
  output_asm_insn (buffer, operands);

  snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
  operands[pos_label] = dest_label;
  output_asm_insn (buffer, operands);

  return "";
}
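
/* As a sketch of the output: when the real target is too far away for the
   conditional branch's own range, the caller passes a short-range branch in
   BRANCH_FORMAT that skips over an unconditional "b", which can reach the
   distant label:

	<short-range branch>	.Ltmp
	b	<far label>
   .Ltmp:
   */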
void
aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
{
  const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
  if (TARGET_GENERAL_REGS_ONLY)
    error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
  else
    error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
}
/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
   The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
   the same cost, even if ALL_REGS has a much larger cost.  ALL_REGS is also
   used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
   cost (in this case the best class is the lowest cost one).  Using ALL_REGS
   irrespectively of its cost results in bad allocations with many redundant
   int<->FP moves which are expensive on various cores.
   To avoid this we don't allow ALL_REGS as the allocno class, but force a
   decision between FP_REGS and GENERAL_REGS.  We use the allocno class if it
   isn't ALL_REGS.  Similarly, use the best class if it isn't ALL_REGS.
   Otherwise set the allocno class depending on the mode.
   The result of this is that it is no longer inefficient to have a higher
   memory move cost than the register move cost.  */

static reg_class_t
aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
					 reg_class_t best_class)
{
  machine_mode mode;

  if (allocno_class != ALL_REGS)
    return allocno_class;

  if (best_class != ALL_REGS)
    return best_class;

  mode = PSEUDO_REGNO_MODE (regno);
  return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
}
static unsigned int
aarch64_min_divisions_for_recip_mul (machine_mode mode)
{
  if (GET_MODE_UNIT_SIZE (mode) == 4)
    return aarch64_tune_params.min_div_recip_mul_sf;
  return aarch64_tune_params.min_div_recip_mul_df;
}

static int
aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
			     machine_mode mode)
{
  if (VECTOR_MODE_P (mode))
    return aarch64_tune_params.vec_reassoc_width;
  if (INTEGRAL_MODE_P (mode))
    return aarch64_tune_params.int_reassoc_width;
  if (FLOAT_MODE_P (mode))
    return aarch64_tune_params.fp_reassoc_width;
  return 1;
}
/* Provide a mapping from gcc register numbers to dwarf register numbers.  */
unsigned
aarch64_dbx_register_number (unsigned regno)
{
   if (GP_REGNUM_P (regno))
     return AARCH64_DWARF_R0 + regno - R0_REGNUM;
   else if (regno == SP_REGNUM)
     return AARCH64_DWARF_SP;
   else if (FP_REGNUM_P (regno))
     return AARCH64_DWARF_V0 + regno - V0_REGNUM;
   else if (PR_REGNUM_P (regno))
     return AARCH64_DWARF_P0 + regno - P0_REGNUM;
   else if (regno == VG_REGNUM)
     return AARCH64_DWARF_VG;

   /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
      equivalent DWARF register.  */
   return DWARF_FRAME_REGISTERS;
}
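
/* For example, under this mapping the general register x5 is DWARF register
   5 (AARCH64_DWARF_R0 is 0) and the vector register v3 is DWARF register 67
   (AARCH64_DWARF_V0 is 64), matching the AArch64 DWARF register numbering.  */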
/* Return true if MODE is any of the Advanced SIMD structure modes.  */
static bool
aarch64_advsimd_struct_mode_p (machine_mode mode)
{
  return (TARGET_SIMD
	  && (mode == OImode || mode == CImode || mode == XImode));
}

/* Return true if MODE is an SVE predicate mode.  */
static bool
aarch64_sve_pred_mode_p (machine_mode mode)
{
  return (TARGET_SVE
	  && (mode == VNx16BImode
	      || mode == VNx8BImode
	      || mode == VNx4BImode
	      || mode == VNx2BImode));
}
/* Three mutually-exclusive flags describing a vector or predicate type.  */
const unsigned int VEC_ADVSIMD  = 1;
const unsigned int VEC_SVE_DATA = 2;
const unsigned int VEC_SVE_PRED = 4;
/* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
   a structure of 2, 3 or 4 vectors.  */
const unsigned int VEC_STRUCT   = 8;
/* Useful combinations of the above.  */
const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
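
/* For example, a 128-bit Advanced SIMD vector mode such as V4SImode is
   described by VEC_ADVSIMD, an Advanced SIMD structure mode such as OImode
   by VEC_ADVSIMD | VEC_STRUCT, and an SVE predicate mode such as VNx16BImode
   by VEC_SVE_PRED (see aarch64_classify_vector_mode below).  */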
/* Return a set of flags describing the vector properties of mode MODE.
   Ignore modes that are not supported by the current target.  */
static unsigned int
aarch64_classify_vector_mode (machine_mode mode)
{
  if (aarch64_advsimd_struct_mode_p (mode))
    return VEC_ADVSIMD | VEC_STRUCT;

  if (aarch64_sve_pred_mode_p (mode))
    return VEC_SVE_PRED;

  scalar_mode inner = GET_MODE_INNER (mode);
  if (VECTOR_MODE_P (mode)
      && (inner == QImode
	  || inner == HImode
	  || inner == HFmode
	  || inner == SImode
	  || inner == SFmode
	  || inner == DImode
	  || inner == DFmode))
    {
      if (TARGET_SVE
	  && known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
	return VEC_SVE_DATA;

      /* This includes V1DF but not V1DI (which doesn't exist).  */
      if (TARGET_SIMD
	  && (known_eq (GET_MODE_BITSIZE (mode), 64)
	      || known_eq (GET_MODE_BITSIZE (mode), 128)))
	return VEC_ADVSIMD;
    }

  return 0;
}
/* Return true if MODE is any of the data vector modes, including
   structure modes.  */
static bool
aarch64_vector_data_mode_p (machine_mode mode)
{
  return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
}

/* Return true if MODE is an SVE data vector mode; either a single vector
   or a structure of vectors.  */
static bool
aarch64_sve_data_mode_p (machine_mode mode)
{
  return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
}
/* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
static bool
aarch64_array_mode_supported_p (machine_mode mode,
				unsigned HOST_WIDE_INT nelems)
{
  if (TARGET_SIMD
      && (AARCH64_VALID_SIMD_QREG_MODE (mode)
	  || AARCH64_VALID_SIMD_DREG_MODE (mode))
      && (nelems >= 2 && nelems <= 4))
    return true;

  return false;
}
/* Return the SVE predicate mode to use for elements that have
   ELEM_NBYTES bytes, if such a mode exists.  */
opt_machine_mode
aarch64_sve_pred_mode (unsigned int elem_nbytes)
{
  if (TARGET_SVE)
    {
      if (elem_nbytes == 1)
	return VNx16BImode;
      if (elem_nbytes == 2)
	return VNx8BImode;
      if (elem_nbytes == 4)
	return VNx4BImode;
      if (elem_nbytes == 8)
	return VNx2BImode;
    }
  return opt_machine_mode ();
}
/* Implement TARGET_VECTORIZE_GET_MASK_MODE.  */

static opt_machine_mode
aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
{
  if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
    {
      unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
      machine_mode pred_mode;
      if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
	return pred_mode;
    }

  return default_get_mask_mode (nunits, nbytes);
}
/* Implement TARGET_HARD_REGNO_NREGS.  */

static unsigned int
aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
{
  /* ??? Logically we should only need to provide a value when
     HARD_REGNO_MODE_OK says that the combination is valid,
     but at the moment we need to handle all modes.  Just ignore
     any runtime parts for registers that can't store them.  */
  HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
  switch (aarch64_regno_regclass (regno))
    {
    case FP_REGS:
    case FP_LO_REGS:
      if (aarch64_sve_data_mode_p (mode))
	return exact_div (GET_MODE_SIZE (mode),
			  BYTES_PER_SVE_VECTOR).to_constant ();
      return CEIL (lowest_size, UNITS_PER_VREG);
    default:
      return CEIL (lowest_size, UNITS_PER_WORD);
    }
  gcc_unreachable ();
}
/* Implement TARGET_HARD_REGNO_MODE_OK.  */

static bool
aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
{
  if (GET_MODE_CLASS (mode) == MODE_CC)
    return regno == CC_REGNUM;

  if (regno == VG_REGNUM)
    /* This must have the same size as _Unwind_Word.  */
    return mode == DImode;

  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if (vec_flags & VEC_SVE_PRED)
    return PR_REGNUM_P (regno);

  if (PR_REGNUM_P (regno))
    return 0;

  if (regno == SP_REGNUM)
    /* The purpose of comparing with ptr_mode is to support the
       global register variable associated with the stack pointer
       register via the syntax of asm ("wsp") in ILP32.  */
    return mode == Pmode || mode == ptr_mode;

  if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
    return mode == Pmode;

  if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
    return true;

  if (FP_REGNUM_P (regno))
    {
      if (vec_flags & VEC_STRUCT)
	return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
      else
	return !VECTOR_MODE_P (mode) || vec_flags != 0;
    }

  return false;
}
/* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
   the lower 64 bits of a 128-bit register.  Tell the compiler the callee
   clobbers the top 64 bits when restoring the bottom 64 bits.  */

static bool
aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
{
  return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
}
/* Implement REGMODE_NATURAL_SIZE.  */
poly_uint64
aarch64_regmode_natural_size (machine_mode mode)
{
  /* The natural size for SVE data modes is one SVE data vector,
     and similarly for predicates.  We can't independently modify
     anything smaller than that.  */
  /* ??? For now, only do this for variable-width SVE registers.
     Doing it for constant-sized registers breaks lower-subreg.c.  */
  /* ??? And once that's fixed, we should probably have similar
     code for Advanced SIMD.  */
  if (!aarch64_sve_vg.is_constant ())
    {
      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
      if (vec_flags & VEC_SVE_PRED)
	return BYTES_PER_SVE_PRED;
      if (vec_flags & VEC_SVE_DATA)
	return BYTES_PER_SVE_VECTOR;
    }
  return UNITS_PER_WORD;
}
/* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
machine_mode
aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
				     machine_mode mode)
{
  /* The predicate mode determines which bits are significant and
     which are "don't care".  Decreasing the number of lanes would
     lose data while increasing the number of lanes would make bits
     unnecessarily significant.  */
  if (PR_REGNUM_P (regno))
    return mode;
  if (known_ge (GET_MODE_SIZE (mode), 4))
    return mode;
  else
    return SImode;
}
/* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
   that strcpy from constants will be faster.  */

static HOST_WIDE_INT
aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
{
  if (TREE_CODE (exp) == STRING_CST && !optimize_size)
    return MAX (align, BITS_PER_WORD);
  return align;
}
/* Return true if calls to DECL should be treated as
   long-calls (ie called via a register).  */
static bool
aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
{
  return false;
}

/* Return true if calls to symbol-ref SYM should be treated as
   long-calls (ie called via a register).  */
bool
aarch64_is_long_call_p (rtx sym)
{
  return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
}

/* Return true if calls to symbol-ref SYM should not go through
   plt stubs.  */

bool
aarch64_is_noplt_call_p (rtx sym)
{
  const_tree decl = SYMBOL_REF_DECL (sym);

  if (flag_pic
      && decl
      && (!flag_plt
	  || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
      && !targetm.binds_local_p (decl))
    return true;

  return false;
}
/* Return true if the offsets to a zero/sign-extract operation
   represent an expression that matches an extend operation.  The
   operands represent the parameters from

     (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
bool
aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
				rtx extract_imm)
{
  HOST_WIDE_INT mult_val, extract_val;

  if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
    return false;

  mult_val = INTVAL (mult_imm);
  extract_val = INTVAL (extract_imm);

  if (extract_val > 8
      && extract_val < GET_MODE_BITSIZE (mode)
      && exact_log2 (extract_val & ~7) > 0
      && (extract_val & 7) <= 4
      && mult_val == (1 << (extract_val & 7)))
    return true;

  return false;
}
/* Emit an insn that's a simple single-set.  Both the operands must be
   known to be valid.  */
inline static rtx_insn *
emit_set_insn (rtx x, rtx y)
{
  return emit_insn (gen_rtx_SET (x, y));
}
/* X and Y are two things to compare using CODE.  Emit the compare insn and
   return the rtx for register 0 in the proper mode.  */
rtx
aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
{
  machine_mode mode = SELECT_CC_MODE (code, x, y);
  rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);

  emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
  return cc_reg;
}
/* Build the SYMBOL_REF for __tls_get_addr.  */

static GTY(()) rtx tls_get_addr_libfunc;

rtx
aarch64_tls_get_addr (void)
{
  if (!tls_get_addr_libfunc)
    tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
  return tls_get_addr_libfunc;
}
/* Return the TLS model to use for ADDR.  */

static enum tls_model
tls_symbolic_operand_type (rtx addr)
{
  enum tls_model tls_kind = TLS_MODEL_NONE;
  if (GET_CODE (addr) == CONST)
    {
      poly_int64 addend;
      rtx sym = strip_offset (addr, &addend);
      if (GET_CODE (sym) == SYMBOL_REF)
	tls_kind = SYMBOL_REF_TLS_MODEL (sym);
    }
  else if (GET_CODE (addr) == SYMBOL_REF)
    tls_kind = SYMBOL_REF_TLS_MODEL (addr);

  return tls_kind;
}
/* We'll allow lo_sum's in addresses in our legitimate addresses
   so that combine would take care of combining addresses where
   necessary, but for generation purposes, we'll generate the address
   as:

   RTL                                Absolute
   tmp = hi (symbol_ref);             adrp  x1, foo
   dest = lo_sum (tmp, symbol_ref);   add   dest, x1, :lo_12:foo

   PIC                                TLS
   adrp x1, :got:foo                  adrp  tmp, :tlsgd:foo
   ldr  x1, [:got_lo12:foo]           add   dest, tmp, :tlsgd_lo12:foo

   Load TLS symbol, depending on TLS mechanism and TLS access model.

   Global Dynamic - Traditional TLS:
   adrp tmp, :tlsgd:imm
   add  dest, tmp, #:tlsgd_lo12:imm
   bl   __tls_get_addr

   Global Dynamic - TLS Descriptors:
   adrp dest, :tlsdesc:imm
   ldr  tmp, [dest, #:tlsdesc_lo12:imm]
   add  dest, dest, #:tlsdesc_lo12:imm

   Initial Exec:
   adrp tmp, :gottprel:imm
   ldr  dest, [tmp, #:gottprel_lo12:imm]

   Local Exec:
   add  t0, tp, #:tprel_hi12:imm, lsl #12
   add  t0, t0, #:tprel_lo12_nc:imm  */
static void
aarch64_load_symref_appropriately (rtx dest, rtx imm,
				   enum aarch64_symbol_type type)
{
  switch (type)
    {
    case SYMBOL_SMALL_ABSOLUTE:
      {
	/* In ILP32, the mode of dest can be either SImode or DImode.  */
	rtx tmp_reg = dest;
	machine_mode mode = GET_MODE (dest);

	gcc_assert (mode == Pmode || mode == ptr_mode);

	if (can_create_pseudo_p ())
	  tmp_reg = gen_reg_rtx (mode);

	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
	emit_insn (gen_add_losym (dest, tmp_reg, imm));
	return;
      }

    case SYMBOL_TINY_ABSOLUTE:
      emit_insn (gen_rtx_SET (dest, imm));
      return;

    case SYMBOL_SMALL_GOT_28K:
      {
	machine_mode mode = GET_MODE (dest);
	rtx gp_rtx = pic_offset_table_rtx;
	rtx insn;
	rtx mem;

	/* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
	   here before rtl expand.  Tree IVOPT will generate rtl pattern to
	   decide rtx costs, in which case pic_offset_table_rtx is not
	   initialized.  For that case no need to generate the first adrp
	   instruction as the final cost for global variable access is
	   one instruction.  */
	if (gp_rtx != NULL)
	  {
	    /* -fpic for -mcmodel=small allows a 32K GOT table size (but we
	       are using the page base as GOT base, the first page may be
	       wasted, in the worst scenario, there is only 28K space for
	       the GOT).

	       The generated instruction sequence for accessing a global
	       variable is:

		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]

	       Only one instruction is needed.  But we must initialize
	       pic_offset_table_rtx properly.  We generate an initialize insn
	       for every global access, and allow CSE to remove the redundant
	       ones.

	       The final instruction sequence will look like the following
	       for multiple global variable accesses:

		 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_

		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
		 ...  */

	    rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
	    crtl->uses_pic_offset_table = 1;
	    emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));

	    if (mode != GET_MODE (gp_rtx))
	      gp_rtx = gen_lowpart (mode, gp_rtx);
	  }

	if (mode == ptr_mode)
	  {
	    if (mode == DImode)
	      insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
	    else
	      insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);

	    mem = XVECEXP (SET_SRC (insn), 0, 0);
	  }
	else
	  {
	    gcc_assert (mode == Pmode);

	    insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
	  }

	/* The operand is expected to be MEM.  Whenever the related insn
	   pattern changes, the above code which calculates mem should be
	   updated.  */
	gcc_assert (GET_CODE (mem) == MEM);
	MEM_READONLY_P (mem) = 1;
	MEM_NOTRAP_P (mem) = 1;
	emit_insn (insn);
	return;
      }

    case SYMBOL_SMALL_GOT_4G:
      {
	/* In ILP32, the mode of dest can be either SImode or DImode,
	   while the got entry is always of SImode size.  The mode of
	   dest depends on how dest is used: if dest is assigned to a
	   pointer (e.g. in the memory), it has SImode; it may have
	   DImode if dest is dereferenced to access the memory.
	   This is why we have to handle three different ldr_got_small
	   patterns here (two patterns for ILP32).  */

	rtx insn;
	rtx mem;
	rtx tmp_reg = dest;
	machine_mode mode = GET_MODE (dest);

	if (can_create_pseudo_p ())
	  tmp_reg = gen_reg_rtx (mode);

	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
	if (mode == ptr_mode)
	  {
	    if (mode == DImode)
	      insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
	    else
	      insn = gen_ldr_got_small_si (dest, tmp_reg, imm);

	    mem = XVECEXP (SET_SRC (insn), 0, 0);
	  }
	else
	  {
	    gcc_assert (mode == Pmode);

	    insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
	  }

	gcc_assert (GET_CODE (mem) == MEM);
	MEM_READONLY_P (mem) = 1;
	MEM_NOTRAP_P (mem) = 1;
	emit_insn (insn);
	return;
      }

    case SYMBOL_SMALL_TLSGD:
      {
	rtx_insn *insns;
	machine_mode mode = GET_MODE (dest);
	rtx result = gen_rtx_REG (mode, R0_REGNUM);

	start_sequence ();
	if (TARGET_ILP32)
	  aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
	else
	  aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
	insns = get_insns ();
	end_sequence ();

	RTL_CONST_CALL_P (insns) = 1;
	emit_libcall_block (insns, dest, result, imm);
	return;
      }

    case SYMBOL_SMALL_TLSDESC:
      {
	machine_mode mode = GET_MODE (dest);
	rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
	rtx tp;

	gcc_assert (mode == Pmode || mode == ptr_mode);

	/* In ILP32, the got entry is always of SImode size.  Unlike
	   small GOT, the dest is fixed at reg 0.  */
	if (TARGET_ILP32)
	  emit_insn (gen_tlsdesc_small_si (imm));
	else
	  emit_insn (gen_tlsdesc_small_di (imm));
	tp = aarch64_load_tp (NULL);

	if (mode != Pmode)
	  tp = gen_lowpart (mode, tp);

	emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
	if (REG_P (dest))
	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
	return;
      }

    case SYMBOL_SMALL_TLSIE:
      {
	/* In ILP32, the mode of dest can be either SImode or DImode,
	   while the got entry is always of SImode size.  The mode of
	   dest depends on how dest is used: if dest is assigned to a
	   pointer (e.g. in the memory), it has SImode; it may have
	   DImode if dest is dereferenced to access the memory.
	   This is why we have to handle three different tlsie_small
	   patterns here (two patterns for ILP32).  */
	machine_mode mode = GET_MODE (dest);
	rtx tmp_reg = gen_reg_rtx (mode);
	rtx tp = aarch64_load_tp (NULL);

	if (mode == ptr_mode)
	  {
	    if (mode == DImode)
	      emit_insn (gen_tlsie_small_di (tmp_reg, imm));
	    else
	      {
		emit_insn (gen_tlsie_small_si (tmp_reg, imm));
		tp = gen_lowpart (mode, tp);
	      }
	  }
	else
	  {
	    gcc_assert (mode == Pmode);
	    emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
	  }

	emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
	if (REG_P (dest))
	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
	return;
      }

    case SYMBOL_TLSLE12:
    case SYMBOL_TLSLE24:
    case SYMBOL_TLSLE32:
    case SYMBOL_TLSLE48:
      {
	machine_mode mode = GET_MODE (dest);
	rtx tp = aarch64_load_tp (NULL);

	if (mode != Pmode)
	  tp = gen_lowpart (mode, tp);

	switch (type)
	  {
	  case SYMBOL_TLSLE12:
	    emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
		       (dest, tp, imm));
	    break;
	  case SYMBOL_TLSLE24:
	    emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
		       (dest, tp, imm));
	    break;
	  case SYMBOL_TLSLE32:
	    emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
		       (dest, imm));
	    emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
		       (dest, dest, tp));
	    break;
	  case SYMBOL_TLSLE48:
	    emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
		       (dest, imm));
	    emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
		       (dest, dest, tp));
	    break;
	  default:
	    gcc_unreachable ();
	  }

	if (REG_P (dest))
	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
	return;
      }

    case SYMBOL_TINY_GOT:
      emit_insn (gen_ldr_got_tiny (dest, imm));
      return;

    case SYMBOL_TINY_TLSIE:
      {
	machine_mode mode = GET_MODE (dest);
	rtx tp = aarch64_load_tp (NULL);

	if (mode == ptr_mode)
	  {
	    if (mode == DImode)
	      emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
	    else
	      {
		tp = gen_lowpart (mode, tp);
		emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
	      }
	  }
	else
	  {
	    gcc_assert (mode == Pmode);
	    emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
	  }

	if (REG_P (dest))
	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
	return;
      }

    default:
      gcc_unreachable ();
    }
}
/* Emit a move from SRC to DEST.  Assume that the move expanders can
   handle all moves if !can_create_pseudo_p ().  The distinction is
   important because, unlike emit_move_insn, the move expanders know
   how to force Pmode objects into the constant pool even when the
   constant pool address is not itself legitimate.  */
static rtx_insn *
aarch64_emit_move (rtx dest, rtx src)
{
  return (can_create_pseudo_p ()
	  ? emit_move_insn (dest, src)
	  : emit_move_insn_1 (dest, src));
}
/* Split a 128-bit move operation into two 64-bit move operations,
   taking care to handle partial overlap of register to register
   copies.  Special cases are needed when moving between GP regs and
   FP regs.  SRC can be a register, constant or memory; DST a register
   or memory.  If either operand is memory it must not have any side
   effects.  */
void
aarch64_split_128bit_move (rtx dst, rtx src)
{
  rtx dst_lo, dst_hi;
  rtx src_lo, src_hi;

  machine_mode mode = GET_MODE (dst);

  gcc_assert (mode == TImode || mode == TFmode);
  gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
  gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);

  if (REG_P (dst) && REG_P (src))
    {
      int src_regno = REGNO (src);
      int dst_regno = REGNO (dst);

      /* Handle FP <-> GP regs.  */
      if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
	{
	  src_lo = gen_lowpart (word_mode, src);
	  src_hi = gen_highpart (word_mode, src);

	  if (mode == TImode)
	    {
	      emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
	      emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
	    }
	  else
	    {
	      emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
	      emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
	    }
	  return;
	}
      else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
	{
	  dst_lo = gen_lowpart (word_mode, dst);
	  dst_hi = gen_highpart (word_mode, dst);

	  if (mode == TImode)
	    {
	      emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
	      emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
	    }
	  else
	    {
	      emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
	      emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
	    }
	  return;
	}
    }

  dst_lo = gen_lowpart (word_mode, dst);
  dst_hi = gen_highpart (word_mode, dst);
  src_lo = gen_lowpart (word_mode, src);
  src_hi = gen_highpart_mode (word_mode, mode, src);

  /* At most one pairing may overlap.  */
  if (reg_overlap_mentioned_p (dst_lo, src_hi))
    {
      aarch64_emit_move (dst_hi, src_hi);
      aarch64_emit_move (dst_lo, src_lo);
    }
  else
    {
      aarch64_emit_move (dst_lo, src_lo);
      aarch64_emit_move (dst_hi, src_hi);
    }
}

bool
aarch64_split_128bit_move_p (rtx dst, rtx src)
{
  return (! REG_P (src)
	  || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
}
/* Split a complex SIMD combine.  */

void
aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
{
  machine_mode src_mode = GET_MODE (src1);
  machine_mode dst_mode = GET_MODE (dst);

  gcc_assert (VECTOR_MODE_P (dst_mode));
  gcc_assert (register_operand (dst, dst_mode)
	      && register_operand (src1, src_mode)
	      && register_operand (src2, src_mode));

  rtx (*gen) (rtx, rtx, rtx);

  switch (src_mode)
    {
    case E_V8QImode:
      gen = gen_aarch64_simd_combinev8qi;
      break;
    case E_V4HImode:
      gen = gen_aarch64_simd_combinev4hi;
      break;
    case E_V2SImode:
      gen = gen_aarch64_simd_combinev2si;
      break;
    case E_V4HFmode:
      gen = gen_aarch64_simd_combinev4hf;
      break;
    case E_V2SFmode:
      gen = gen_aarch64_simd_combinev2sf;
      break;
    case E_DImode:
      gen = gen_aarch64_simd_combinedi;
      break;
    case E_DFmode:
      gen = gen_aarch64_simd_combinedf;
      break;
    default:
      gcc_unreachable ();
    }

  emit_insn (gen (dst, src1, src2));
}
/* Split a complex SIMD move.  */

void
aarch64_split_simd_move (rtx dst, rtx src)
{
  machine_mode src_mode = GET_MODE (src);
  machine_mode dst_mode = GET_MODE (dst);

  gcc_assert (VECTOR_MODE_P (dst_mode));

  if (REG_P (dst) && REG_P (src))
    {
      rtx (*gen) (rtx, rtx);

      gcc_assert (VECTOR_MODE_P (src_mode));

      switch (src_mode)
	{
	case E_V16QImode:
	  gen = gen_aarch64_split_simd_movv16qi;
	  break;
	case E_V8HImode:
	  gen = gen_aarch64_split_simd_movv8hi;
	  break;
	case E_V4SImode:
	  gen = gen_aarch64_split_simd_movv4si;
	  break;
	case E_V2DImode:
	  gen = gen_aarch64_split_simd_movv2di;
	  break;
	case E_V8HFmode:
	  gen = gen_aarch64_split_simd_movv8hf;
	  break;
	case E_V4SFmode:
	  gen = gen_aarch64_split_simd_movv4sf;
	  break;
	case E_V2DFmode:
	  gen = gen_aarch64_split_simd_movv2df;
	  break;
	default:
	  gcc_unreachable ();
	}

      emit_insn (gen (dst, src));
    }
}
bool
aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
			      machine_mode ymode, rtx y)
{
  rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
  gcc_assert (r != NULL);
  return rtx_equal_p (x, r);
}
static rtx
aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
{
  if (can_create_pseudo_p ())
    return force_reg (mode, value);
  else
    {
      gcc_assert (x);
      aarch64_emit_move (x, value);
      return x;
    }
}
/* Return true if we can move VALUE into a register using a single
   CNT[BHWD] instruction.  */

static bool
aarch64_sve_cnt_immediate_p (poly_int64 value)
{
  HOST_WIDE_INT factor = value.coeffs[0];
  /* The coefficient must be [1, 16] * {2, 4, 8, 16}.  */
  return (value.coeffs[1] == factor
	  && IN_RANGE (factor, 2, 16 * 16)
	  && (factor & 1) == 0
	  && factor <= 16 * (factor & -factor));
}

/* Likewise for rtx X.  */

bool
aarch64_sve_cnt_immediate_p (rtx x)
{
  poly_int64 value;
  return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
}
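
/* For example, with 256-bit SVE vectors "cntw x0" moves 8 (the number of
   32-bit elements) into x0, and "cntb x0, all, mul #16" moves 16 times the
   byte count; only values of the form (2, 4, 8 or 16 elements per quadword)
   times a multiplier in [1, 16] are reachable with a single CNT[BHWD].  */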
/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  FACTOR is the number of quadwords.
   NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
   If it is zero, we can use any element size.  */

static char *
aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
				  unsigned int factor,
				  unsigned int nelts_per_vq)
{
  static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];

  if (nelts_per_vq == 0)
    /* There is some overlap in the ranges of the four CNT instructions.
       Here we always use the smallest possible element size, so that the
       multiplier is 1 wherever possible.  */
    nelts_per_vq = factor & -factor;
  int shift = std::min (exact_log2 (nelts_per_vq), 4);
  gcc_assert (IN_RANGE (shift, 1, 4));
  char suffix = "dwhb"[shift - 1];

  factor >>= shift;
  unsigned int written;
  if (factor == 1 && nelts_per_vq == 1)
    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
			prefix, suffix, operands);
  else
    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
			prefix, suffix, operands, factor);
  gcc_assert (written < sizeof (buffer));
  return buffer;
}
/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  X is the value of the vector size operand,
   as a polynomial integer rtx.  */

char *
aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
				  rtx x)
{
  poly_int64 value = rtx_to_poly_int64 (x);
  gcc_assert (aarch64_sve_cnt_immediate_p (value));
  return aarch64_output_sve_cnt_immediate (prefix, operands,
					   value.coeffs[1], 0);
}
/* Return true if we can add VALUE to a register using a single ADDVL
   or ADDPL instruction.  */

static bool
aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
{
  HOST_WIDE_INT factor = value.coeffs[0];
  if (factor == 0 || value.coeffs[1] != factor)
    return false;
  /* FACTOR counts VG / 2, so a value of 2 is one predicate width
     and a value of 16 is one vector width.  */
  return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
	  || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
}
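
/* For example, "addvl sp, sp, #-2" subtracts two vector lengths from the
   stack pointer and "addpl x0, x0, #1" adds one predicate length (an eighth
   of the vector length); both instructions take an immediate in the range
   [-32, 31], which is what the two IN_RANGE checks above encode.  */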
/* Likewise for rtx X.  */

bool
aarch64_sve_addvl_addpl_immediate_p (rtx x)
{
  poly_int64 value;
  return (poly_int_rtx_p (x, &value)
	  && aarch64_sve_addvl_addpl_immediate_p (value));
}
/* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
   and storing the result in operand 0.  */

char *
aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
{
  static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
  poly_int64 offset_value = rtx_to_poly_int64 (offset);
  gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));

  /* Use INC or DEC if possible.  */
  if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
    {
      if (aarch64_sve_cnt_immediate_p (offset_value))
	return aarch64_output_sve_cnt_immediate ("inc", "%x0",
						 offset_value.coeffs[1], 0);
      if (aarch64_sve_cnt_immediate_p (-offset_value))
	return aarch64_output_sve_cnt_immediate ("dec", "%x0",
						 -offset_value.coeffs[1], 0);
    }

  int factor = offset_value.coeffs[1];
  if ((factor & 15) == 0)
    snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
  else
    snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
  return buffer;
}
/* Return true if X is a valid immediate for an SVE vector INC or DEC
   instruction.  If it is, store the number of elements in each vector
   quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
   factor in *FACTOR_OUT (if nonnull).  */

static bool
aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
				 unsigned int *nelts_per_vq_out)
{
  rtx elt;
  poly_int64 value;

  if (!const_vec_duplicate_p (x, &elt)
      || !poly_int_rtx_p (elt, &value))
    return false;

  unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
  if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
    /* There's no vector INCB.  */
    return false;

  HOST_WIDE_INT factor = value.coeffs[0];
  if (value.coeffs[1] != factor)
    return false;

  /* The coefficient must be [1, 16] * NELTS_PER_VQ.  */
  if ((factor % nelts_per_vq) != 0
      || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
    return false;

  if (factor_out)
    *factor_out = factor;
  if (nelts_per_vq_out)
    *nelts_per_vq_out = nelts_per_vq;
  return true;
}
2226 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2230 aarch64_sve_inc_dec_immediate_p (rtx x
)
2232 return aarch64_sve_inc_dec_immediate_p (x
, NULL
, NULL
);
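/* Illustrative example (not part of the original source): a VNx4SI constant
   in which every element is the poly_int64 (4, 4) has NELTS_PER_VQ 4 (four
   32-bit elements per 128-bit quadword) and FACTOR 4, i.e. a multiplier
   of 1, so it can be added with a single "incw" and subtracted with
   "decw".  */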
/* Return the asm template for an SVE vector INC or DEC instruction.
   OPERANDS gives the operands before the vector count and X is the
   value of the vector count operand itself.  */

char *
aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
{
  int factor;
  unsigned int nelts_per_vq;
  if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
    gcc_unreachable ();
  if (factor < 0)
    return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
					     nelts_per_vq);
  else
    return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
					     nelts_per_vq);
}
2255 aarch64_internal_mov_immediate (rtx dest
, rtx imm
, bool generate
,
2256 scalar_int_mode mode
)
2259 unsigned HOST_WIDE_INT val
, val2
, mask
;
2260 int one_match
, zero_match
;
2265 if (aarch64_move_imm (val
, mode
))
2268 emit_insn (gen_rtx_SET (dest
, imm
));
2272 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2273 (with XXXX non-zero). In that case check to see if the move can be done in
2275 val2
= val
& 0xffffffff;
2277 && aarch64_move_imm (val2
, SImode
)
2278 && (((val
>> 32) & 0xffff) == 0 || (val
>> 48) == 0))
2281 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
2283 /* Check if we have to emit a second instruction by checking to see
2284 if any of the upper 32 bits of the original DI mode value is set. */
2288 i
= (val
>> 48) ? 48 : 32;
2291 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
2292 GEN_INT ((val
>> i
) & 0xffff)));
2297 if ((val
>> 32) == 0 || mode
== SImode
)
2301 emit_insn (gen_rtx_SET (dest
, GEN_INT (val
& 0xffff)));
2303 emit_insn (gen_insv_immsi (dest
, GEN_INT (16),
2304 GEN_INT ((val
>> 16) & 0xffff)));
2306 emit_insn (gen_insv_immdi (dest
, GEN_INT (16),
2307 GEN_INT ((val
>> 16) & 0xffff)));
2312 /* Remaining cases are all for DImode. */
2315 zero_match
= ((val
& mask
) == 0) + ((val
& (mask
<< 16)) == 0) +
2316 ((val
& (mask
<< 32)) == 0) + ((val
& (mask
<< 48)) == 0);
2317 one_match
= ((~val
& mask
) == 0) + ((~val
& (mask
<< 16)) == 0) +
2318 ((~val
& (mask
<< 32)) == 0) + ((~val
& (mask
<< 48)) == 0);
2320 if (zero_match
!= 2 && one_match
!= 2)
2322 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2323 For a 64-bit bitmask try whether changing 16 bits to all ones or
2324 zeroes creates a valid bitmask. To check any repeated bitmask,
2325 try using 16 bits from the other 32-bit half of val. */
2327 for (i
= 0; i
< 64; i
+= 16, mask
<<= 16)
2330 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
2333 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
2335 val2
= val2
& ~mask
;
2336 val2
= val2
| (((val2
>> 32) | (val2
<< 32)) & mask
);
2337 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
2344 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
2345 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
2346 GEN_INT ((val
>> i
) & 0xffff)));
2352 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2353 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2354 otherwise skip zero bits. */
2358 val2
= one_match
> zero_match
? ~val
: val
;
2359 i
= (val2
& mask
) != 0 ? 0 : (val2
& (mask
<< 16)) != 0 ? 16 : 32;
2362 emit_insn (gen_rtx_SET (dest
, GEN_INT (one_match
> zero_match
2363 ? (val
| ~(mask
<< i
))
2364 : (val
& (mask
<< i
)))));
2365 for (i
+= 16; i
< 64; i
+= 16)
2367 if ((val2
& (mask
<< i
)) == 0)
2370 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
2371 GEN_INT ((val
>> i
) & 0xffff)));
/* Return the number of temporary registers that aarch64_add_offset_1
   would need to add OFFSET to a register.  */

static unsigned int
aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
{
  return abs_hwi (offset) < 0x1000000 ? 0 : 1;
}
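/* Illustrative examples (not part of the original source): an offset of
   0xfff needs no temporary (a single ADD immediate suffices) and 0x123456
   also needs none because its absolute value is below 2^24 and can be
   handled by two additions, whereas 0x1234567 needs one temporary to hold
   the immediate before it is added.  */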
2387 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2388 a non-polynomial OFFSET. MODE is the mode of the addition.
2389 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2390 be set and CFA adjustments added to the generated instructions.
2392 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2393 temporary if register allocation is already complete. This temporary
2394 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2395 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2396 the immediate again.
2398 Since this function may be used to adjust the stack pointer, we must
2399 ensure that it cannot cause transient stack deallocation (for example
2400 by first incrementing SP and then decrementing when adjusting by a
2401 large immediate). */
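/* Illustrative example (not part of the original source): when SP is
   adjusted by -0x210123, the two-addition path below can first add -0x123
   and then -0x210000.  Both steps move SP in the same direction, so the
   stack is never transiently deallocated, as required by the comment
   above.  */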
2404 aarch64_add_offset_1 (scalar_int_mode mode
, rtx dest
,
2405 rtx src
, HOST_WIDE_INT offset
, rtx temp1
,
2406 bool frame_related_p
, bool emit_move_imm
)
2408 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
2409 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
2411 HOST_WIDE_INT moffset
= abs_hwi (offset
);
2416 if (!rtx_equal_p (dest
, src
))
2418 insn
= emit_insn (gen_rtx_SET (dest
, src
));
2419 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2424 /* Single instruction adjustment. */
2425 if (aarch64_uimm12_shift (moffset
))
2427 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (offset
)));
2428 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2432 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2435 a) the offset cannot be loaded by a 16-bit move or
2436 b) there is no spare register into which we can move it. */
2437 if (moffset
< 0x1000000
2438 && ((!temp1
&& !can_create_pseudo_p ())
2439 || !aarch64_move_imm (moffset
, mode
)))
2441 HOST_WIDE_INT low_off
= moffset
& 0xfff;
2443 low_off
= offset
< 0 ? -low_off
: low_off
;
2444 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (low_off
)));
2445 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2446 insn
= emit_insn (gen_add2_insn (dest
, GEN_INT (offset
- low_off
)));
2447 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2451 /* Emit a move immediate if required and an addition/subtraction. */
2454 gcc_assert (temp1
!= NULL_RTX
|| can_create_pseudo_p ());
2455 temp1
= aarch64_force_temporary (mode
, temp1
, GEN_INT (moffset
));
2457 insn
= emit_insn (offset
< 0
2458 ? gen_sub3_insn (dest
, src
, temp1
)
2459 : gen_add3_insn (dest
, src
, temp1
));
2460 if (frame_related_p
)
2462 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2463 rtx adj
= plus_constant (mode
, src
, offset
);
2464 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, gen_rtx_SET (dest
, adj
));
2468 /* Return the number of temporary registers that aarch64_add_offset
2469 would need to move OFFSET into a register or add OFFSET to a register;
2470 ADD_P is true if we want the latter rather than the former. */
2473 aarch64_offset_temporaries (bool add_p
, poly_int64 offset
)
2475 /* This follows the same structure as aarch64_add_offset. */
2476 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
2479 unsigned int count
= 0;
2480 HOST_WIDE_INT factor
= offset
.coeffs
[1];
2481 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
2482 poly_int64
poly_offset (factor
, factor
);
2483 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
2484 /* Need one register for the ADDVL/ADDPL result. */
2486 else if (factor
!= 0)
2488 factor
= abs (factor
);
2489 if (factor
> 16 * (factor
& -factor
))
2490 /* Need one register for the CNT result and one for the multiplication
2491 factor. If necessary, the second temporary can be reused for the
2492 constant part of the offset. */
2494 /* Need one register for the CNT result (which might then
2498 return count
+ aarch64_add_offset_1_temporaries (constant
);
2501 /* If X can be represented as a poly_int64, return the number
2502 of temporaries that are required to add it to a register.
2503 Return -1 otherwise. */
2506 aarch64_add_offset_temporaries (rtx x
)
2509 if (!poly_int_rtx_p (x
, &offset
))
2511 return aarch64_offset_temporaries (true, offset
);
2514 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2515 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2516 be set and CFA adjustments added to the generated instructions.
2518 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2519 temporary if register allocation is already complete. This temporary
2520 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2521 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2522 false to avoid emitting the immediate again.
2524 TEMP2, if nonnull, is a second temporary register that doesn't
2525 overlap either DEST or REG.
2527 Since this function may be used to adjust the stack pointer, we must
2528 ensure that it cannot cause transient stack deallocation (for example
2529 by first incrementing SP and then decrementing when adjusting by a
2530 large immediate). */
2533 aarch64_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
2534 poly_int64 offset
, rtx temp1
, rtx temp2
,
2535 bool frame_related_p
, bool emit_move_imm
= true)
2537 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
2538 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
2539 gcc_assert (temp1
== NULL_RTX
2541 || !reg_overlap_mentioned_p (temp1
, dest
));
2542 gcc_assert (temp2
== NULL_RTX
|| !reg_overlap_mentioned_p (dest
, temp2
));
2544 /* Try using ADDVL or ADDPL to add the whole value. */
2545 if (src
!= const0_rtx
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
2547 rtx offset_rtx
= gen_int_mode (offset
, mode
);
2548 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
2549 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2553 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2554 SVE vector register, over and above the minimum size of 128 bits.
2555 This is equivalent to half the value returned by CNTD with a
2556 vector shape of ALL. */
2557 HOST_WIDE_INT factor
= offset
.coeffs
[1];
2558 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
2560 /* Try using ADDVL or ADDPL to add the VG-based part. */
2561 poly_int64
poly_offset (factor
, factor
);
2562 if (src
!= const0_rtx
2563 && aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
2565 rtx offset_rtx
= gen_int_mode (poly_offset
, mode
);
2566 if (frame_related_p
)
2568 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
2569 RTX_FRAME_RELATED_P (insn
) = true;
2574 rtx addr
= gen_rtx_PLUS (mode
, src
, offset_rtx
);
2575 src
= aarch64_force_temporary (mode
, temp1
, addr
);
2580 /* Otherwise use a CNT-based sequence. */
2581 else if (factor
!= 0)
2583 /* Use a subtraction if we have a negative factor. */
2584 rtx_code code
= PLUS
;
2591 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2592 into the multiplication. */
2596 /* Use a right shift by 1. */
2600 HOST_WIDE_INT low_bit
= factor
& -factor
;
2601 if (factor
<= 16 * low_bit
)
2603 if (factor
> 16 * 8)
2605 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2606 the value with the minimum multiplier and shift it into
2608 int extra_shift
= exact_log2 (low_bit
);
2609 shift
+= extra_shift
;
2610 factor
>>= extra_shift
;
2612 val
= gen_int_mode (poly_int64 (factor
* 2, factor
* 2), mode
);
2616 /* Use CNTD, then multiply it by FACTOR. */
2617 val
= gen_int_mode (poly_int64 (2, 2), mode
);
2618 val
= aarch64_force_temporary (mode
, temp1
, val
);
2620 /* Go back to using a negative multiplication factor if we have
2621 no register from which to subtract. */
2622 if (code
== MINUS
&& src
== const0_rtx
)
2627 rtx coeff1
= gen_int_mode (factor
, mode
);
2628 coeff1
= aarch64_force_temporary (mode
, temp2
, coeff1
);
2629 val
= gen_rtx_MULT (mode
, val
, coeff1
);
2634 /* Multiply by 1 << SHIFT. */
2635 val
= aarch64_force_temporary (mode
, temp1
, val
);
2636 val
= gen_rtx_ASHIFT (mode
, val
, GEN_INT (shift
));
2638 else if (shift
== -1)
2641 val
= aarch64_force_temporary (mode
, temp1
, val
);
2642 val
= gen_rtx_ASHIFTRT (mode
, val
, const1_rtx
);
2645 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2646 if (src
!= const0_rtx
)
2648 val
= aarch64_force_temporary (mode
, temp1
, val
);
2649 val
= gen_rtx_fmt_ee (code
, mode
, src
, val
);
2651 else if (code
== MINUS
)
2653 val
= aarch64_force_temporary (mode
, temp1
, val
);
2654 val
= gen_rtx_NEG (mode
, val
);
2657 if (constant
== 0 || frame_related_p
)
2659 rtx_insn
*insn
= emit_insn (gen_rtx_SET (dest
, val
));
2660 if (frame_related_p
)
2662 RTX_FRAME_RELATED_P (insn
) = true;
2663 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
2664 gen_rtx_SET (dest
, plus_constant (Pmode
, src
,
2673 src
= aarch64_force_temporary (mode
, temp1
, val
);
2678 emit_move_imm
= true;
2681 aarch64_add_offset_1 (mode
, dest
, src
, constant
, temp1
,
2682 frame_related_p
, emit_move_imm
);
/* Like aarch64_add_offset, but the offset is given as an rtx rather
   than a poly_int64.  */

void
aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
			  rtx offset_rtx, rtx temp1, rtx temp2)
{
  aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
		      temp1, temp2, false);
}
/* Add DELTA to the stack pointer, marking the instructions frame-related.
   TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
   if TEMP1 already contains abs (DELTA).  */

static inline void
aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
{
  aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
		      temp1, temp2, true, emit_move_imm);
}

/* Subtract DELTA from the stack pointer, marking the instructions
   frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
   if nonnull.  */

static inline void
aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
{
  aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
		      temp1, temp2, frame_related_p);
}
/* Set DEST to (vec_series BASE STEP).  */

static void
aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
{
  machine_mode mode = GET_MODE (dest);
  scalar_mode inner = GET_MODE_INNER (mode);

  /* Each operand can be a register or an immediate in the range [-16, 15].  */
  if (!aarch64_sve_index_immediate_p (base))
    base = force_reg (inner, base);
  if (!aarch64_sve_index_immediate_p (step))
    step = force_reg (inner, step);

  emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
}
/* Try to duplicate SRC into SVE register DEST, given that SRC is an
   integer of mode INT_MODE.  Return true on success.  */

static bool
aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
				      rtx src)
{
  /* If the constant is smaller than 128 bits, we can do the move
     using a vector of SRC_MODEs.  */
  if (src_mode != TImode)
    {
      poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
				     GET_MODE_SIZE (src_mode));
      machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
      emit_move_insn (gen_lowpart (dup_mode, dest),
		      gen_const_vec_duplicate (dup_mode, src));
      return true;
    }

  /* The bytes are loaded in little-endian order, so do a byteswap on
     big-endian targets.  */
  if (BYTES_BIG_ENDIAN)
    {
      src = simplify_unary_operation (BSWAP, src_mode, src, src_mode);
      if (!src)
	return false;
    }

  /* Use LD1RQ to load the 128 bits from memory.  */
  src = force_const_mem (src_mode, src);
  if (!src)
    return false;

  /* Make sure that the address is legitimate.  */
  if (!aarch64_sve_ld1r_operand_p (src))
    {
      rtx addr = force_reg (Pmode, XEXP (src, 0));
      src = replace_equiv_address (src, addr);
    }

  rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
  emit_insn (gen_sve_ld1rq (gen_lowpart (VNx16QImode, dest), ptrue, src));
  return true;
}
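/* Illustrative examples (not part of the original source): a VNx8HI
   constant whose halfword pattern repeats every 64 bits can be handled
   above by viewing the destination as a vector of DImode elements and
   duplicating a single 64-bit value, while a pattern that only repeats
   every 128 bits takes the LD1RQ path, loading the 16 bytes from the
   constant pool and replicating them into every quadword.  */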
2780 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2781 isn't a simple duplicate or series. */
2784 aarch64_expand_sve_const_vector (rtx dest
, rtx src
)
2786 machine_mode mode
= GET_MODE (src
);
2787 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (src
);
2788 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (src
);
2789 gcc_assert (npatterns
> 1);
2791 if (nelts_per_pattern
== 1)
      /* The constant is a repeating sequence of at least two elements,
2794 where the repeating elements occupy no more than 128 bits.
2795 Get an integer representation of the replicated value. */
2796 unsigned int int_bits
= GET_MODE_UNIT_BITSIZE (mode
) * npatterns
;
2797 gcc_assert (int_bits
<= 128);
2799 scalar_int_mode int_mode
= int_mode_for_size (int_bits
, 0).require ();
2800 rtx int_value
= simplify_gen_subreg (int_mode
, src
, mode
, 0);
2802 && aarch64_expand_sve_widened_duplicate (dest
, int_mode
, int_value
))
2806 /* Expand each pattern individually. */
2807 rtx_vector_builder builder
;
2808 auto_vec
<rtx
, 16> vectors (npatterns
);
2809 for (unsigned int i
= 0; i
< npatterns
; ++i
)
2811 builder
.new_vector (mode
, 1, nelts_per_pattern
);
2812 for (unsigned int j
= 0; j
< nelts_per_pattern
; ++j
)
2813 builder
.quick_push (CONST_VECTOR_ELT (src
, i
+ j
* npatterns
));
2814 vectors
.quick_push (force_reg (mode
, builder
.build ()));
2817 /* Use permutes to interleave the separate vectors. */
2818 while (npatterns
> 1)
2821 for (unsigned int i
= 0; i
< npatterns
; ++i
)
2823 rtx tmp
= (npatterns
== 1 ? dest
: gen_reg_rtx (mode
));
2824 rtvec v
= gen_rtvec (2, vectors
[i
], vectors
[i
+ npatterns
]);
2825 emit_set_insn (tmp
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
2829 gcc_assert (vectors
[0] == dest
);
2832 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
2833 is a pattern that can be used to set DEST to a replicated scalar
2837 aarch64_expand_mov_immediate (rtx dest
, rtx imm
,
2838 rtx (*gen_vec_duplicate
) (rtx
, rtx
))
2840 machine_mode mode
= GET_MODE (dest
);
2842 /* Check on what type of symbol it is. */
2843 scalar_int_mode int_mode
;
2844 if ((GET_CODE (imm
) == SYMBOL_REF
2845 || GET_CODE (imm
) == LABEL_REF
2846 || GET_CODE (imm
) == CONST
2847 || GET_CODE (imm
) == CONST_POLY_INT
)
2848 && is_a
<scalar_int_mode
> (mode
, &int_mode
))
2852 HOST_WIDE_INT const_offset
;
2853 enum aarch64_symbol_type sty
;
2855 /* If we have (const (plus symbol offset)), separate out the offset
2856 before we start classifying the symbol. */
2857 rtx base
= strip_offset (imm
, &offset
);
2859 /* We must always add an offset involving VL separately, rather than
2860 folding it into the relocation. */
2861 if (!offset
.is_constant (&const_offset
))
2863 if (base
== const0_rtx
&& aarch64_sve_cnt_immediate_p (offset
))
2864 emit_insn (gen_rtx_SET (dest
, imm
));
2867 /* Do arithmetic on 32-bit values if the result is smaller
2869 if (partial_subreg_p (int_mode
, SImode
))
2871 /* It is invalid to do symbol calculations in modes
2872 narrower than SImode. */
2873 gcc_assert (base
== const0_rtx
);
2874 dest
= gen_lowpart (SImode
, dest
);
2877 if (base
!= const0_rtx
)
2879 base
= aarch64_force_temporary (int_mode
, dest
, base
);
2880 aarch64_add_offset (int_mode
, dest
, base
, offset
,
2881 NULL_RTX
, NULL_RTX
, false);
2884 aarch64_add_offset (int_mode
, dest
, base
, offset
,
2885 dest
, NULL_RTX
, false);
2890 sty
= aarch64_classify_symbol (base
, const_offset
);
2893 case SYMBOL_FORCE_TO_MEM
:
2894 if (const_offset
!= 0
2895 && targetm
.cannot_force_const_mem (int_mode
, imm
))
2897 gcc_assert (can_create_pseudo_p ());
2898 base
= aarch64_force_temporary (int_mode
, dest
, base
);
2899 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
2900 NULL_RTX
, NULL_RTX
, false);
2904 mem
= force_const_mem (ptr_mode
, imm
);
2907 /* If we aren't generating PC relative literals, then
2908 we need to expand the literal pool access carefully.
2909 This is something that needs to be done in a number
2910 of places, so could well live as a separate function. */
2911 if (!aarch64_pcrelative_literal_loads
)
2913 gcc_assert (can_create_pseudo_p ());
2914 base
= gen_reg_rtx (ptr_mode
);
2915 aarch64_expand_mov_immediate (base
, XEXP (mem
, 0));
2916 if (ptr_mode
!= Pmode
)
2917 base
= convert_memory_address (Pmode
, base
);
2918 mem
= gen_rtx_MEM (ptr_mode
, base
);
2921 if (int_mode
!= ptr_mode
)
2922 mem
= gen_rtx_ZERO_EXTEND (int_mode
, mem
);
2924 emit_insn (gen_rtx_SET (dest
, mem
));
2928 case SYMBOL_SMALL_TLSGD
:
2929 case SYMBOL_SMALL_TLSDESC
:
2930 case SYMBOL_SMALL_TLSIE
:
2931 case SYMBOL_SMALL_GOT_28K
:
2932 case SYMBOL_SMALL_GOT_4G
:
2933 case SYMBOL_TINY_GOT
:
2934 case SYMBOL_TINY_TLSIE
:
2935 if (const_offset
!= 0)
2937 gcc_assert(can_create_pseudo_p ());
2938 base
= aarch64_force_temporary (int_mode
, dest
, base
);
2939 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
2940 NULL_RTX
, NULL_RTX
, false);
2945 case SYMBOL_SMALL_ABSOLUTE
:
2946 case SYMBOL_TINY_ABSOLUTE
:
2947 case SYMBOL_TLSLE12
:
2948 case SYMBOL_TLSLE24
:
2949 case SYMBOL_TLSLE32
:
2950 case SYMBOL_TLSLE48
:
2951 aarch64_load_symref_appropriately (dest
, imm
, sty
);
2959 if (!CONST_INT_P (imm
))
2961 rtx base
, step
, value
;
2962 if (GET_CODE (imm
) == HIGH
2963 || aarch64_simd_valid_immediate (imm
, NULL
))
2964 emit_insn (gen_rtx_SET (dest
, imm
));
2965 else if (const_vec_series_p (imm
, &base
, &step
))
2966 aarch64_expand_vec_series (dest
, base
, step
);
2967 else if (const_vec_duplicate_p (imm
, &value
))
2969 /* If the constant is out of range of an SVE vector move,
2970 load it from memory if we can, otherwise move it into
2971 a register and use a DUP. */
2972 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
2973 rtx op
= force_const_mem (inner_mode
, value
);
2975 op
= force_reg (inner_mode
, value
);
2976 else if (!aarch64_sve_ld1r_operand_p (op
))
2978 rtx addr
= force_reg (Pmode
, XEXP (op
, 0));
2979 op
= replace_equiv_address (op
, addr
);
2981 emit_insn (gen_vec_duplicate (dest
, op
));
2983 else if (GET_CODE (imm
) == CONST_VECTOR
2984 && !GET_MODE_NUNITS (GET_MODE (imm
)).is_constant ())
2985 aarch64_expand_sve_const_vector (dest
, imm
);
2988 rtx mem
= force_const_mem (mode
, imm
);
2990 emit_move_insn (dest
, mem
);
2996 aarch64_internal_mov_immediate (dest
, imm
, true,
2997 as_a
<scalar_int_mode
> (mode
));
/* Emit an SVE predicated move from SRC to DEST.  PRED is a predicate
   that is known to contain PTRUE.  */

void
aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
{
  emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
						gen_rtvec (2, pred, src),
						UNSPEC_MERGE_PTRUE)));
}
/* Expand a pre-RA SVE data move from SRC to DEST in which at least one
   operand is in memory.  In this case we need to use the predicated LD1
   and ST1 instead of LDR and STR, both for correctness on big-endian
   targets and because LD1 and ST1 support a wider range of addressing modes.
   PRED_MODE is the mode of the predicate.

   See the comment at the head of aarch64-sve.md for details about the
   big-endian handling.  */

void
aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
{
  machine_mode mode = GET_MODE (dest);
  rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
  if (!register_operand (src, mode)
      && !register_operand (dest, mode))
    {
      rtx tmp = gen_reg_rtx (mode);
      if (MEM_P (src))
	aarch64_emit_sve_pred_move (tmp, ptrue, src);
      else
	emit_move_insn (tmp, src);
      src = tmp;
    }
  aarch64_emit_sve_pred_move (dest, ptrue, src);
}
3039 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED
,
3040 tree exp ATTRIBUTE_UNUSED
)
3042 /* Currently, always true. */
3046 /* Implement TARGET_PASS_BY_REFERENCE. */
3049 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED
,
3052 bool named ATTRIBUTE_UNUSED
)
3055 machine_mode dummymode
;
3058 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3059 if (mode
== BLKmode
&& type
)
3060 size
= int_size_in_bytes (type
);
3062 /* No frontends can create types with variable-sized modes, so we
3063 shouldn't be asked to pass or return them. */
3064 size
= GET_MODE_SIZE (mode
).to_constant ();
3066 /* Aggregates are passed by reference based on their size. */
3067 if (type
&& AGGREGATE_TYPE_P (type
))
3069 size
= int_size_in_bytes (type
);
3072 /* Variable sized arguments are always returned by reference. */
3076 /* Can this be a candidate to be passed in fp/simd register(s)? */
3077 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
3082 /* Arguments which are variable sized or larger than 2 registers are
3083 passed by reference unless they are a homogenous floating point
3085 return size
> 2 * UNITS_PER_WORD
;
/* Return TRUE if VALTYPE is padded to its least significant bits.  */

static bool
aarch64_return_in_msb (const_tree valtype)
{
  machine_mode dummy_mode;
  int dummy_int;

  /* Never happens in little-endian mode.  */
  if (!BYTES_BIG_ENDIAN)
    return false;

  /* Only composite types smaller than or equal to 16 bytes can
     be potentially returned in registers.  */
  if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
      || int_size_in_bytes (valtype) <= 0
      || int_size_in_bytes (valtype) > 16)
    return false;

  /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
     or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
     is always passed/returned in the least significant bits of fp/simd
     register(s).  */
  if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
					       &dummy_mode, &dummy_int, NULL))
    return false;

  return true;
}
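/* Illustrative examples (not part of the original source): on a big-endian
   target a plain 12-byte structure is a composite of at most 16 bytes and
   not an HFA/HVA, so the function above returns true and the value is
   padded up to the most significant end of its registers; an HFA of two
   floats returns false because each element already sits in the least
   significant bits of its own FP/SIMD register.  */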
3117 /* Implement TARGET_FUNCTION_VALUE.
3118 Define how to find the value returned by a function. */
3121 aarch64_function_value (const_tree type
, const_tree func
,
3122 bool outgoing ATTRIBUTE_UNUSED
)
3127 machine_mode ag_mode
;
3129 mode
= TYPE_MODE (type
);
3130 if (INTEGRAL_TYPE_P (type
))
3131 mode
= promote_function_mode (type
, mode
, &unsignedp
, func
, 1);
3133 if (aarch64_return_in_msb (type
))
3135 HOST_WIDE_INT size
= int_size_in_bytes (type
);
3137 if (size
% UNITS_PER_WORD
!= 0)
3139 size
+= UNITS_PER_WORD
- size
% UNITS_PER_WORD
;
3140 mode
= int_mode_for_size (size
* BITS_PER_UNIT
, 0).require ();
3144 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
3145 &ag_mode
, &count
, NULL
))
3147 if (!aarch64_composite_type_p (type
, mode
))
3149 gcc_assert (count
== 1 && mode
== ag_mode
);
3150 return gen_rtx_REG (mode
, V0_REGNUM
);
3157 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (count
));
3158 for (i
= 0; i
< count
; i
++)
3160 rtx tmp
= gen_rtx_REG (ag_mode
, V0_REGNUM
+ i
);
3161 rtx offset
= gen_int_mode (i
* GET_MODE_SIZE (ag_mode
), Pmode
);
3162 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
3163 XVECEXP (par
, 0, i
) = tmp
;
3169 return gen_rtx_REG (mode
, R0_REGNUM
);
3172 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3173 Return true if REGNO is the number of a hard register in which the values
3174 of called function may come back. */
3177 aarch64_function_value_regno_p (const unsigned int regno
)
3179 /* Maximum of 16 bytes can be returned in the general registers. Examples
3180 of 16-byte return values are: 128-bit integers and 16-byte small
3181 structures (excluding homogeneous floating-point aggregates). */
3182 if (regno
== R0_REGNUM
|| regno
== R1_REGNUM
)
3185 /* Up to four fp/simd registers can return a function value, e.g. a
3186 homogeneous floating-point aggregate having four members. */
3187 if (regno
>= V0_REGNUM
&& regno
< V0_REGNUM
+ HA_MAX_NUM_FLDS
)
3188 return TARGET_FLOAT
;
3193 /* Implement TARGET_RETURN_IN_MEMORY.
3195 If the type T of the result of a function is such that
3197 would require that arg be passed as a value in a register (or set of
3198 registers) according to the parameter passing rules, then the result
3199 is returned in the same registers as would be used for such an
3203 aarch64_return_in_memory (const_tree type
, const_tree fndecl ATTRIBUTE_UNUSED
)
3206 machine_mode ag_mode
;
3209 if (!AGGREGATE_TYPE_P (type
)
3210 && TREE_CODE (type
) != COMPLEX_TYPE
3211 && TREE_CODE (type
) != VECTOR_TYPE
)
3212 /* Simple scalar types always returned in registers. */
3215 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
),
3222 /* Types larger than 2 registers returned in memory. */
3223 size
= int_size_in_bytes (type
);
3224 return (size
< 0 || size
> 2 * UNITS_PER_WORD
);
3228 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v
, machine_mode mode
,
3229 const_tree type
, int *nregs
)
3231 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
3232 return aarch64_vfp_is_call_or_return_candidate (mode
,
3234 &pcum
->aapcs_vfp_rmode
,
3239 /* Given MODE and TYPE of a function argument, return the alignment in
3240 bits. The idea is to suppress any stronger alignment requested by
3241 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3242 This is a helper function for local use only. */
3245 aarch64_function_arg_alignment (machine_mode mode
, const_tree type
)
3248 return GET_MODE_ALIGNMENT (mode
);
3250 if (integer_zerop (TYPE_SIZE (type
)))
3253 gcc_assert (TYPE_MODE (type
) == mode
);
3255 if (!AGGREGATE_TYPE_P (type
))
3256 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type
));
3258 if (TREE_CODE (type
) == ARRAY_TYPE
)
3259 return TYPE_ALIGN (TREE_TYPE (type
));
3261 unsigned int alignment
= 0;
3262 for (tree field
= TYPE_FIELDS (type
); field
; field
= DECL_CHAIN (field
))
3263 if (TREE_CODE (field
) == FIELD_DECL
)
3264 alignment
= std::max (alignment
, DECL_ALIGN (field
));
3269 /* Layout a function argument according to the AAPCS64 rules. The rule
3270 numbers refer to the rule numbers in the AAPCS64. */
3273 aarch64_layout_arg (cumulative_args_t pcum_v
, machine_mode mode
,
3275 bool named ATTRIBUTE_UNUSED
)
3277 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
3278 int ncrn
, nvrn
, nregs
;
3279 bool allocate_ncrn
, allocate_nvrn
;
3282 /* We need to do this once per argument. */
3283 if (pcum
->aapcs_arg_processed
)
3286 pcum
->aapcs_arg_processed
= true;
3288 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
3290 size
= int_size_in_bytes (type
);
3292 /* No frontends can create types with variable-sized modes, so we
3293 shouldn't be asked to pass or return them. */
3294 size
= GET_MODE_SIZE (mode
).to_constant ();
3295 size
= ROUND_UP (size
, UNITS_PER_WORD
);
3297 allocate_ncrn
= (type
) ? !(FLOAT_TYPE_P (type
)) : !FLOAT_MODE_P (mode
);
3298 allocate_nvrn
= aarch64_vfp_is_call_candidate (pcum_v
,
3303 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
3304 The following code thus handles passing by SIMD/FP registers first. */
3306 nvrn
= pcum
->aapcs_nvrn
;
  /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
     and homogeneous short-vector aggregates (HVA).  */
3313 aarch64_err_no_fpadvsimd (mode
, "argument");
3315 if (nvrn
+ nregs
<= NUM_FP_ARG_REGS
)
3317 pcum
->aapcs_nextnvrn
= nvrn
+ nregs
;
3318 if (!aarch64_composite_type_p (type
, mode
))
3320 gcc_assert (nregs
== 1);
3321 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
3327 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
3328 for (i
= 0; i
< nregs
; i
++)
3330 rtx tmp
= gen_rtx_REG (pcum
->aapcs_vfp_rmode
,
3331 V0_REGNUM
+ nvrn
+ i
);
3332 rtx offset
= gen_int_mode
3333 (i
* GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
), Pmode
);
3334 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
3335 XVECEXP (par
, 0, i
) = tmp
;
3337 pcum
->aapcs_reg
= par
;
3343 /* C.3 NSRN is set to 8. */
3344 pcum
->aapcs_nextnvrn
= NUM_FP_ARG_REGS
;
3349 ncrn
= pcum
->aapcs_ncrn
;
3350 nregs
= size
/ UNITS_PER_WORD
;
  /* C6 - C9, though the sign and zero extension semantics are
     handled elsewhere.  This is the case where the argument fits
     entirely in general registers.  */
3355 if (allocate_ncrn
&& (ncrn
+ nregs
<= NUM_ARG_REGS
))
3358 gcc_assert (nregs
== 0 || nregs
== 1 || nregs
== 2);
3360 /* C.8 if the argument has an alignment of 16 then the NGRN is
3361 rounded up to the next even number. */
3364 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3365 comparison is there because for > 16 * BITS_PER_UNIT
3366 alignment nregs should be > 2 and therefore it should be
3367 passed by reference rather than value. */
3368 && aarch64_function_arg_alignment (mode
, type
) == 16 * BITS_PER_UNIT
)
3371 gcc_assert (ncrn
+ nregs
<= NUM_ARG_REGS
);
3374 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3375 A reg is still generated for it, but the caller should be smart
3376 enough not to use it. */
3377 if (nregs
== 0 || nregs
== 1 || GET_MODE_CLASS (mode
) == MODE_INT
)
3378 pcum
->aapcs_reg
= gen_rtx_REG (mode
, R0_REGNUM
+ ncrn
);
3384 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
3385 for (i
= 0; i
< nregs
; i
++)
3387 rtx tmp
= gen_rtx_REG (word_mode
, R0_REGNUM
+ ncrn
+ i
);
3388 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
3389 GEN_INT (i
* UNITS_PER_WORD
));
3390 XVECEXP (par
, 0, i
) = tmp
;
3392 pcum
->aapcs_reg
= par
;
3395 pcum
->aapcs_nextncrn
= ncrn
+ nregs
;
3400 pcum
->aapcs_nextncrn
= NUM_ARG_REGS
;
3402 /* The argument is passed on stack; record the needed number of words for
3403 this argument and align the total size if necessary. */
3405 pcum
->aapcs_stack_words
= size
/ UNITS_PER_WORD
;
3407 if (aarch64_function_arg_alignment (mode
, type
) == 16 * BITS_PER_UNIT
)
3408 pcum
->aapcs_stack_size
= ROUND_UP (pcum
->aapcs_stack_size
,
3409 16 / UNITS_PER_WORD
);
3413 /* Implement TARGET_FUNCTION_ARG. */
3416 aarch64_function_arg (cumulative_args_t pcum_v
, machine_mode mode
,
3417 const_tree type
, bool named
)
3419 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
3420 gcc_assert (pcum
->pcs_variant
== ARM_PCS_AAPCS64
);
3422 if (mode
== VOIDmode
)
3425 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
3426 return pcum
->aapcs_reg
;
3430 aarch64_init_cumulative_args (CUMULATIVE_ARGS
*pcum
,
3431 const_tree fntype ATTRIBUTE_UNUSED
,
3432 rtx libname ATTRIBUTE_UNUSED
,
3433 const_tree fndecl ATTRIBUTE_UNUSED
,
3434 unsigned n_named ATTRIBUTE_UNUSED
)
3436 pcum
->aapcs_ncrn
= 0;
3437 pcum
->aapcs_nvrn
= 0;
3438 pcum
->aapcs_nextncrn
= 0;
3439 pcum
->aapcs_nextnvrn
= 0;
3440 pcum
->pcs_variant
= ARM_PCS_AAPCS64
;
3441 pcum
->aapcs_reg
= NULL_RTX
;
3442 pcum
->aapcs_arg_processed
= false;
3443 pcum
->aapcs_stack_words
= 0;
3444 pcum
->aapcs_stack_size
= 0;
3447 && fndecl
&& TREE_PUBLIC (fndecl
)
3448 && fntype
&& fntype
!= error_mark_node
)
3450 const_tree type
= TREE_TYPE (fntype
);
3451 machine_mode mode ATTRIBUTE_UNUSED
; /* To pass pointer as argument. */
3452 int nregs ATTRIBUTE_UNUSED
; /* Likewise. */
3453 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
), type
,
3454 &mode
, &nregs
, NULL
))
3455 aarch64_err_no_fpadvsimd (TYPE_MODE (type
), "return type");
3461 aarch64_function_arg_advance (cumulative_args_t pcum_v
,
3466 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
3467 if (pcum
->pcs_variant
== ARM_PCS_AAPCS64
)
3469 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
3470 gcc_assert ((pcum
->aapcs_reg
!= NULL_RTX
)
3471 != (pcum
->aapcs_stack_words
!= 0));
3472 pcum
->aapcs_arg_processed
= false;
3473 pcum
->aapcs_ncrn
= pcum
->aapcs_nextncrn
;
3474 pcum
->aapcs_nvrn
= pcum
->aapcs_nextnvrn
;
3475 pcum
->aapcs_stack_size
+= pcum
->aapcs_stack_words
;
3476 pcum
->aapcs_stack_words
= 0;
3477 pcum
->aapcs_reg
= NULL_RTX
;
3482 aarch64_function_arg_regno_p (unsigned regno
)
3484 return ((GP_REGNUM_P (regno
) && regno
< R0_REGNUM
+ NUM_ARG_REGS
)
3485 || (FP_REGNUM_P (regno
) && regno
< V0_REGNUM
+ NUM_FP_ARG_REGS
));
/* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
   PARM_BOUNDARY bits of alignment, but will be given anything up
   to STACK_BOUNDARY bits if the type requires it.  This makes sure
   that both before and after the layout of each argument, the Next
   Stacked Argument Address (NSAA) will have a minimum alignment of
   8 bytes.  */

static unsigned int
aarch64_function_arg_boundary (machine_mode mode, const_tree type)
{
  unsigned int alignment = aarch64_function_arg_alignment (mode, type);
  return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
}
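/* Illustrative examples (not part of the original source): with the usual
   64-bit PARM_BOUNDARY and 128-bit STACK_BOUNDARY, a char argument is
   still given a 64-bit boundary, a 16-byte-aligned structure gets the full
   128 bits, and an aggregate whose natural alignment exceeds 16 bytes is
   still clamped to 128 bits.  */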
/* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE.  */

static fixed_size_mode
aarch64_get_reg_raw_mode (int regno)
{
  if (TARGET_SVE && FP_REGNUM_P (regno))
    /* Don't use the SVE part of the register for __builtin_apply and
       __builtin_return.  The SVE registers aren't used by the normal PCS,
       so using them there would be a waste of time.  The PCS extensions
       for SVE types are fundamentally incompatible with the
       __builtin_return/__builtin_apply interface.  */
    return as_a <fixed_size_mode> (V16QImode);
  return default_get_reg_raw_mode (regno);
}
3517 /* Implement TARGET_FUNCTION_ARG_PADDING.
3519 Small aggregate types are placed in the lowest memory address.
3521 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3523 static pad_direction
3524 aarch64_function_arg_padding (machine_mode mode
, const_tree type
)
3526 /* On little-endian targets, the least significant byte of every stack
3527 argument is passed at the lowest byte address of the stack slot. */
3528 if (!BYTES_BIG_ENDIAN
)
3531 /* Otherwise, integral, floating-point and pointer types are padded downward:
3532 the least significant byte of a stack argument is passed at the highest
3533 byte address of the stack slot. */
3535 ? (INTEGRAL_TYPE_P (type
) || SCALAR_FLOAT_TYPE_P (type
)
3536 || POINTER_TYPE_P (type
))
3537 : (SCALAR_INT_MODE_P (mode
) || SCALAR_FLOAT_MODE_P (mode
)))
3538 return PAD_DOWNWARD
;
3540 /* Everything else padded upward, i.e. data in first byte of stack slot. */
3544 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3546 It specifies padding for the last (may also be the only)
3547 element of a block move between registers and memory. If
3548 assuming the block is in the memory, padding upward means that
3549 the last element is padded after its highest significant byte,
   while in downward padding, the last element is padded at its
   least significant byte side.
3553 Small aggregates and small complex types are always padded
3556 We don't need to worry about homogeneous floating-point or
3557 short-vector aggregates; their move is not affected by the
3558 padding direction determined here. Regardless of endianness,
3559 each element of such an aggregate is put in the least
3560 significant bits of a fp/simd register.
3562 Return !BYTES_BIG_ENDIAN if the least significant byte of the
3563 register has useful data, and return the opposite if the most
3564 significant byte does. */
3567 aarch64_pad_reg_upward (machine_mode mode
, const_tree type
,
3568 bool first ATTRIBUTE_UNUSED
)
3571 /* Small composite types are always padded upward. */
3572 if (BYTES_BIG_ENDIAN
&& aarch64_composite_type_p (type
, mode
))
3576 size
= int_size_in_bytes (type
);
3578 /* No frontends can create types with variable-sized modes, so we
3579 shouldn't be asked to pass or return them. */
3580 size
= GET_MODE_SIZE (mode
).to_constant ();
3581 if (size
< 2 * UNITS_PER_WORD
)
3585 /* Otherwise, use the default padding. */
3586 return !BYTES_BIG_ENDIAN
;
3589 static scalar_int_mode
3590 aarch64_libgcc_cmp_return_mode (void)
3595 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3597 /* We use the 12-bit shifted immediate arithmetic instructions so values
3598 must be multiple of (1 << 12), i.e. 4096. */
3599 #define ARITH_FACTOR 4096
3601 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3602 #error Cannot use simple address calculation for stack probing
3605 /* The pair of scratch registers used for stack probing. */
3606 #define PROBE_STACK_FIRST_REG 9
3607 #define PROBE_STACK_SECOND_REG 10
3609 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3610 inclusive. These are offsets from the current stack pointer. */
3613 aarch64_emit_probe_stack_range (HOST_WIDE_INT first
, poly_int64 poly_size
)
3616 if (!poly_size
.is_constant (&size
))
3618 sorry ("stack probes for SVE frames");
3622 rtx reg1
= gen_rtx_REG (Pmode
, PROBE_STACK_FIRST_REG
);
3624 /* See the same assertion on PROBE_INTERVAL above. */
3625 gcc_assert ((first
% ARITH_FACTOR
) == 0);
3627 /* See if we have a constant small number of probes to generate. If so,
3628 that's the easy case. */
3629 if (size
<= PROBE_INTERVAL
)
3631 const HOST_WIDE_INT base
= ROUND_UP (size
, ARITH_FACTOR
);
3633 emit_set_insn (reg1
,
3634 plus_constant (Pmode
,
3635 stack_pointer_rtx
, -(first
+ base
)));
3636 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- size
));
3639 /* The run-time loop is made up of 8 insns in the generic case while the
3640 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
3641 else if (size
<= 4 * PROBE_INTERVAL
)
3643 HOST_WIDE_INT i
, rem
;
3645 emit_set_insn (reg1
,
3646 plus_constant (Pmode
,
3648 -(first
+ PROBE_INTERVAL
)));
3649 emit_stack_probe (reg1
);
3651 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3652 it exceeds SIZE. If only two probes are needed, this will not
3653 generate any code. Then probe at FIRST + SIZE. */
3654 for (i
= 2 * PROBE_INTERVAL
; i
< size
; i
+= PROBE_INTERVAL
)
3656 emit_set_insn (reg1
,
3657 plus_constant (Pmode
, reg1
, -PROBE_INTERVAL
));
3658 emit_stack_probe (reg1
);
3661 rem
= size
- (i
- PROBE_INTERVAL
);
3664 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
3666 emit_set_insn (reg1
, plus_constant (Pmode
, reg1
, -base
));
3667 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- rem
));
3670 emit_stack_probe (plus_constant (Pmode
, reg1
, -rem
));
3673 /* Otherwise, do the same as above, but in a loop. Note that we must be
3674 extra careful with variables wrapping around because we might be at
3675 the very top (or the very bottom) of the address space and we have
3676 to be able to handle this case properly; in particular, we use an
3677 equality test for the loop condition. */
3680 rtx reg2
= gen_rtx_REG (Pmode
, PROBE_STACK_SECOND_REG
);
3682 /* Step 1: round SIZE to the previous multiple of the interval. */
3684 HOST_WIDE_INT rounded_size
= size
& -PROBE_INTERVAL
;
3687 /* Step 2: compute initial and final value of the loop counter. */
3689 /* TEST_ADDR = SP + FIRST. */
3690 emit_set_insn (reg1
,
3691 plus_constant (Pmode
, stack_pointer_rtx
, -first
));
3693 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
3694 HOST_WIDE_INT adjustment
= - (first
+ rounded_size
);
3695 if (! aarch64_uimm12_shift (adjustment
))
3697 aarch64_internal_mov_immediate (reg2
, GEN_INT (adjustment
),
3699 emit_set_insn (reg2
, gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, reg2
));
3703 emit_set_insn (reg2
,
3704 plus_constant (Pmode
, stack_pointer_rtx
, adjustment
));
3711 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3714 while (TEST_ADDR != LAST_ADDR)
3716 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3717 until it is equal to ROUNDED_SIZE. */
3719 emit_insn (gen_probe_stack_range (reg1
, reg1
, reg2
));
3722 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3723 that SIZE is equal to ROUNDED_SIZE. */
3725 if (size
!= rounded_size
)
3727 HOST_WIDE_INT rem
= size
- rounded_size
;
3731 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
3733 emit_set_insn (reg2
, plus_constant (Pmode
, reg2
, -base
));
3734 emit_stack_probe (plus_constant (Pmode
, reg2
, base
- rem
));
3737 emit_stack_probe (plus_constant (Pmode
, reg2
, -rem
));
3741 /* Make sure nothing is scheduled before we are done. */
3742 emit_insn (gen_blockage ());
/* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
   absolute addresses.  */

const char *
aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
{
  static int labelno = 0;
  char loop_lab[32];
  rtx xops[2];

  ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);

  /* Loop.  */
  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);

  /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
  xops[0] = reg1;
  xops[1] = GEN_INT (PROBE_INTERVAL);
  output_asm_insn ("sub\t%0, %0, %1", xops);

  /* Probe at TEST_ADDR.  */
  output_asm_insn ("str\txzr, [%0]", xops);

  /* Test if TEST_ADDR == LAST_ADDR.  */
  xops[1] = reg2;
  output_asm_insn ("cmp\t%0, %1", xops);

  /* Branch.  */
  fputs ("\tb.ne\t", asm_out_file);
  assemble_name_raw (asm_out_file, loop_lab);
  fputc ('\n', asm_out_file);

  return "";
}
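/* Illustrative note (not part of the original source): with the default
   PROBE_INTERVAL of 4096 and the scratch registers x9 and x10 chosen
   above, the emitted loop looks roughly like

	.LPSRL0:
	sub	x9, x9, #4096
	str	xzr, [x9]
	cmp	x9, x10
	b.ne	.LPSRL0

   i.e. one probe per page until TEST_ADDR reaches LAST_ADDR.  */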
3780 /* Mark the registers that need to be saved by the callee and calculate
3781 the size of the callee-saved registers area and frame record (both FP
3782 and LR may be omitted). */
3784 aarch64_layout_frame (void)
3786 HOST_WIDE_INT offset
= 0;
3787 int regno
, last_fp_reg
= INVALID_REGNUM
;
3789 if (reload_completed
&& cfun
->machine
->frame
.laid_out
)
3792 /* Force a frame chain for EH returns so the return address is at FP+8. */
3793 cfun
->machine
->frame
.emit_frame_chain
3794 = frame_pointer_needed
|| crtl
->calls_eh_return
;
3796 /* Emit a frame chain if the frame pointer is enabled.
3797 If -momit-leaf-frame-pointer is used, do not use a frame chain
3798 in leaf functions which do not use LR. */
3799 if (flag_omit_frame_pointer
== 2
3800 && !(flag_omit_leaf_frame_pointer
&& crtl
->is_leaf
3801 && !df_regs_ever_live_p (LR_REGNUM
)))
3802 cfun
->machine
->frame
.emit_frame_chain
= true;
3804 #define SLOT_NOT_REQUIRED (-2)
3805 #define SLOT_REQUIRED (-1)
3807 cfun
->machine
->frame
.wb_candidate1
= INVALID_REGNUM
;
3808 cfun
->machine
->frame
.wb_candidate2
= INVALID_REGNUM
;
3810 /* First mark all the registers that really need to be saved... */
3811 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
3812 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
3814 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
3815 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
3817 /* ... that includes the eh data registers (if needed)... */
3818 if (crtl
->calls_eh_return
)
3819 for (regno
= 0; EH_RETURN_DATA_REGNO (regno
) != INVALID_REGNUM
; regno
++)
3820 cfun
->machine
->frame
.reg_offset
[EH_RETURN_DATA_REGNO (regno
)]
3823 /* ... and any callee saved register that dataflow says is live. */
3824 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
3825 if (df_regs_ever_live_p (regno
)
3826 && (regno
== R30_REGNUM
3827 || !call_used_regs
[regno
]))
3828 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
3830 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
3831 if (df_regs_ever_live_p (regno
)
3832 && !call_used_regs
[regno
])
3834 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
3835 last_fp_reg
= regno
;
3838 if (cfun
->machine
->frame
.emit_frame_chain
)
3840 /* FP and LR are placed in the linkage record. */
3841 cfun
->machine
->frame
.reg_offset
[R29_REGNUM
] = 0;
3842 cfun
->machine
->frame
.wb_candidate1
= R29_REGNUM
;
3843 cfun
->machine
->frame
.reg_offset
[R30_REGNUM
] = UNITS_PER_WORD
;
3844 cfun
->machine
->frame
.wb_candidate2
= R30_REGNUM
;
3845 offset
= 2 * UNITS_PER_WORD
;
3848 /* Now assign stack slots for them. */
3849 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
3850 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
3852 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
3853 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
3854 cfun
->machine
->frame
.wb_candidate1
= regno
;
3855 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
)
3856 cfun
->machine
->frame
.wb_candidate2
= regno
;
3857 offset
+= UNITS_PER_WORD
;
3860 HOST_WIDE_INT max_int_offset
= offset
;
3861 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
3862 bool has_align_gap
= offset
!= max_int_offset
;
3864 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
3865 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
3867 /* If there is an alignment gap between integer and fp callee-saves,
3868 allocate the last fp register to it if possible. */
3869 if (regno
== last_fp_reg
&& has_align_gap
&& (offset
& 8) == 0)
3871 cfun
->machine
->frame
.reg_offset
[regno
] = max_int_offset
;
3875 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
3876 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
3877 cfun
->machine
->frame
.wb_candidate1
= regno
;
3878 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
3879 && cfun
->machine
->frame
.wb_candidate1
>= V0_REGNUM
)
3880 cfun
->machine
->frame
.wb_candidate2
= regno
;
3881 offset
+= UNITS_PER_WORD
;
3884 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
3886 cfun
->machine
->frame
.saved_regs_size
= offset
;
3888 HOST_WIDE_INT varargs_and_saved_regs_size
3889 = offset
+ cfun
->machine
->frame
.saved_varargs_size
;
3891 cfun
->machine
->frame
.hard_fp_offset
3892 = aligned_upper_bound (varargs_and_saved_regs_size
3893 + get_frame_size (),
3894 STACK_BOUNDARY
/ BITS_PER_UNIT
);
3896 /* Both these values are already aligned. */
3897 gcc_assert (multiple_p (crtl
->outgoing_args_size
,
3898 STACK_BOUNDARY
/ BITS_PER_UNIT
));
3899 cfun
->machine
->frame
.frame_size
3900 = (cfun
->machine
->frame
.hard_fp_offset
3901 + crtl
->outgoing_args_size
);
3903 cfun
->machine
->frame
.locals_offset
= cfun
->machine
->frame
.saved_varargs_size
;
3905 cfun
->machine
->frame
.initial_adjust
= 0;
3906 cfun
->machine
->frame
.final_adjust
= 0;
3907 cfun
->machine
->frame
.callee_adjust
= 0;
3908 cfun
->machine
->frame
.callee_offset
= 0;
3910 HOST_WIDE_INT max_push_offset
= 0;
3911 if (cfun
->machine
->frame
.wb_candidate2
!= INVALID_REGNUM
)
3912 max_push_offset
= 512;
3913 else if (cfun
->machine
->frame
.wb_candidate1
!= INVALID_REGNUM
)
3914 max_push_offset
= 256;
3916 HOST_WIDE_INT const_size
, const_fp_offset
;
3917 if (cfun
->machine
->frame
.frame_size
.is_constant (&const_size
)
3918 && const_size
< max_push_offset
3919 && known_eq (crtl
->outgoing_args_size
, 0))
3921 /* Simple, small frame with no outgoing arguments:
3922 stp reg1, reg2, [sp, -frame_size]!
3923 stp reg3, reg4, [sp, 16] */
3924 cfun
->machine
->frame
.callee_adjust
= const_size
;
3926 else if (known_lt (crtl
->outgoing_args_size
3927 + cfun
->machine
->frame
.saved_regs_size
, 512)
3928 && !(cfun
->calls_alloca
3929 && known_lt (cfun
->machine
->frame
.hard_fp_offset
,
3932 /* Frame with small outgoing arguments:
3933 sub sp, sp, frame_size
3934 stp reg1, reg2, [sp, outgoing_args_size]
3935 stp reg3, reg4, [sp, outgoing_args_size + 16] */
3936 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.frame_size
;
3937 cfun
->machine
->frame
.callee_offset
3938 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.hard_fp_offset
;
3940 else if (cfun
->machine
->frame
.hard_fp_offset
.is_constant (&const_fp_offset
)
3941 && const_fp_offset
< max_push_offset
)
3943 /* Frame with large outgoing arguments but a small local area:
3944 stp reg1, reg2, [sp, -hard_fp_offset]!
3945 stp reg3, reg4, [sp, 16]
3946 sub sp, sp, outgoing_args_size */
3947 cfun
->machine
->frame
.callee_adjust
= const_fp_offset
;
3948 cfun
->machine
->frame
.final_adjust
3949 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.callee_adjust
;
3953 /* Frame with large local area and outgoing arguments using frame pointer:
3954 sub sp, sp, hard_fp_offset
3955 stp x29, x30, [sp, 0]
3957 stp reg3, reg4, [sp, 16]
3958 sub sp, sp, outgoing_args_size */
3959 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.hard_fp_offset
;
3960 cfun
->machine
->frame
.final_adjust
3961 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.initial_adjust
;
3964 cfun
->machine
->frame
.laid_out
= true;
/* Return true if the register REGNO is saved on entry to
   the current function.  */

static bool
aarch64_register_saved_on_entry (int regno)
{
  return cfun->machine->frame.reg_offset[regno] >= 0;
}

/* Return the next register up from REGNO up to LIMIT for the callee
   to save.  */

static unsigned
aarch64_next_callee_save (unsigned regno, unsigned limit)
{
  while (regno <= limit && !aarch64_register_saved_on_entry (regno))
    regno++;
  return regno;
}
3987 /* Push the register number REGNO of mode MODE to the stack with write-back
3988 adjusting the stack by ADJUSTMENT. */
3991 aarch64_pushwb_single_reg (machine_mode mode
, unsigned regno
,
3992 HOST_WIDE_INT adjustment
)
3994 rtx base_rtx
= stack_pointer_rtx
;
3997 reg
= gen_rtx_REG (mode
, regno
);
3998 mem
= gen_rtx_PRE_MODIFY (Pmode
, base_rtx
,
3999 plus_constant (Pmode
, base_rtx
, -adjustment
));
4000 mem
= gen_frame_mem (mode
, mem
);
4002 insn
= emit_move_insn (mem
, reg
);
4003 RTX_FRAME_RELATED_P (insn
) = 1;
4006 /* Generate and return an instruction to store the pair of registers
4007 REG and REG2 of mode MODE to location BASE with write-back adjusting
4008 the stack location BASE by ADJUSTMENT. */
4011 aarch64_gen_storewb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
4012 HOST_WIDE_INT adjustment
)
4017 return gen_storewb_pairdi_di (base
, base
, reg
, reg2
,
4018 GEN_INT (-adjustment
),
4019 GEN_INT (UNITS_PER_WORD
- adjustment
));
4021 return gen_storewb_pairdf_di (base
, base
, reg
, reg2
,
4022 GEN_INT (-adjustment
),
4023 GEN_INT (UNITS_PER_WORD
- adjustment
));
4029 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4030 stack pointer by ADJUSTMENT. */
4033 aarch64_push_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
)
4036 machine_mode mode
= (regno1
<= R30_REGNUM
) ? E_DImode
: E_DFmode
;
4038 if (regno2
== INVALID_REGNUM
)
4039 return aarch64_pushwb_single_reg (mode
, regno1
, adjustment
);
4041 rtx reg1
= gen_rtx_REG (mode
, regno1
);
4042 rtx reg2
= gen_rtx_REG (mode
, regno2
);
4044 insn
= emit_insn (aarch64_gen_storewb_pair (mode
, stack_pointer_rtx
, reg1
,
4046 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 2)) = 1;
4047 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
4048 RTX_FRAME_RELATED_P (insn
) = 1;
4051 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
4052 adjusting it by ADJUSTMENT afterwards. */
4055 aarch64_gen_loadwb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
4056 HOST_WIDE_INT adjustment
)
4061 return gen_loadwb_pairdi_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
4062 GEN_INT (UNITS_PER_WORD
));
4064 return gen_loadwb_pairdf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
4065 GEN_INT (UNITS_PER_WORD
));
4071 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4072 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4076 aarch64_pop_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
,
4079 machine_mode mode
= (regno1
<= R30_REGNUM
) ? E_DImode
: E_DFmode
;
4080 rtx reg1
= gen_rtx_REG (mode
, regno1
);
4082 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg1
, *cfi_ops
);
4084 if (regno2
== INVALID_REGNUM
)
4086 rtx mem
= plus_constant (Pmode
, stack_pointer_rtx
, adjustment
);
4087 mem
= gen_rtx_POST_MODIFY (Pmode
, stack_pointer_rtx
, mem
);
4088 emit_move_insn (reg1
, gen_frame_mem (mode
, mem
));
4092 rtx reg2
= gen_rtx_REG (mode
, regno2
);
4093 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
4094 emit_insn (aarch64_gen_loadwb_pair (mode
, stack_pointer_rtx
, reg1
,
4099 /* Generate and return a store pair instruction of mode MODE to store
4100 register REG1 to MEM1 and register REG2 to MEM2. */
4103 aarch64_gen_store_pair (machine_mode mode
, rtx mem1
, rtx reg1
, rtx mem2
,
4109 return gen_store_pairdi (mem1
, reg1
, mem2
, reg2
);
4112 return gen_store_pairdf (mem1
, reg1
, mem2
, reg2
);
/* Generate and return a load pair instruction of mode MODE to load register
   REG1 from MEM1 and register REG2 from MEM2.  */
4123 aarch64_gen_load_pair (machine_mode mode
, rtx reg1
, rtx mem1
, rtx reg2
,
4129 return gen_load_pairdi (reg1
, mem1
, reg2
, mem2
);
4132 return gen_load_pairdf (reg1
, mem1
, reg2
, mem2
);
/* Return TRUE if return address signing should be enabled for the current
   function, otherwise return FALSE.  */

bool
aarch64_return_address_signing_enabled (void)
{
  /* This function should only be called after frame laid out.  */
  gcc_assert (cfun->machine->frame.laid_out);

  /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf
     function if its LR is pushed onto the stack.  */
  return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
	  || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
	      && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
}
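/* Illustrative note (not part of the original source): with
   -msign-return-address=non-leaf, a leaf function that never saves LR has
   a negative reg_offset for LR_REGNUM and is left unsigned, whereas a
   function that pushes LR onto the stack gets a non-negative offset and
   is signed, since its saved return address could be overwritten in
   memory.  */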
4155 /* Emit code to save the callee-saved registers from register number START
4156 to LIMIT to the stack at the location starting at offset START_OFFSET,
4157 skipping any write-back candidates if SKIP_WB is true. */
4160 aarch64_save_callee_saves (machine_mode mode
, poly_int64 start_offset
,
4161 unsigned start
, unsigned limit
, bool skip_wb
)
4167 for (regno
= aarch64_next_callee_save (start
, limit
);
4169 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
4175 && (regno
== cfun
->machine
->frame
.wb_candidate1
4176 || regno
== cfun
->machine
->frame
.wb_candidate2
))
4179 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
4182 reg
= gen_rtx_REG (mode
, regno
);
4183 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
4184 mem
= gen_frame_mem (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
4187 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
4190 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
4191 && ((cfun
->machine
->frame
.reg_offset
[regno
] + UNITS_PER_WORD
)
4192 == cfun
->machine
->frame
.reg_offset
[regno2
]))
4195 rtx reg2
= gen_rtx_REG (mode
, regno2
);
4198 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
4199 mem2
= gen_frame_mem (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
4201 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
,
4204 /* The first part of a frame-related parallel insn is
4205 always assumed to be relevant to the frame
4206 calculations; subsequent parts are only
4207 frame-related if explicitly marked. */
4208 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
4212 insn
= emit_move_insn (mem
, reg
);
4214 RTX_FRAME_RELATED_P (insn
) = 1;
4218 /* Emit code to restore the callee registers of mode MODE from register
4219 number START up to and including LIMIT. Restore from the stack offset
4220 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4221 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4224 aarch64_restore_callee_saves (machine_mode mode
,
4225 poly_int64 start_offset
, unsigned start
,
4226 unsigned limit
, bool skip_wb
, rtx
*cfi_ops
)
4228 rtx base_rtx
= stack_pointer_rtx
;
4233 for (regno
= aarch64_next_callee_save (start
, limit
);
4235 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
4237 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
4243 && (regno
== cfun
->machine
->frame
.wb_candidate1
4244 || regno
== cfun
->machine
->frame
.wb_candidate2
))
4247 reg
= gen_rtx_REG (mode
, regno
);
4248 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
4249 mem
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
4251 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
4254 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
4255 && ((cfun
->machine
->frame
.reg_offset
[regno
] + UNITS_PER_WORD
)
4256 == cfun
->machine
->frame
.reg_offset
[regno2
]))
4258 rtx reg2
= gen_rtx_REG (mode
, regno2
);
4261 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
4262 mem2
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
4263 emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
4265 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
4269 emit_move_insn (reg
, mem
);
4270 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg
, *cfi_ops
);
4274 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4278 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4280 HOST_WIDE_INT multiple;
4281 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4282 && IN_RANGE (multiple, -8, 7));
4285 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
4289 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4291 HOST_WIDE_INT multiple;
4292 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4293 && IN_RANGE (multiple, 0, 63));
4296 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4300 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4302 HOST_WIDE_INT multiple;
4303 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4304 && IN_RANGE (multiple, -64, 63));
4307 /* Return true if OFFSET is a signed 9-bit value. */
4310 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4313 HOST_WIDE_INT const_offset;
4314 return (offset.is_constant (&const_offset)
4315 && IN_RANGE (const_offset, -256, 255));
4318 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4322 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4324 HOST_WIDE_INT multiple;
4325 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4326 && IN_RANGE (multiple, -256, 255));
4329 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4333 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4335 HOST_WIDE_INT multiple;
4336 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4337 && IN_RANGE (multiple, 0, 4095));
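/* Editorial sketch (not part of GCC): the immediate-offset ranges tested by
   the predicates above, written out for a plain byte OFFSET and access SIZE
   in bytes (SIZE is assumed positive).  The scaled forms require OFFSET to
   be a multiple of SIZE; the 9-bit unscaled form does not.  The helper
   names are illustrative stand-ins for the poly_int-aware GCC routines.  */

#include <stdbool.h>
#include <stdint.h>

static bool
in_range_sketch (int64_t x, int64_t lo, int64_t hi)
{
  return x >= lo && x <= hi;
}

/* LDP/STP-style addressing: signed 7-bit scaled offset.  */
static bool
offset_7bit_signed_scaled_sketch (int64_t offset, int64_t size)
{
  return offset % size == 0 && in_range_sketch (offset / size, -64, 63);
}

/* LDUR/STUR-style addressing: signed 9-bit unscaled offset.  */
static bool
offset_9bit_signed_unscaled_sketch (int64_t offset)
{
  return in_range_sketch (offset, -256, 255);
}

/* LDR/STR-style addressing: unsigned 12-bit scaled offset.  */
static bool
offset_12bit_unsigned_scaled_sketch (int64_t offset, int64_t size)
{
  return offset % size == 0 && in_range_sketch (offset / size, 0, 4095);
}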
4340 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4343 aarch64_get_separate_components (void)
4345 aarch64_layout_frame ();
4347 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
4348 bitmap_clear (components
);
4350 /* The registers we need saved to the frame. */
4351 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
4352 if (aarch64_register_saved_on_entry (regno
))
4354 poly_int64 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
4355 if (!frame_pointer_needed
)
4356 offset
+= cfun
->machine
->frame
.frame_size
4357 - cfun
->machine
->frame
.hard_fp_offset
;
4358 /* Check that we can access the stack slot of the register with one
4359 direct load with no adjustments needed. */
4360 if (offset_12bit_unsigned_scaled_p (DImode
, offset
))
4361 bitmap_set_bit (components
, regno
);
4364 /* Don't mess with the hard frame pointer. */
4365 if (frame_pointer_needed
)
4366 bitmap_clear_bit (components
, HARD_FRAME_POINTER_REGNUM
);
4368 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
4369 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
4370 /* If aarch64_layout_frame has chosen registers to store/restore with
4371 writeback don't interfere with them to avoid having to output explicit
4372 stack adjustment instructions. */
4373 if (reg2
!= INVALID_REGNUM
)
4374 bitmap_clear_bit (components
, reg2
);
4375 if (reg1
!= INVALID_REGNUM
)
4376 bitmap_clear_bit (components
, reg1
);
4378 bitmap_clear_bit (components
, LR_REGNUM
);
4379 bitmap_clear_bit (components
, SP_REGNUM
);
4384 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4387 aarch64_components_for_bb (basic_block bb
)
4389 bitmap in
= DF_LIVE_IN (bb
);
4390 bitmap gen
= &DF_LIVE_BB_INFO (bb
)->gen
;
4391 bitmap kill
= &DF_LIVE_BB_INFO (bb
)->kill
;
4393 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
4394 bitmap_clear (components
);
4396 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
4397 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
4398 if ((!call_used_regs
[regno
])
4399 && (bitmap_bit_p (in
, regno
)
4400 || bitmap_bit_p (gen
, regno
)
4401 || bitmap_bit_p (kill
, regno
)))
4402 bitmap_set_bit (components
, regno
);
4407 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4408 Nothing to do for aarch64. */
4411 aarch64_disqualify_components (sbitmap
, edge
, sbitmap
, bool)
4415 /* Return the next set bit in BMP from START onwards. Return the total number
4416 of bits in BMP if no set bit is found at or after START. */
4419 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
4421 unsigned int nbits = SBITMAP_SIZE (bmp);
4425 gcc_assert (start < nbits);
4426 for (unsigned int i = start; i < nbits; i++)
4427 if (bitmap_bit_p (bmp, i))
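/* Editorial sketch (not part of GCC): the same "next set bit" walk over a
   plain array of 64-bit words rather than an sbitmap, returning NBITS when
   nothing is set at or after START.  Purely illustrative.  */

#include <stdint.h>

static unsigned
next_set_bit_sketch (const uint64_t *words, unsigned nbits, unsigned start)
{
  for (unsigned i = start; i < nbits; i++)
    if (words[i / 64] & (UINT64_C (1) << (i % 64)))
      return i;
  return nbits;
}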
4433 /* Do the work for aarch64_emit_prologue_components and
4434 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
4435 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4436 for these components or the epilogue sequence. That is, it determines
4437 whether we should emit stores or loads and what kind of CFA notes to attach
4438 to the insns. Otherwise the logic for the two sequences is very
4442 aarch64_process_components (sbitmap components
, bool prologue_p
)
4444 rtx ptr_reg
= gen_rtx_REG (Pmode
, frame_pointer_needed
4445 ? HARD_FRAME_POINTER_REGNUM
4446 : STACK_POINTER_REGNUM
);
4448 unsigned last_regno
= SBITMAP_SIZE (components
);
4449 unsigned regno
= aarch64_get_next_set_bit (components
, R0_REGNUM
);
4450 rtx_insn
*insn
= NULL
;
4452 while (regno
!= last_regno
)
4454 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
4455 so DFmode for the vector registers is enough. */
4456 machine_mode mode
= GP_REGNUM_P (regno
) ? E_DImode
: E_DFmode
;
4457 rtx reg
= gen_rtx_REG (mode
, regno
);
4458 poly_int64 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
4459 if (!frame_pointer_needed
)
4460 offset
+= cfun
->machine
->frame
.frame_size
4461 - cfun
->machine
->frame
.hard_fp_offset
;
4462 rtx addr
= plus_constant (Pmode
, ptr_reg
, offset
);
4463 rtx mem
= gen_frame_mem (mode
, addr
);
4465 rtx set
= prologue_p
? gen_rtx_SET (mem
, reg
) : gen_rtx_SET (reg
, mem
);
4466 unsigned regno2
= aarch64_get_next_set_bit (components
, regno
+ 1);
4467 /* No more registers to handle after REGNO.
4468 Emit a single save/restore and exit. */
4469 if (regno2
== last_regno
)
4471 insn
= emit_insn (set
);
4472 RTX_FRAME_RELATED_P (insn
) = 1;
4474 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
4476 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
4480 poly_int64 offset2
= cfun
->machine
->frame
.reg_offset
[regno2
];
4481 /* The next register is not of the same class or its offset is not
4482 mergeable with the current one into a pair. */
4483 if (!satisfies_constraint_Ump (mem
)
4484 || GP_REGNUM_P (regno
) != GP_REGNUM_P (regno2
)
4485 || maybe_ne ((offset2
- cfun
->machine
->frame
.reg_offset
[regno
]),
4486 GET_MODE_SIZE (mode
)))
4488 insn
= emit_insn (set
);
4489 RTX_FRAME_RELATED_P (insn
) = 1;
4491 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
4493 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
4499 /* REGNO2 can be saved/restored in a pair with REGNO. */
4500 rtx reg2
= gen_rtx_REG (mode
, regno2
);
4501 if (!frame_pointer_needed
)
4502 offset2
+= cfun
->machine
->frame
.frame_size
4503 - cfun
->machine
->frame
.hard_fp_offset
;
4504 rtx addr2
= plus_constant (Pmode
, ptr_reg
, offset2
);
4505 rtx mem2
= gen_frame_mem (mode
, addr2
);
4506 rtx set2
= prologue_p
? gen_rtx_SET (mem2
, reg2
)
4507 : gen_rtx_SET (reg2
, mem2
);
4510 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
, reg2
));
4512 insn
= emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
4514 RTX_FRAME_RELATED_P (insn
) = 1;
4517 add_reg_note (insn
, REG_CFA_OFFSET
, set
);
4518 add_reg_note (insn
, REG_CFA_OFFSET
, set2
);
4522 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
4523 add_reg_note (insn
, REG_CFA_RESTORE
, reg2
);
4526 regno
= aarch64_get_next_set_bit (components
, regno2
+ 1);
4530 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
4533 aarch64_emit_prologue_components (sbitmap components
)
4535 aarch64_process_components (components
, true);
4538 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
4541 aarch64_emit_epilogue_components (sbitmap components
)
4543 aarch64_process_components (components
, false);
4546 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
4549 aarch64_set_handled_components (sbitmap components
)
4551 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
4552 if (bitmap_bit_p (components
, regno
))
4553 cfun
->machine
->reg_is_wrapped_separately
[regno
] = true;
4556 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4557 is saved at BASE + OFFSET. */
4560 aarch64_add_cfa_expression (rtx_insn
*insn
, unsigned int reg
,
4561 rtx base
, poly_int64 offset
)
4563 rtx mem
= gen_frame_mem (DImode
, plus_constant (Pmode
, base
, offset
));
4564 add_reg_note (insn
, REG_CFA_EXPRESSION
,
4565 gen_rtx_SET (mem
, regno_reg_rtx
[reg
]));
4568 /* AArch64 stack frames generated by this compiler look like:
4570 +-------------------------------+
4572 | incoming stack arguments |
4574 +-------------------------------+
4575 | | <-- incoming stack pointer (aligned)
4576 | callee-allocated save area |
4577 | for register varargs |
4579 +-------------------------------+
4580 | local variables | <-- frame_pointer_rtx
4582 +-------------------------------+
4584 +-------------------------------+ |
4585 | callee-saved registers | | frame.saved_regs_size
4586 +-------------------------------+ |
4588 +-------------------------------+ |
4589 | FP' | / <- hard_frame_pointer_rtx (aligned)
4590 +-------------------------------+
4591 | dynamic allocation |
4592 +-------------------------------+
4594 +-------------------------------+
4595 | outgoing stack arguments | <-- arg_pointer
4597 +-------------------------------+
4598 | | <-- stack_pointer_rtx (aligned)
4600 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4601 but leave frame_pointer_rtx and hard_frame_pointer_rtx unchanged. */
4604 /* Generate the prologue instructions for entry into a function.
4605 Establish the stack frame by decreasing the stack pointer with a
4606 properly calculated size and, if necessary, create a frame record
4607 filled with the values of LR and previous frame pointer. The
4608 current FP is also set up if it is in use. */
4611 aarch64_expand_prologue (void)
4613 aarch64_layout_frame ();
4615 poly_int64 frame_size
= cfun
->machine
->frame
.frame_size
;
4616 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
4617 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
4618 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
4619 poly_int64 callee_offset
= cfun
->machine
->frame
.callee_offset
;
4620 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
4621 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
4622 bool emit_frame_chain
= cfun
->machine
->frame
.emit_frame_chain
;
4625 /* Sign return address for functions. */
4626 if (aarch64_return_address_signing_enabled ())
4628 insn
= emit_insn (gen_pacisp ());
4629 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
4630 RTX_FRAME_RELATED_P (insn
) = 1;
4633 if (flag_stack_usage_info
)
4634 current_function_static_stack_size
= constant_lower_bound (frame_size
);
4636 if (flag_stack_check
== STATIC_BUILTIN_STACK_CHECK
)
4638 if (crtl
->is_leaf
&& !cfun
->calls_alloca
)
4640 if (maybe_gt (frame_size
, PROBE_INTERVAL
)
4641 && maybe_gt (frame_size
, get_stack_check_protect ()))
4642 aarch64_emit_probe_stack_range (get_stack_check_protect (),
4644 - get_stack_check_protect ()));
4646 else if (maybe_gt (frame_size
, 0))
4647 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size
);
4650 rtx ip0_rtx
= gen_rtx_REG (Pmode
, IP0_REGNUM
);
4651 rtx ip1_rtx
= gen_rtx_REG (Pmode
, IP1_REGNUM
);
4653 aarch64_sub_sp (ip0_rtx
, ip1_rtx
, initial_adjust
, true);
4655 if (callee_adjust
!= 0)
4656 aarch64_push_regs (reg1
, reg2
, callee_adjust
);
4658 if (emit_frame_chain
)
4660 poly_int64 reg_offset
= callee_adjust
;
4661 if (callee_adjust
== 0)
4665 reg_offset
= callee_offset
;
4666 aarch64_save_callee_saves (DImode
, reg_offset
, reg1
, reg2
, false);
4668 aarch64_add_offset (Pmode
, hard_frame_pointer_rtx
,
4669 stack_pointer_rtx
, callee_offset
,
4670 ip1_rtx
, ip0_rtx
, frame_pointer_needed
);
4671 if (frame_pointer_needed
&& !frame_size
.is_constant ())
4673 /* Variable-sized frames need to describe the save slot
4674 address using DW_CFA_expression rather than DW_CFA_offset.
4675 This means that, without taking further action, the
4676 locations of the registers that we've already saved would
4677 remain based on the stack pointer even after we redefine
4678 the CFA based on the frame pointer. We therefore need new
4679 DW_CFA_expressions to re-express the save slots with addresses
4680 based on the frame pointer. */
4681 rtx_insn
*insn
= get_last_insn ();
4682 gcc_assert (RTX_FRAME_RELATED_P (insn
));
4684 /* Add an explicit CFA definition if this was previously indirect. */
4686 if (!find_reg_note (insn
, REG_CFA_ADJUST_CFA
, NULL_RTX
))
4688 rtx src
= plus_constant (Pmode
, stack_pointer_rtx
,
4690 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
4691 gen_rtx_SET (hard_frame_pointer_rtx
, src
));
4694 /* Change the save slot expressions for the registers that
4695 we've already saved. */
4696 reg_offset
-= callee_offset
;
4697 aarch64_add_cfa_expression (insn
, reg2
, hard_frame_pointer_rtx
,
4698 reg_offset
+ UNITS_PER_WORD
);
4699 aarch64_add_cfa_expression (insn
, reg1
, hard_frame_pointer_rtx
,
4702 emit_insn (gen_stack_tie (stack_pointer_rtx
, hard_frame_pointer_rtx
));
4705 aarch64_save_callee_saves (DImode
, callee_offset
, R0_REGNUM
, R30_REGNUM
,
4706 callee_adjust
!= 0 || emit_frame_chain
);
4707 aarch64_save_callee_saves (DFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
4708 callee_adjust
!= 0 || emit_frame_chain
);
4709 aarch64_sub_sp (ip1_rtx
, ip0_rtx
, final_adjust
, !frame_pointer_needed
);
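/* Editorial note (not part of GCC): the prologue above lowers SP in up to
   three steps -- INITIAL_ADJUST before the callee saves, CALLEE_ADJUST as
   the writeback of the first store-pair push, and FINAL_ADJUST for the
   outgoing argument area.  As laid out by aarch64_layout_frame, the three
   adjustments together should account for the whole frame; a minimal
   statement of that relationship, with the frame fields written as plain
   integers for illustration only:  */

#include <assert.h>
#include <stdint.h>

static void
frame_split_invariant_sketch (int64_t frame_size, int64_t initial_adjust,
                              int64_t callee_adjust, int64_t final_adjust)
{
  /* The three SP adjustments partition the total frame size.  */
  assert (initial_adjust + callee_adjust + final_adjust == frame_size);
}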
4712 /* Return TRUE if we can use a simple_return insn.
4714 This function checks whether the callee saved stack is empty, which
4715 means no restore actions are needed. The pro_and_epilogue will use
4716 this to check whether shrink-wrapping opt is feasible. */
4719 aarch64_use_return_insn_p (void)
4721 if (!reload_completed
)
4727 aarch64_layout_frame ();
4729 return known_eq (cfun
->machine
->frame
.frame_size
, 0);
4732 /* Generate the epilogue instructions for returning from a function.
4733 This is almost exactly the reverse of the prologue sequence, except
4734 that we need to insert barriers to avoid scheduling loads that read
4735 from a deallocated stack, and we optimize the unwind records by
4736 emitting them all together if possible. */
4738 aarch64_expand_epilogue (bool for_sibcall
)
4740 aarch64_layout_frame ();
4742 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
4743 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
4744 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
4745 poly_int64 callee_offset
= cfun
->machine
->frame
.callee_offset
;
4746 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
4747 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
4750 /* A stack clash protection prologue may not have left IP0_REGNUM or
4751 IP1_REGNUM in a usable state. The same is true for allocations
4752 with an SVE component, since we then need both temporary registers
4753 for each allocation. */
4754 bool can_inherit_p
= (initial_adjust
.is_constant ()
4755 && final_adjust
.is_constant ()
4756 && !flag_stack_clash_protection
);
4758 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
4760 = maybe_ne (get_frame_size ()
4761 + cfun
->machine
->frame
.saved_varargs_size
, 0);
4763 /* Emit a barrier to prevent loads from a deallocated stack. */
4764 if (maybe_gt (final_adjust
, crtl
->outgoing_args_size
)
4765 || cfun
->calls_alloca
4766 || crtl
->calls_eh_return
)
4768 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
4769 need_barrier_p
= false;
4772 /* Restore the stack pointer from the frame pointer if it may not
4773 be the same as the stack pointer. */
4774 rtx ip0_rtx
= gen_rtx_REG (Pmode
, IP0_REGNUM
);
4775 rtx ip1_rtx
= gen_rtx_REG (Pmode
, IP1_REGNUM
);
4776 if (frame_pointer_needed
4777 && (maybe_ne (final_adjust
, 0) || cfun
->calls_alloca
))
4778 /* If writeback is used when restoring callee-saves, the CFA
4779 is restored on the instruction doing the writeback. */
4780 aarch64_add_offset (Pmode
, stack_pointer_rtx
,
4781 hard_frame_pointer_rtx
, -callee_offset
,
4782 ip1_rtx
, ip0_rtx
, callee_adjust
== 0);
4784 aarch64_add_sp (ip1_rtx
, ip0_rtx
, final_adjust
,
4785 !can_inherit_p
|| df_regs_ever_live_p (IP1_REGNUM
));
4787 aarch64_restore_callee_saves (DImode
, callee_offset
, R0_REGNUM
, R30_REGNUM
,
4788 callee_adjust
!= 0, &cfi_ops
);
4789 aarch64_restore_callee_saves (DFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
4790 callee_adjust
!= 0, &cfi_ops
);
4793 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
4795 if (callee_adjust
!= 0)
4796 aarch64_pop_regs (reg1
, reg2
, callee_adjust
, &cfi_ops
);
4798 if (callee_adjust
!= 0 || maybe_gt (initial_adjust
, 65536))
4800 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
4801 insn
= get_last_insn ();
4802 rtx new_cfa
= plus_constant (Pmode
, stack_pointer_rtx
, initial_adjust
);
4803 REG_NOTES (insn
) = alloc_reg_note (REG_CFA_DEF_CFA
, new_cfa
, cfi_ops
);
4804 RTX_FRAME_RELATED_P (insn
) = 1;
4808 aarch64_add_sp (ip0_rtx
, ip1_rtx
, initial_adjust
,
4809 !can_inherit_p
|| df_regs_ever_live_p (IP0_REGNUM
));
4813 /* Emit delayed restores and reset the CFA to be SP. */
4814 insn
= get_last_insn ();
4815 cfi_ops
= alloc_reg_note (REG_CFA_DEF_CFA
, stack_pointer_rtx
, cfi_ops
);
4816 REG_NOTES (insn
) = cfi_ops
;
4817 RTX_FRAME_RELATED_P (insn
) = 1;
4820 /* We prefer to emit the combined return/authenticate instruction RETAA,
4821 however there are three cases in which we must instead emit an explicit
4822 authentication instruction.
4824 1) Sibcalls don't return in a normal way, so if we're about to call one
4825 we must authenticate.
4827 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
4828 generating code for !TARGET_ARMV8_3 we can't use it and must
4829 explicitly authenticate.
4831 3) On an eh_return path we make extra stack adjustments to update the
4832 canonical frame address to be the exception handler's CFA. We want
4833 to authenticate using the CFA of the function which calls eh_return. */
4835 if (aarch64_return_address_signing_enabled ()
4836 && (for_sibcall
|| !TARGET_ARMV8_3
|| crtl
->calls_eh_return
))
4838 insn
= emit_insn (gen_autisp ());
4839 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
4840 RTX_FRAME_RELATED_P (insn
) = 1;
4843 /* Stack adjustment for exception handler. */
4844 if (crtl
->calls_eh_return
)
4846 /* We need to unwind the stack by the offset computed by
4847 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
4848 to be SP; letting the CFA move during this adjustment
4849 is just as correct as retaining the CFA from the body
4850 of the function. Therefore, do nothing special. */
4851 emit_insn (gen_add2_insn (stack_pointer_rtx
, EH_RETURN_STACKADJ_RTX
));
4854 emit_use (gen_rtx_REG (DImode
, LR_REGNUM
));
4856 emit_jump_insn (ret_rtx
);
4859 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
4860 normally or return to a previous frame after unwinding.
4862 An EH return uses a single shared return sequence. The epilogue is
4863 exactly like a normal epilogue except that it has an extra input
4864 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
4865 that must be applied after the frame has been destroyed. An extra label
4866 is inserted before the epilogue which initializes this register to zero,
4867 and this is the entry point for a normal return.
4869 An actual EH return updates the return address, initializes the stack
4870 adjustment and jumps directly into the epilogue (bypassing the zeroing
4871 of the adjustment). Since the return address is typically saved on the
4872 stack when a function makes a call, the saved LR must be updated outside the epilogue.
4875 This poses problems as the store is generated well before the epilogue,
4876 so the offset of LR is not known yet. Also optimizations will remove the
4877 store as it appears dead, even after the epilogue is generated (as the
4878 base or offset for loading LR is different in many cases).
4880 To avoid these problems this implementation forces the frame pointer
4881 in eh_return functions so that the location of LR is fixed and known early.
4882 It also marks the store volatile, so no optimization is permitted to
4883 remove the store. */
4885 aarch64_eh_return_handler_rtx (void)
4887 rtx tmp
= gen_frame_mem (Pmode
,
4888 plus_constant (Pmode
, hard_frame_pointer_rtx
, UNITS_PER_WORD
));
4890 /* Mark the store volatile, so no optimization is permitted to remove it. */
4891 MEM_VOLATILE_P (tmp
) = true;
4895 /* Output code to add DELTA to the first argument, and then jump
4896 to FUNCTION. Used for C++ multiple inheritance. */
4898 aarch64_output_mi_thunk (FILE *file
, tree thunk ATTRIBUTE_UNUSED
,
4899 HOST_WIDE_INT delta
,
4900 HOST_WIDE_INT vcall_offset
,
4903 /* The this pointer is always in x0. Note that this differs from
4904 Arm where the this pointer may be bumped to r1 if r0 is required
4905 to return a pointer to an aggregate. On AArch64 a result value
4906 pointer will be in x8. */
4907 int this_regno
= R0_REGNUM
;
4908 rtx this_rtx
, temp0
, temp1
, addr
, funexp
;
4911 reload_completed
= 1;
4912 emit_note (NOTE_INSN_PROLOGUE_END
);
4914 this_rtx
= gen_rtx_REG (Pmode
, this_regno
);
4915 temp0
= gen_rtx_REG (Pmode
, IP0_REGNUM
);
4916 temp1
= gen_rtx_REG (Pmode
, IP1_REGNUM
);
4918 if (vcall_offset
== 0)
4919 aarch64_add_offset (Pmode
, this_rtx
, this_rtx
, delta
, temp1
, temp0
, false);
4922 gcc_assert ((vcall_offset
& (POINTER_BYTES
- 1)) == 0);
4927 if (delta
>= -256 && delta
< 256)
4928 addr
= gen_rtx_PRE_MODIFY (Pmode
, this_rtx
,
4929 plus_constant (Pmode
, this_rtx
, delta
));
4931 aarch64_add_offset (Pmode
, this_rtx
, this_rtx
, delta
,
4932 temp1
, temp0
, false);
4935 if (Pmode
== ptr_mode
)
4936 aarch64_emit_move (temp0
, gen_rtx_MEM (ptr_mode
, addr
));
4938 aarch64_emit_move (temp0
,
4939 gen_rtx_ZERO_EXTEND (Pmode
,
4940 gen_rtx_MEM (ptr_mode
, addr
)));
4942 if (vcall_offset
>= -256 && vcall_offset
< 4096 * POINTER_BYTES
)
4943 addr
= plus_constant (Pmode
, temp0
, vcall_offset
);
4946 aarch64_internal_mov_immediate (temp1
, GEN_INT (vcall_offset
), true,
4948 addr
= gen_rtx_PLUS (Pmode
, temp0
, temp1
);
4951 if (Pmode
== ptr_mode
)
4952 aarch64_emit_move (temp1
, gen_rtx_MEM (ptr_mode
,addr
));
4954 aarch64_emit_move (temp1
,
4955 gen_rtx_SIGN_EXTEND (Pmode
,
4956 gen_rtx_MEM (ptr_mode
, addr
)));
4958 emit_insn (gen_add2_insn (this_rtx
, temp1
));
4961 /* Generate a tail call to the target function. */
4962 if (!TREE_USED (function
))
4964 assemble_external (function
);
4965 TREE_USED (function
) = 1;
4967 funexp
= XEXP (DECL_RTL (function
), 0);
4968 funexp
= gen_rtx_MEM (FUNCTION_MODE
, funexp
);
4969 insn
= emit_call_insn (gen_sibcall (funexp
, const0_rtx
, NULL_RTX
));
4970 SIBLING_CALL_P (insn
) = 1;
4972 insn
= get_insns ();
4973 shorten_branches (insn
);
4974 final_start_function (insn
, file
, 1);
4975 final (insn
, file
, 1);
4976 final_end_function ();
4978 /* Stop pretending to be a post-reload pass. */
4979 reload_completed
= 0;
4983 aarch64_tls_referenced_p (rtx x
)
4985 if (!TARGET_HAVE_TLS
)
4987 subrtx_iterator::array_type array
;
4988 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
4990 const_rtx x
= *iter
;
4991 if (GET_CODE (x
) == SYMBOL_REF
&& SYMBOL_REF_TLS_MODEL (x
) != 0)
4993 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
4994 TLS offsets, not real symbol references. */
4995 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
4996 iter
.skip_subrtxes ();
5002 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5003 a left shift of 0 or 12 bits. */
5005 aarch64_uimm12_shift (HOST_WIDE_INT val)
5007 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5008 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
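/* Editorial sketch (not part of GCC): an ADD/SUB immediate on AArch64 is a
   12-bit unsigned value optionally shifted left by 12, which is exactly
   what the two masks above test.  The helper name is illustrative.  */

#include <stdbool.h>
#include <stdint.h>

static bool
uimm12_shift_sketch (int64_t val)
{
  return (val & INT64_C (0xfff)) == val            /* LSL #0 form.  */
         || (val & (INT64_C (0xfff) << 12)) == val;   /* LSL #12 form.  */
}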
5013 /* Return true if val is an immediate that can be loaded into a
5014 register by a MOVZ instruction. */
5016 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5018 if (GET_MODE_SIZE (mode) > 4)
5020 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5021 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5026 /* Ignore sign extension. */
5027 val &= (HOST_WIDE_INT) 0xffffffff;
5029 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5030 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
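/* Editorial sketch (not part of GCC): a MOVZ immediate is a single 16-bit
   chunk placed at bit position 0, 16, 32 or 48, so a 64-bit value is
   accepted when it equals exactly one such chunk (the routine above also
   discards the sign-extension bits for 32-bit values first).  */

#include <stdbool.h>
#include <stdint.h>

static bool
movz_imm_sketch (uint64_t val)
{
  for (unsigned shift = 0; shift < 64; shift += 16)
    if ((val & (UINT64_C (0xffff) << shift)) == val)
      return true;
  return false;
}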
5033 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
5034 64-bit (DImode) integer. */
5036 static unsigned HOST_WIDE_INT
5037 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5039 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5042 val &= (HOST_WIDE_INT_1U << size) - 1;
5049 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5051 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5053 0x0000000100000001ull,
5054 0x0001000100010001ull,
5055 0x0101010101010101ull,
5056 0x1111111111111111ull,
5057 0x5555555555555555ull,
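/* Editorial sketch (not part of GCC): multiplying an element that occupies
   the low ELT_BITS bits by the matching constant above replicates it across
   all 64 bits; aarch64_bitmask_imm below relies on this when it checks that
   a candidate run of ones repeats across the whole register.  ELT_BITS must
   be 2, 4, 8, 16 or 32, and __builtin_clz is the GCC/Clang builtin.  */

#include <stdint.h>

static uint64_t
replicate_sketch (uint64_t val, unsigned elt_bits)
{
  static const uint64_t mul[] = {
    0x0000000100000001ull,      /* 32-bit elements.  */
    0x0001000100010001ull,      /* 16-bit elements.  */
    0x0101010101010101ull,      /*  8-bit elements.  */
    0x1111111111111111ull,      /*  4-bit elements.  */
    0x5555555555555555ull,      /*  2-bit elements.  */
  };
  val &= (UINT64_C (1) << elt_bits) - 1;
  /* __builtin_clz maps 32, 16, 8, 4, 2 to indexes 0..4.  */
  return val * mul[__builtin_clz (elt_bits) - 26];
}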
5061 /* Return true if val is a valid bitmask immediate. */
5064 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
5066 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
5069 /* Check for a single sequence of one bits and return quickly if so.
5070 The special cases of all ones and all zeroes return false. */
5071 val = aarch64_replicate_bitmask_imm (val_in, mode);
5072 tmp = val + (val & -val);
5074 if (tmp == (tmp & -tmp))
5075 return (val + 1) > 1;
5077 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
5079 val = (val << 32) | (val & 0xffffffff);
5081 /* Invert if the immediate doesn't start with a zero bit - this means we
5082 only need to search for sequences of one bits. */
5086 /* Find the first set bit and set tmp to val with the first sequence of one
5087 bits removed. Return success if there is a single sequence of ones. */
5088 first_one = val & -val;
5089 tmp = val & (val + first_one);
5094 /* Find the next set bit and compute the difference in bit position. */
5095 next_one = tmp & -tmp;
5096 bits = clz_hwi (first_one) - clz_hwi (next_one);
5099 /* Check the bit position difference is a power of 2, and that the first
5100 sequence of one bits fits within 'bits' bits. */
5101 if ((mask >> bits) != 0 || bits != (bits & -bits))
5104 /* Check the sequence of one bits is repeated 64/bits times. */
5105 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
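/* Editorial sketch (not part of GCC): a direct, unoptimised definition of
   an AArch64 logical (bitmask) immediate, useful for understanding the fast
   test above.  A value qualifies when it is a single contiguous run of
   ones, possibly rotated, replicated across the register at a power-of-two
   element size, excluding all-zeros and all-ones.  */

#include <stdbool.h>
#include <stdint.h>

static bool
bitmask_imm_sketch (uint64_t val)
{
  if (val == 0 || val == ~UINT64_C (0))
    return false;

  for (unsigned size = 2; size <= 64; size *= 2)
    {
      uint64_t mask = size == 64 ? ~UINT64_C (0) : (UINT64_C (1) << size) - 1;
      uint64_t elt = val & mask;
      bool repeats = true;

      /* The element must repeat across the whole 64-bit value...  */
      for (unsigned i = size; i < 64 && repeats; i += size)
        repeats = ((val >> i) & mask) == elt;
      if (!repeats)
        continue;

      /* ...and some rotation of it must be a run of ones anchored at
         bit 0, i.e. of the form 0...01...1.  */
      for (unsigned rot = 0; rot < size; rot++)
        {
          uint64_t r = rot == 0
                       ? elt
                       : ((elt >> rot) | (elt << (size - rot))) & mask;
          if (r != 0 && (r & (r + 1)) == 0)
            return true;
        }
    }
  return false;
}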
5108 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
5109 Assumed precondition: VAL_IN is not zero. */
5111 unsigned HOST_WIDE_INT
5112 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5114 int lowest_bit_set = ctz_hwi (val_in);
5115 int highest_bit_set = floor_log2 (val_in);
5116 gcc_assert (val_in != 0);
5118 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5119 (HOST_WIDE_INT_1U << lowest_bit_set));
5122 /* Create constant where bits outside of lowest bit set to highest bit set
5125 unsigned HOST_WIDE_INT
5126 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5128 return val_in | ~aarch64_and_split_imm1 (val_in);
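/* Editorial sketch (not part of GCC): how the two helpers above combine.
   IMM1 is a solid run of ones from VAL's lowest to highest set bit and IMM2
   sets every bit outside that run, so IMM1 & IMM2 == VAL and an AND with
   VAL can be split into two ANDs whenever both halves are themselves valid
   bitmask immediates.  Helper names are illustrative only.  */

#include <assert.h>
#include <stdint.h>

static uint64_t
and_split_imm1_sketch (uint64_t val)    /* Precondition: val != 0.  */
{
  unsigned low = __builtin_ctzll (val);
  unsigned high = 63 - __builtin_clzll (val);
  return (UINT64_C (2) << high) - (UINT64_C (1) << low);
}

static uint64_t
and_split_imm2_sketch (uint64_t val)
{
  return val | ~and_split_imm1_sketch (val);
}

static void
and_split_check_sketch (uint64_t x, uint64_t val)
{
  /* (x & imm1) & imm2 computes the same result as x & val.  */
  assert ((x & and_split_imm1_sketch (val) & and_split_imm2_sketch (val))
          == (x & val));
}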
5131 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
5134 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in
, machine_mode mode
)
5136 scalar_int_mode int_mode
;
5137 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
5140 if (aarch64_bitmask_imm (val_in
, int_mode
))
5143 if (aarch64_move_imm (val_in
, int_mode
))
5146 unsigned HOST_WIDE_INT imm2
= aarch64_and_split_imm2 (val_in
);
5148 return aarch64_bitmask_imm (imm2
, int_mode
);
5151 /* Return true if val is an immediate that can be loaded into a
5152 register in a single instruction. */
5154 aarch64_move_imm (HOST_WIDE_INT val
, machine_mode mode
)
5156 scalar_int_mode int_mode
;
5157 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
5160 if (aarch64_movw_imm (val
, int_mode
) || aarch64_movw_imm (~val
, int_mode
))
5162 return aarch64_bitmask_imm (val
, int_mode
);
5166 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED
, rtx x
)
5170 if (GET_CODE (x
) == HIGH
)
5173 /* There's no way to calculate VL-based values using relocations. */
5174 subrtx_iterator::array_type array
;
5175 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
5176 if (GET_CODE (*iter
) == CONST_POLY_INT
)
5179 split_const (x
, &base
, &offset
);
5180 if (GET_CODE (base
) == SYMBOL_REF
|| GET_CODE (base
) == LABEL_REF
)
5182 if (aarch64_classify_symbol (base
, INTVAL (offset
))
5183 != SYMBOL_FORCE_TO_MEM
)
5186 /* Avoid generating a 64-bit relocation in ILP32; leave
5187 to aarch64_expand_mov_immediate to handle it properly. */
5188 return mode
!= ptr_mode
;
5191 return aarch64_tls_referenced_p (x
);
5194 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5195 The expansion for a table switch is quite expensive due to the number
5196 of instructions, the table lookup and hard to predict indirect jump.
5197 When optimizing for speed with -O3 enabled, use the per-core tuning if
5198 set, otherwise use tables for > 16 cases as a tradeoff between size and
5199 performance. When optimizing for size, use the default setting. */
5202 aarch64_case_values_threshold (void)
5204 /* Use the specified limit for the number of cases before using jump
5205 tables at higher optimization levels. */
5207 && selected_cpu
->tune
->max_case_values
!= 0)
5208 return selected_cpu
->tune
->max_case_values
;
5210 return optimize_size
? default_case_values_threshold () : 17;
5213 /* Return true if register REGNO is a valid index register.
5214 STRICT_P is true if REG_OK_STRICT is in effect. */
5217 aarch64_regno_ok_for_index_p (int regno
, bool strict_p
)
5219 if (!HARD_REGISTER_NUM_P (regno
))
5227 regno
= reg_renumber
[regno
];
5229 return GP_REGNUM_P (regno
);
5232 /* Return true if register REGNO is a valid base register for mode MODE.
5233 STRICT_P is true if REG_OK_STRICT is in effect. */
5236 aarch64_regno_ok_for_base_p (int regno
, bool strict_p
)
5238 if (!HARD_REGISTER_NUM_P (regno
))
5246 regno
= reg_renumber
[regno
];
5249 /* The fake registers will be eliminated to either the stack or
5250 hard frame pointer, both of which are usually valid base registers.
5251 Reload deals with the cases where the eliminated form isn't valid. */
5252 return (GP_REGNUM_P (regno
)
5253 || regno
== SP_REGNUM
5254 || regno
== FRAME_POINTER_REGNUM
5255 || regno
== ARG_POINTER_REGNUM
);
5258 /* Return true if X is a valid base register for mode MODE.
5259 STRICT_P is true if REG_OK_STRICT is in effect. */
5262 aarch64_base_register_rtx_p (rtx x
, bool strict_p
)
5265 && GET_CODE (x
) == SUBREG
5266 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (x
))])
5269 return (REG_P (x
) && aarch64_regno_ok_for_base_p (REGNO (x
), strict_p
));
5272 /* Return true if address offset is a valid index. If it is, fill in INFO
5273 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
5276 aarch64_classify_index (struct aarch64_address_info
*info
, rtx x
,
5277 machine_mode mode
, bool strict_p
)
5279 enum aarch64_address_type type
;
5284 if ((REG_P (x
) || GET_CODE (x
) == SUBREG
)
5285 && GET_MODE (x
) == Pmode
)
5287 type
= ADDRESS_REG_REG
;
5291 /* (sign_extend:DI (reg:SI)) */
5292 else if ((GET_CODE (x
) == SIGN_EXTEND
5293 || GET_CODE (x
) == ZERO_EXTEND
)
5294 && GET_MODE (x
) == DImode
5295 && GET_MODE (XEXP (x
, 0)) == SImode
)
5297 type
= (GET_CODE (x
) == SIGN_EXTEND
)
5298 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
5299 index
= XEXP (x
, 0);
5302 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5303 else if (GET_CODE (x
) == MULT
5304 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
5305 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
5306 && GET_MODE (XEXP (x
, 0)) == DImode
5307 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
5308 && CONST_INT_P (XEXP (x
, 1)))
5310 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
5311 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
5312 index
= XEXP (XEXP (x
, 0), 0);
5313 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
5315 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5316 else if (GET_CODE (x
) == ASHIFT
5317 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
5318 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
5319 && GET_MODE (XEXP (x
, 0)) == DImode
5320 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
5321 && CONST_INT_P (XEXP (x
, 1)))
5323 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
5324 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
5325 index
= XEXP (XEXP (x
, 0), 0);
5326 shift
= INTVAL (XEXP (x
, 1));
5328 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5329 else if ((GET_CODE (x
) == SIGN_EXTRACT
5330 || GET_CODE (x
) == ZERO_EXTRACT
)
5331 && GET_MODE (x
) == DImode
5332 && GET_CODE (XEXP (x
, 0)) == MULT
5333 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
5334 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
5336 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
5337 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
5338 index
= XEXP (XEXP (x
, 0), 0);
5339 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
5340 if (INTVAL (XEXP (x
, 1)) != 32 + shift
5341 || INTVAL (XEXP (x
, 2)) != 0)
5344 /* (and:DI (mult:DI (reg:DI) (const_int scale))
5345 (const_int 0xffffffff<<shift)) */
5346 else if (GET_CODE (x
) == AND
5347 && GET_MODE (x
) == DImode
5348 && GET_CODE (XEXP (x
, 0)) == MULT
5349 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
5350 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
5351 && CONST_INT_P (XEXP (x
, 1)))
5353 type
= ADDRESS_REG_UXTW
;
5354 index
= XEXP (XEXP (x
, 0), 0);
5355 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
5356 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
5359 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5360 else if ((GET_CODE (x
) == SIGN_EXTRACT
5361 || GET_CODE (x
) == ZERO_EXTRACT
)
5362 && GET_MODE (x
) == DImode
5363 && GET_CODE (XEXP (x
, 0)) == ASHIFT
5364 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
5365 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
5367 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
5368 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
5369 index
= XEXP (XEXP (x
, 0), 0);
5370 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
5371 if (INTVAL (XEXP (x
, 1)) != 32 + shift
5372 || INTVAL (XEXP (x
, 2)) != 0)
5375 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5376 (const_int 0xffffffff<<shift)) */
5377 else if (GET_CODE (x
) == AND
5378 && GET_MODE (x
) == DImode
5379 && GET_CODE (XEXP (x
, 0)) == ASHIFT
5380 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
5381 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
5382 && CONST_INT_P (XEXP (x
, 1)))
5384 type
= ADDRESS_REG_UXTW
;
5385 index
= XEXP (XEXP (x
, 0), 0);
5386 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
5387 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
5390 /* (mult:P (reg:P) (const_int scale)) */
5391 else if (GET_CODE (x
) == MULT
5392 && GET_MODE (x
) == Pmode
5393 && GET_MODE (XEXP (x
, 0)) == Pmode
5394 && CONST_INT_P (XEXP (x
, 1)))
5396 type
= ADDRESS_REG_REG
;
5397 index
= XEXP (x
, 0);
5398 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
5400 /* (ashift:P (reg:P) (const_int shift)) */
5401 else if (GET_CODE (x
) == ASHIFT
5402 && GET_MODE (x
) == Pmode
5403 && GET_MODE (XEXP (x
, 0)) == Pmode
5404 && CONST_INT_P (XEXP (x
, 1)))
5406 type
= ADDRESS_REG_REG
;
5407 index
= XEXP (x
, 0);
5408 shift
= INTVAL (XEXP (x
, 1));
5414 && GET_CODE (index
) == SUBREG
5415 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (index
))])
5416 index
= SUBREG_REG (index
);
5418 if (aarch64_sve_data_mode_p (mode
))
5420 if (type
!= ADDRESS_REG_REG
5421 || (1 << shift
) != GET_MODE_UNIT_SIZE (mode
))
5427 && !(IN_RANGE (shift
, 1, 3)
5428 && known_eq (1 << shift
, GET_MODE_SIZE (mode
))))
5433 && aarch64_regno_ok_for_index_p (REGNO (index
), strict_p
))
5436 info
->offset
= index
;
5437 info
->shift
= shift
;
5444 /* Return true if MODE is one of the modes for which we
5445 support LDP/STP operations. */
5448 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode
)
5450 return mode
== SImode
|| mode
== DImode
5451 || mode
== SFmode
|| mode
== DFmode
5452 || (aarch64_vector_mode_supported_p (mode
)
5453 && known_eq (GET_MODE_SIZE (mode
), 8));
5456 /* Return true if REGNO is a virtual pointer register, or an eliminable
5457 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
5458 include stack_pointer or hard_frame_pointer. */
5460 virt_or_elim_regno_p (unsigned regno
)
5462 return ((regno
>= FIRST_VIRTUAL_REGISTER
5463 && regno
<= LAST_VIRTUAL_POINTER_REGISTER
)
5464 || regno
== FRAME_POINTER_REGNUM
5465 || regno
== ARG_POINTER_REGNUM
);
5468 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5469 If it is, fill in INFO appropriately. STRICT_P is true if
5470 REG_OK_STRICT is in effect. */
5473 aarch64_classify_address (struct aarch64_address_info
*info
,
5474 rtx x
, machine_mode mode
, bool strict_p
,
5475 aarch64_addr_query_type type
= ADDR_QUERY_M
)
5477 enum rtx_code code
= GET_CODE (x
);
5481 HOST_WIDE_INT const_size
;
5483 /* On BE, we use load/store pair for all large int mode load/stores.
5484 TI/TFmode may also use a load/store pair. */
5485 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
5486 bool advsimd_struct_p
= (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
));
5487 bool load_store_pair_p
= (type
== ADDR_QUERY_LDP_STP
5490 || (BYTES_BIG_ENDIAN
&& advsimd_struct_p
));
5492 bool allow_reg_index_p
= (!load_store_pair_p
5493 && (known_lt (GET_MODE_SIZE (mode
), 16)
5494 || vec_flags
== VEC_ADVSIMD
5495 || vec_flags
== VEC_SVE_DATA
));
5497 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5498 [Rn, #offset, MUL VL]. */
5499 if ((vec_flags
& (VEC_SVE_DATA
| VEC_SVE_PRED
)) != 0
5500 && (code
!= REG
&& code
!= PLUS
))
5503 /* On LE, for AdvSIMD, don't support anything other than POST_INC or REG addressing. */
5505 if (advsimd_struct_p
5506 && !BYTES_BIG_ENDIAN
5507 && (code
!= POST_INC
&& code
!= REG
))
5510 gcc_checking_assert (GET_MODE (x
) == VOIDmode
5511 || SCALAR_INT_MODE_P (GET_MODE (x
)));
5517 info
->type
= ADDRESS_REG_IMM
;
5519 info
->offset
= const0_rtx
;
5520 info
->const_offset
= 0;
5521 return aarch64_base_register_rtx_p (x
, strict_p
);
5529 && virt_or_elim_regno_p (REGNO (op0
))
5530 && poly_int_rtx_p (op1
, &offset
))
5532 info
->type
= ADDRESS_REG_IMM
;
5535 info
->const_offset
= offset
;
5540 if (maybe_ne (GET_MODE_SIZE (mode
), 0)
5541 && aarch64_base_register_rtx_p (op0
, strict_p
)
5542 && poly_int_rtx_p (op1
, &offset
))
5544 info
->type
= ADDRESS_REG_IMM
;
5547 info
->const_offset
= offset
;
5549 /* TImode and TFmode values are allowed in both pairs of X
5550 registers and individual Q registers. The available
5552 X,X: 7-bit signed scaled offset
5553 Q: 9-bit signed offset
5554 We conservatively require an offset representable in either mode.
5555 When performing the check for pairs of X registers i.e. LDP/STP
5556 pass down DImode since that is the natural size of the LDP/STP
5557 instruction memory accesses. */
5558 if (mode
== TImode
|| mode
== TFmode
)
5559 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
5560 && (offset_9bit_signed_unscaled_p (mode
, offset
)
5561 || offset_12bit_unsigned_scaled_p (mode
, offset
)));
5563 /* A 7bit offset check because OImode will emit a ldp/stp
5564 instruction (only big endian will get here).
5565 For ldp/stp instructions, the offset is scaled for the size of a
5566 single element of the pair. */
5568 return aarch64_offset_7bit_signed_scaled_p (TImode
, offset
);
5570 /* Three 9/12 bit offsets checks because CImode will emit three
5571 ldr/str instructions (only big endian will get here). */
5573 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
5574 && (offset_9bit_signed_unscaled_p (V16QImode
, offset
+ 32)
5575 || offset_12bit_unsigned_scaled_p (V16QImode
,
5578 /* Two 7bit offsets checks because XImode will emit two ldp/stp
5579 instructions (only big endian will get here). */
5581 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
5582 && aarch64_offset_7bit_signed_scaled_p (TImode
,
5585 /* Make "m" use the LD1 offset range for SVE data modes, so
5586 that pre-RTL optimizers like ivopts will work to that
5587 instead of the wider LDR/STR range. */
5588 if (vec_flags
== VEC_SVE_DATA
)
5589 return (type
== ADDR_QUERY_M
5590 ? offset_4bit_signed_scaled_p (mode
, offset
)
5591 : offset_9bit_signed_scaled_p (mode
, offset
));
5593 if (vec_flags
== VEC_SVE_PRED
)
5594 return offset_9bit_signed_scaled_p (mode
, offset
);
5596 if (load_store_pair_p
)
5597 return ((known_eq (GET_MODE_SIZE (mode
), 4)
5598 || known_eq (GET_MODE_SIZE (mode
), 8))
5599 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
5601 return (offset_9bit_signed_unscaled_p (mode
, offset
)
5602 || offset_12bit_unsigned_scaled_p (mode
, offset
));
5605 if (allow_reg_index_p
)
5607 /* Look for base + (scaled/extended) index register. */
5608 if (aarch64_base_register_rtx_p (op0
, strict_p
)
5609 && aarch64_classify_index (info
, op1
, mode
, strict_p
))
5614 if (aarch64_base_register_rtx_p (op1
, strict_p
)
5615 && aarch64_classify_index (info
, op0
, mode
, strict_p
))
5628 info
->type
= ADDRESS_REG_WB
;
5629 info
->base
= XEXP (x
, 0);
5630 info
->offset
= NULL_RTX
;
5631 return aarch64_base_register_rtx_p (info
->base
, strict_p
);
5635 info
->type
= ADDRESS_REG_WB
;
5636 info
->base
= XEXP (x
, 0);
5637 if (GET_CODE (XEXP (x
, 1)) == PLUS
5638 && poly_int_rtx_p (XEXP (XEXP (x
, 1), 1), &offset
)
5639 && rtx_equal_p (XEXP (XEXP (x
, 1), 0), info
->base
)
5640 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
5642 info
->offset
= XEXP (XEXP (x
, 1), 1);
5643 info
->const_offset
= offset
;
5645 /* TImode and TFmode values are allowed in both pairs of X
5646 registers and individual Q registers. The available
5648 X,X: 7-bit signed scaled offset
5649 Q: 9-bit signed offset
5650 We conservatively require an offset representable in either mode.
5652 if (mode
== TImode
|| mode
== TFmode
)
5653 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
5654 && offset_9bit_signed_unscaled_p (mode
, offset
));
5656 if (load_store_pair_p
)
5657 return ((known_eq (GET_MODE_SIZE (mode
), 4)
5658 || known_eq (GET_MODE_SIZE (mode
), 8))
5659 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
5661 return offset_9bit_signed_unscaled_p (mode
, offset
);
5668 /* load literal: pc-relative constant pool entry. Only supported
5669 for SI mode or larger. */
5670 info
->type
= ADDRESS_SYMBOLIC
;
5672 if (!load_store_pair_p
5673 && GET_MODE_SIZE (mode
).is_constant (&const_size
)
5678 split_const (x
, &sym
, &addend
);
5679 return ((GET_CODE (sym
) == LABEL_REF
5680 || (GET_CODE (sym
) == SYMBOL_REF
5681 && CONSTANT_POOL_ADDRESS_P (sym
)
5682 && aarch64_pcrelative_literal_loads
)));
5687 info
->type
= ADDRESS_LO_SUM
;
5688 info
->base
= XEXP (x
, 0);
5689 info
->offset
= XEXP (x
, 1);
5690 if (allow_reg_index_p
5691 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
5694 split_const (info
->offset
, &sym
, &offs
);
5695 if (GET_CODE (sym
) == SYMBOL_REF
5696 && (aarch64_classify_symbol (sym
, INTVAL (offs
))
5697 == SYMBOL_SMALL_ABSOLUTE
))
5699 /* The symbol and offset must be aligned to the access size. */
5702 if (CONSTANT_POOL_ADDRESS_P (sym
))
5703 align
= GET_MODE_ALIGNMENT (get_pool_mode (sym
));
5704 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym
))
5706 tree exp
= SYMBOL_REF_DECL (sym
);
5707 align
= TYPE_ALIGN (TREE_TYPE (exp
));
5708 align
= aarch64_constant_alignment (exp
, align
);
5710 else if (SYMBOL_REF_DECL (sym
))
5711 align
= DECL_ALIGN (SYMBOL_REF_DECL (sym
));
5712 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym
)
5713 && SYMBOL_REF_BLOCK (sym
) != NULL
)
5714 align
= SYMBOL_REF_BLOCK (sym
)->alignment
;
5716 align
= BITS_PER_UNIT
;
5718 poly_int64 ref_size
= GET_MODE_SIZE (mode
);
5719 if (known_eq (ref_size
, 0))
5720 ref_size
= GET_MODE_SIZE (DImode
);
5722 return (multiple_p (INTVAL (offs
), ref_size
)
5723 && multiple_p (align
/ BITS_PER_UNIT
, ref_size
));
5733 /* Return true if the address X is valid for a PRFM instruction.
5734 STRICT_P is true if we should do strict checking with
5735 aarch64_classify_address. */
5738 aarch64_address_valid_for_prefetch_p (rtx x
, bool strict_p
)
5740 struct aarch64_address_info addr
;
5742 /* PRFM accepts the same addresses as DImode... */
5743 bool res
= aarch64_classify_address (&addr
, x
, DImode
, strict_p
);
5747 /* ... except writeback forms. */
5748 return addr
.type
!= ADDRESS_REG_WB
;
5752 aarch64_symbolic_address_p (rtx x
)
5756 split_const (x
, &x
, &offset
);
5757 return GET_CODE (x
) == SYMBOL_REF
|| GET_CODE (x
) == LABEL_REF
;
5760 /* Classify the base of symbolic expression X. */
5762 enum aarch64_symbol_type
5763 aarch64_classify_symbolic_expression (rtx x
)
5767 split_const (x
, &x
, &offset
);
5768 return aarch64_classify_symbol (x
, INTVAL (offset
));
5772 /* Return TRUE if X is a legitimate address for accessing memory in
5775 aarch64_legitimate_address_hook_p (machine_mode mode
, rtx x
, bool strict_p
)
5777 struct aarch64_address_info addr
;
5779 return aarch64_classify_address (&addr
, x
, mode
, strict_p
);
5782 /* Return TRUE if X is a legitimate address of type TYPE for accessing
5783 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
5785 aarch64_legitimate_address_p (machine_mode mode
, rtx x
, bool strict_p
,
5786 aarch64_addr_query_type type
)
5788 struct aarch64_address_info addr
;
5790 return aarch64_classify_address (&addr
, x
, mode
, strict_p
, type
);
5793 /* Split an out-of-range address displacement into a base and offset.
5794 Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
5795 to increase opportunities for sharing the base address of different sizes.
5796 Unaligned accesses use the signed 9-bit range, TImode/TFmode use
5797 the intersection of signed scaled 7-bit and signed 9-bit offset. */
5799 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
5802 if (GET_MODE_SIZE (mode).is_constant (&size))
5804 HOST_WIDE_INT offset = INTVAL (*disp);
5807 if (mode == TImode || mode == TFmode)
5808 base = (offset + 0x100) & ~0x1f8;
5809 else if ((offset & (size - 1)) != 0)
5810 base = (offset + 0x100) & ~0x1ff;
5812 base = offset & ~(size < 4 ? 0xfff : 0x3ffc);
5814 *off = GEN_INT (base);
5815 *disp = GEN_INT (offset - base);
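/* Editorial sketch (not part of GCC): the displacement split above with the
   rtx plumbing removed.  OFFSET is the full displacement and SIZE the access
   size in bytes (assumed to be a power of two); *ANCHOR receives the part
   folded into the base address and *REST the small remainder left on the
   access itself.  The names and the IS_TI_TF flag are illustrative stand-ins
   for the mode checks.  */

#include <stdint.h>

static void
split_displacement_sketch (int64_t offset, int64_t size, int is_ti_tf,
                           int64_t *anchor, int64_t *rest)
{
  if (is_ti_tf)
    *anchor = (offset + 0x100) & ~INT64_C (0x1f8);   /* 7-bit scaled and 9-bit.  */
  else if ((offset & (size - 1)) != 0)
    *anchor = (offset + 0x100) & ~INT64_C (0x1ff);   /* Unaligned: 9-bit range.  */
  else
    *anchor = offset & ~(size < 4 ? INT64_C (0xfff) : INT64_C (0x3ffc));
  *rest = offset - *anchor;
}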
5821 /* Return the binary representation of floating point constant VALUE in INTVAL.
5822 If the value cannot be converted, return false without setting INTVAL.
5823 The conversion is done in the given MODE. */
5825 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
5828 /* We make a general exception for 0. */
5829 if (aarch64_float_const_zero_rtx_p (value))
5835 scalar_float_mode mode;
5836 if (GET_CODE (value) != CONST_DOUBLE
5837 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
5838 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
5839 /* Only support up to DF mode. */
5840 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
5843 unsigned HOST_WIDE_INT ival = 0;
5846 real_to_target (res,
5847 CONST_DOUBLE_REAL_VALUE (value),
5848 REAL_MODE_FORMAT (mode));
5852 int order = BYTES_BIG_ENDIAN ? 1 : 0;
5853 ival = zext_hwi (res[order], 32);
5854 ival |= (zext_hwi (res[1 - order], 32) << 32);
5857 ival = zext_hwi (res[0], 32);
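/* Editorial sketch (not part of GCC): as a mental model, the value being
   extracted above is simply the IEEE bit pattern of the constant.  On the
   host that pattern could be obtained for a double with a plain memcpy, as
   below; the routine above instead goes through real_to_target so that the
   target's floating-point format and endianness are respected.  */

#include <stdint.h>
#include <string.h>

static uint64_t
double_bits_sketch (double d)
{
  uint64_t bits;
  memcpy (&bits, &d, sizeof bits);      /* Well-defined way to type-pun.  */
  return bits;
}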
5863 /* Return TRUE if rtx X is an immediate constant that can be moved using a
5864 single MOV(+MOVK) followed by an FMOV. */
5866 aarch64_float_const_rtx_p (rtx x
)
5868 machine_mode mode
= GET_MODE (x
);
5869 if (mode
== VOIDmode
)
5872 /* Determine whether it's cheaper to write float constants as
5873 mov/movk pairs over ldr/adrp pairs. */
5874 unsigned HOST_WIDE_INT ival
;
5876 if (GET_CODE (x
) == CONST_DOUBLE
5877 && SCALAR_FLOAT_MODE_P (mode
)
5878 && aarch64_reinterpret_float_as_int (x
, &ival
))
5880 scalar_int_mode imode
= (mode
== HFmode
5882 : int_mode_for_mode (mode
).require ());
5883 int num_instr
= aarch64_internal_mov_immediate
5884 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
5885 return num_instr
< 3;
5891 /* Return TRUE if rtx X is immediate constant 0.0 */
5893 aarch64_float_const_zero_rtx_p (rtx x
)
5895 if (GET_MODE (x
) == VOIDmode
)
5898 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x
)))
5899 return !HONOR_SIGNED_ZEROS (GET_MODE (x
));
5900 return real_equal (CONST_DOUBLE_REAL_VALUE (x
), &dconst0
);
5903 /* Return TRUE if rtx X is immediate constant that fits in a single
5904 MOVI immediate operation. */
5906 aarch64_can_const_movi_rtx_p (rtx x
, machine_mode mode
)
5912 scalar_int_mode imode
;
5913 unsigned HOST_WIDE_INT ival
;
5915 if (GET_CODE (x
) == CONST_DOUBLE
5916 && SCALAR_FLOAT_MODE_P (mode
))
5918 if (!aarch64_reinterpret_float_as_int (x
, &ival
))
5921 /* We make a general exception for 0. */
5922 if (aarch64_float_const_zero_rtx_p (x
))
5925 imode
= int_mode_for_mode (mode
).require ();
5927 else if (GET_CODE (x
) == CONST_INT
5928 && is_a
<scalar_int_mode
> (mode
, &imode
))
5933 /* use a 64 bit mode for everything except for DI/DF mode, where we use
5934 a 128 bit vector mode. */
5935 int width
= GET_MODE_BITSIZE (imode
) == 64 ? 128 : 64;
5937 vmode
= aarch64_simd_container_mode (imode
, width
);
5938 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, ival
);
5940 return aarch64_simd_valid_immediate (v_op
, NULL
);
5944 /* Return the fixed registers used for condition codes. */
5947 aarch64_fixed_condition_code_regs (unsigned int *p1
, unsigned int *p2
)
5950 *p2
= INVALID_REGNUM
;
5954 /* This function is used by the call expanders of the machine description.
5955 RESULT is the register in which the result is returned. It's NULL for
5956 "call" and "sibcall".
5957 MEM is the location of the function call.
5958 SIBCALL indicates whether this function call is a normal call or a sibling call.
5959 It will generate a different pattern accordingly. */
5962 aarch64_expand_call (rtx result
, rtx mem
, bool sibcall
)
5964 rtx call
, callee
, tmp
;
5968 gcc_assert (MEM_P (mem
));
5969 callee
= XEXP (mem
, 0);
5970 mode
= GET_MODE (callee
);
5971 gcc_assert (mode
== Pmode
);
5973 /* Decide if we should generate indirect calls by loading the
5974 address of the callee into a register before performing
5975 the branch-and-link. */
5976 if (SYMBOL_REF_P (callee
)
5977 ? (aarch64_is_long_call_p (callee
)
5978 || aarch64_is_noplt_call_p (callee
))
5980 XEXP (mem
, 0) = force_reg (mode
, callee
);
5982 call
= gen_rtx_CALL (VOIDmode
, mem
, const0_rtx
);
5984 if (result
!= NULL_RTX
)
5985 call
= gen_rtx_SET (result
, call
);
5990 tmp
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (Pmode
, LR_REGNUM
));
5992 vec
= gen_rtvec (2, call
, tmp
);
5993 call
= gen_rtx_PARALLEL (VOIDmode
, vec
);
5995 aarch64_emit_call_insn (call
);
5998 /* Emit call insn with PAT and do aarch64-specific handling. */
6001 aarch64_emit_call_insn (rtx pat
)
6003 rtx insn
= emit_call_insn (pat
);
6005 rtx
*fusage
= &CALL_INSN_FUNCTION_USAGE (insn
);
6006 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP0_REGNUM
));
6007 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP1_REGNUM
));
6011 aarch64_select_cc_mode (RTX_CODE code
, rtx x
, rtx y
)
6013 /* All floating point compares return CCFP if it is an equality
6014 comparison, and CCFPE otherwise. */
6015 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_FLOAT
)
6042 /* Equality comparisons of short modes against zero can be performed
6043 using the TST instruction with the appropriate bitmask. */
6044 if (y
== const0_rtx
&& REG_P (x
)
6045 && (code
== EQ
|| code
== NE
)
6046 && (GET_MODE (x
) == HImode
|| GET_MODE (x
) == QImode
))
6049 /* Similarly, comparisons of zero_extends from shorter modes can
6050 be performed using an ANDS with an immediate mask. */
6051 if (y
== const0_rtx
&& GET_CODE (x
) == ZERO_EXTEND
6052 && (GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
6053 && (GET_MODE (XEXP (x
, 0)) == HImode
|| GET_MODE (XEXP (x
, 0)) == QImode
)
6054 && (code
== EQ
|| code
== NE
))
6057 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
6059 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
)
6060 && (GET_CODE (x
) == PLUS
|| GET_CODE (x
) == MINUS
|| GET_CODE (x
) == AND
6061 || GET_CODE (x
) == NEG
6062 || (GET_CODE (x
) == ZERO_EXTRACT
&& CONST_INT_P (XEXP (x
, 1))
6063 && CONST_INT_P (XEXP (x
, 2)))))
6066 /* A compare with a shifted operand. Because of canonicalization,
6067 the comparison will have to be swapped when we emit the assembly
6069 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
6070 && (REG_P (y
) || GET_CODE (y
) == SUBREG
|| y
== const0_rtx
)
6071 && (GET_CODE (x
) == ASHIFT
|| GET_CODE (x
) == ASHIFTRT
6072 || GET_CODE (x
) == LSHIFTRT
6073 || GET_CODE (x
) == ZERO_EXTEND
|| GET_CODE (x
) == SIGN_EXTEND
))
6076 /* Similarly for a negated operand, but we can only do this for
6078 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
6079 && (REG_P (y
) || GET_CODE (y
) == SUBREG
)
6080 && (code
== EQ
|| code
== NE
)
6081 && GET_CODE (x
) == NEG
)
6084 /* A test for unsigned overflow. */
6085 if ((GET_MODE (x
) == DImode
|| GET_MODE (x
) == TImode
)
6087 && GET_CODE (x
) == PLUS
6088 && GET_CODE (y
) == ZERO_EXTEND
)
6091 /* For everything else, return CCmode. */
6096 aarch64_get_condition_code_1 (machine_mode
, enum rtx_code
);
6099 aarch64_get_condition_code (rtx x
)
6101 machine_mode mode
= GET_MODE (XEXP (x
, 0));
6102 enum rtx_code comp_code
= GET_CODE (x
);
6104 if (GET_MODE_CLASS (mode
) != MODE_CC
)
6105 mode
= SELECT_CC_MODE (comp_code
, XEXP (x
, 0), XEXP (x
, 1));
6106 return aarch64_get_condition_code_1 (mode
, comp_code
);
6110 aarch64_get_condition_code_1 (machine_mode mode
, enum rtx_code comp_code
)
6118 case GE
: return AARCH64_GE
;
6119 case GT
: return AARCH64_GT
;
6120 case LE
: return AARCH64_LS
;
6121 case LT
: return AARCH64_MI
;
6122 case NE
: return AARCH64_NE
;
6123 case EQ
: return AARCH64_EQ
;
6124 case ORDERED
: return AARCH64_VC
;
6125 case UNORDERED
: return AARCH64_VS
;
6126 case UNLT
: return AARCH64_LT
;
6127 case UNLE
: return AARCH64_LE
;
6128 case UNGT
: return AARCH64_HI
;
6129 case UNGE
: return AARCH64_PL
;
6137 case NE
: return AARCH64_NE
;
6138 case EQ
: return AARCH64_EQ
;
6139 case GE
: return AARCH64_GE
;
6140 case GT
: return AARCH64_GT
;
6141 case LE
: return AARCH64_LE
;
6142 case LT
: return AARCH64_LT
;
6143 case GEU
: return AARCH64_CS
;
6144 case GTU
: return AARCH64_HI
;
6145 case LEU
: return AARCH64_LS
;
6146 case LTU
: return AARCH64_CC
;
6154 case NE
: return AARCH64_NE
;
6155 case EQ
: return AARCH64_EQ
;
6156 case GE
: return AARCH64_LE
;
6157 case GT
: return AARCH64_LT
;
6158 case LE
: return AARCH64_GE
;
6159 case LT
: return AARCH64_GT
;
6160 case GEU
: return AARCH64_LS
;
6161 case GTU
: return AARCH64_CC
;
6162 case LEU
: return AARCH64_CS
;
6163 case LTU
: return AARCH64_HI
;
6171 case NE
: return AARCH64_NE
;
6172 case EQ
: return AARCH64_EQ
;
6173 case GE
: return AARCH64_PL
;
6174 case LT
: return AARCH64_MI
;
6182 case NE
: return AARCH64_NE
;
6183 case EQ
: return AARCH64_EQ
;
6191 case NE
: return AARCH64_CS
;
6192 case EQ
: return AARCH64_CC
;
6205 aarch64_const_vec_all_same_in_range_p (rtx x
,
6206 HOST_WIDE_INT minval
,
6207 HOST_WIDE_INT maxval
)
6210 return (const_vec_duplicate_p (x
, &elt
)
6211 && CONST_INT_P (elt
)
6212 && IN_RANGE (INTVAL (elt
), minval
, maxval
));
6216 aarch64_const_vec_all_same_int_p (rtx x
, HOST_WIDE_INT val
)
6218 return aarch64_const_vec_all_same_in_range_p (x
, val
, val
);
6221 /* Return true if VEC is a constant in which every element is in the range
6222 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
6225 aarch64_const_vec_all_in_range_p (rtx vec
,
6226 HOST_WIDE_INT minval
,
6227 HOST_WIDE_INT maxval
)
6229 if (GET_CODE (vec
) != CONST_VECTOR
6230 || GET_MODE_CLASS (GET_MODE (vec
)) != MODE_VECTOR_INT
)
6234 if (!CONST_VECTOR_STEPPED_P (vec
))
6235 nunits
= const_vector_encoded_nelts (vec
);
6236 else if (!CONST_VECTOR_NUNITS (vec
).is_constant (&nunits
))
6239 for (int i
= 0; i
< nunits
; i
++)
6241 rtx vec_elem
= CONST_VECTOR_ELT (vec
, i
);
6242 if (!CONST_INT_P (vec_elem
)
6243 || !IN_RANGE (INTVAL (vec_elem
), minval
, maxval
))
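/* Example uses of the two range helpers above (values chosen purely for
   illustration): a CONST_VECTOR duplicating (const_int 3) satisfies
   aarch64_const_vec_all_same_in_range_p with MINVAL 0 and MAXVAL 7, while
   aarch64_const_vec_all_in_range_p also accepts a vector with differing
   elements such as {0, 1, 2, 3} for the same range, since the elements only
   need to lie in [MINVAL, MAXVAL], not be identical.  */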
#define AARCH64_CC_V 1
#define AARCH64_CC_C (1 << 1)
#define AARCH64_CC_Z (1 << 2)
#define AARCH64_CC_N (1 << 3)

/* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
static const int aarch64_nzcv_codes[] =
{
  0,             /* EQ, Z == 1.  */
  AARCH64_CC_Z,  /* NE, Z == 0.  */
  0,             /* CS, C == 1.  */
  AARCH64_CC_C,  /* CC, C == 0.  */
  0,             /* MI, N == 1.  */
  AARCH64_CC_N,  /* PL, N == 0.  */
  0,             /* VS, V == 1.  */
  AARCH64_CC_V,  /* VC, V == 0.  */
  0,             /* HI, C == 1 && Z == 0.  */
  AARCH64_CC_C,  /* LS, !(C == 1 && Z == 0).  */
  AARCH64_CC_V,  /* GE, N == V.  */
  0,             /* LT, N != V.  */
  AARCH64_CC_Z,  /* GT, Z == 0 && N == V.  */
  0,             /* LE, !(Z == 0 && N == V).  */
  0,             /* AL, Any.  */
  0              /* NV, Any.  */
};
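/* Reading the table above: each entry is an NZCV flag setting under which
   the indexing condition is false (the EQ entry is 0, i.e. Z clear, so EQ
   does not hold; the NE entry sets Z, so NE does not hold).  These are the
   values printed by the 'k' operand modifier for conditional compare
   instructions.  As a sketch (register numbers and condition chosen only
   for illustration), a conditional compare chained behind a preceding test
   might be emitted as

     ccmp  x1, x2, #0, <cond>
     b.eq  .Ltarget

   where the #0 immediate ensures the following EQ test fails whenever
   <cond> does not hold.  */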
6276 /* Print floating-point vector immediate operand X to F, negating it
6277 first if NEGATE is true. Return true on success, false if it isn't
6278 a constant we can handle. */
6281 aarch64_print_vector_float_operand (FILE *f
, rtx x
, bool negate
)
6285 if (!const_vec_duplicate_p (x
, &elt
))
6288 REAL_VALUE_TYPE r
= *CONST_DOUBLE_REAL_VALUE (elt
);
6290 r
= real_value_negate (&r
);
6292 /* We only handle the SVE single-bit immediates here. */
6293 if (real_equal (&r
, &dconst0
))
6294 asm_fprintf (f
, "0.0");
6295 else if (real_equal (&r
, &dconst1
))
6296 asm_fprintf (f
, "1.0");
6297 else if (real_equal (&r
, &dconsthalf
))
6298 asm_fprintf (f
, "0.5");
6305 /* Print operand X to file F in a target specific manner according to CODE.
6306 The acceptable formatting commands given by CODE are:
6307 'c': An integer or symbol address without a preceding #
6309 'C': Take the duplicated element in a vector constant
6310 and print it in hex.
6311 'D': Take the duplicated element in a vector constant
6312 and print it as an unsigned integer, in decimal.
6313 'e': Print the sign/zero-extend size as a character 8->b,
6315 'p': Prints N such that 2^N == X (X must be power of 2 and
6317 'P': Print the number of non-zero bits in X (a const_int).
6318 'H': Print the higher numbered register of a pair (TImode)
6320 'm': Print a condition (eq, ne, etc).
6321 'M': Same as 'm', but invert condition.
6322 'N': Take the duplicated element in a vector constant
6323 and print the negative of it in decimal.
6324 'b/h/s/d/q': Print a scalar FP/SIMD register name.
6325 'S/T/U/V': Print a FP/SIMD register name for a register list.
6326 The register printed is the FP/SIMD register name
6327 of X + 0/1/2/3 for S/T/U/V.
6328 'R': Print a scalar FP/SIMD register name + 1.
6329 'X': Print bottom 16 bits of integer constant in hex.
6330 'w/x': Print a general register name or the zero register
6332 '0': Print a normal operand, if it's a general register,
6333 then we assume DImode.
6334 'k': Print NZCV for conditional compare instructions.
6335 'A': Output address constant representing the first
6336 argument of X, specifying a relocation offset
6338 'L': Output constant address specified by X
6339 with a relocation offset if appropriate.
6340 'G': Prints address of X, specifying a PC relative
6341 relocation mode if appropriate.
6342 'y': Output address of LDP or STP - this is used for
6343 some LDP/STPs which don't use a PARALLEL in their
6344 pattern (so the mode needs to be adjusted).
6345 'z': Output address of a typical LDP or STP. */
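/* A few concrete examples of the modifiers above (operand numbers and
   values chosen only for illustration): for the general register x5,
   "%x0" prints "x5" and "%w0" prints "w5", while a zero operand prints the
   corresponding zero register.  For a vector constant duplicating the value
   7, "%C0" prints "0x7" and "%D0" prints "7".  For a comparison operand,
   "%m0" prints the condition name (e.g. "eq") and "%M0" its inverse
   ("ne").  */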
6348 aarch64_print_operand (FILE *f
, rtx x
, int code
)
6354 switch (GET_CODE (x
))
6357 fprintf (f
, HOST_WIDE_INT_PRINT_DEC
, INTVAL (x
));
6361 output_addr_const (f
, x
);
6365 if (GET_CODE (XEXP (x
, 0)) == PLUS
6366 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
)
6368 output_addr_const (f
, x
);
6374 output_operand_lossage ("unsupported operand for code '%c'", code
);
6382 if (!CONST_INT_P (x
)
6383 || (n
= exact_log2 (INTVAL (x
) & ~7)) <= 0)
6385 output_operand_lossage ("invalid operand for '%%%c'", code
);
6401 output_operand_lossage ("invalid operand for '%%%c'", code
);
6411 if (!CONST_INT_P (x
) || (n
= exact_log2 (INTVAL (x
))) < 0)
6413 output_operand_lossage ("invalid operand for '%%%c'", code
);
6417 asm_fprintf (f
, "%d", n
);
6422 if (!CONST_INT_P (x
))
6424 output_operand_lossage ("invalid operand for '%%%c'", code
);
6428 asm_fprintf (f
, "%u", popcount_hwi (INTVAL (x
)));
6432 if (!REG_P (x
) || !GP_REGNUM_P (REGNO (x
) + 1))
6434 output_operand_lossage ("invalid operand for '%%%c'", code
);
6438 asm_fprintf (f
, "%s", reg_names
[REGNO (x
) + 1]);
6445 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
6446 if (x
== const_true_rtx
)
6453 if (!COMPARISON_P (x
))
6455 output_operand_lossage ("invalid operand for '%%%c'", code
);
6459 cond_code
= aarch64_get_condition_code (x
);
6460 gcc_assert (cond_code
>= 0);
6462 cond_code
= AARCH64_INVERSE_CONDITION_CODE (cond_code
);
6463 fputs (aarch64_condition_codes
[cond_code
], f
);
6468 if (!const_vec_duplicate_p (x
, &elt
))
6470 output_operand_lossage ("invalid vector constant");
6474 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
6475 asm_fprintf (f
, "%wd", -INTVAL (elt
));
6476 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
6477 && aarch64_print_vector_float_operand (f
, x
, true))
6481 output_operand_lossage ("invalid vector constant");
6491 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
6493 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
6496 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - V0_REGNUM
);
6503 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
6505 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
6508 asm_fprintf (f
, "%c%d",
6509 aarch64_sve_data_mode_p (GET_MODE (x
)) ? 'z' : 'v',
6510 REGNO (x
) - V0_REGNUM
+ (code
- 'S'));
6514 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
6516 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
6519 asm_fprintf (f
, "q%d", REGNO (x
) - V0_REGNUM
+ 1);
6523 if (!CONST_INT_P (x
))
6525 output_operand_lossage ("invalid operand for '%%%c'", code
);
6528 asm_fprintf (f
, "0x%wx", UINTVAL (x
) & 0xffff);
6533 /* Print a replicated constant in hex. */
6534 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
6536 output_operand_lossage ("invalid operand for '%%%c'", code
);
6539 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
6540 asm_fprintf (f
, "0x%wx", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
6546 /* Print a replicated constant in decimal, treating it as
6548 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
6550 output_operand_lossage ("invalid operand for '%%%c'", code
);
6553 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
6554 asm_fprintf (f
, "%wd", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
6561 || (CONST_DOUBLE_P (x
) && aarch64_float_const_zero_rtx_p (x
)))
6563 asm_fprintf (f
, "%czr", code
);
6567 if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
6569 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - R0_REGNUM
);
6573 if (REG_P (x
) && REGNO (x
) == SP_REGNUM
)
6575 asm_fprintf (f
, "%ssp", code
== 'w' ? "w" : "");
6584 output_operand_lossage ("missing operand");
6588 switch (GET_CODE (x
))
6591 if (aarch64_sve_data_mode_p (GET_MODE (x
)))
6592 asm_fprintf (f
, "z%d", REGNO (x
) - V0_REGNUM
);
6594 asm_fprintf (f
, "%s", reg_names
[REGNO (x
)]);
6598 output_address (GET_MODE (x
), XEXP (x
, 0));
6603 output_addr_const (asm_out_file
, x
);
6607 asm_fprintf (f
, "%wd", INTVAL (x
));
6611 if (!VECTOR_MODE_P (GET_MODE (x
)))
6613 output_addr_const (asm_out_file
, x
);
6619 if (!const_vec_duplicate_p (x
, &elt
))
6621 output_operand_lossage ("invalid vector constant");
6625 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
6626 asm_fprintf (f
, "%wd", INTVAL (elt
));
6627 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
6628 && aarch64_print_vector_float_operand (f
, x
, false))
6632 output_operand_lossage ("invalid vector constant");
6638 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6639 be getting CONST_DOUBLEs holding integers. */
6640 gcc_assert (GET_MODE (x
) != VOIDmode
);
6641 if (aarch64_float_const_zero_rtx_p (x
))
6646 else if (aarch64_float_const_representable_p (x
))
6649 char float_buf
[buf_size
] = {'\0'};
6650 real_to_decimal_for_mode (float_buf
,
6651 CONST_DOUBLE_REAL_VALUE (x
),
6654 asm_fprintf (asm_out_file
, "%s", float_buf
);
6658 output_operand_lossage ("invalid constant");
6661 output_operand_lossage ("invalid operand");
6667 if (GET_CODE (x
) == HIGH
)
6670 switch (aarch64_classify_symbolic_expression (x
))
6672 case SYMBOL_SMALL_GOT_4G
:
6673 asm_fprintf (asm_out_file
, ":got:");
6676 case SYMBOL_SMALL_TLSGD
:
6677 asm_fprintf (asm_out_file
, ":tlsgd:");
6680 case SYMBOL_SMALL_TLSDESC
:
6681 asm_fprintf (asm_out_file
, ":tlsdesc:");
6684 case SYMBOL_SMALL_TLSIE
:
6685 asm_fprintf (asm_out_file
, ":gottprel:");
6688 case SYMBOL_TLSLE24
:
6689 asm_fprintf (asm_out_file
, ":tprel:");
6692 case SYMBOL_TINY_GOT
:
6699 output_addr_const (asm_out_file
, x
);
6703 switch (aarch64_classify_symbolic_expression (x
))
6705 case SYMBOL_SMALL_GOT_4G
:
6706 asm_fprintf (asm_out_file
, ":lo12:");
6709 case SYMBOL_SMALL_TLSGD
:
6710 asm_fprintf (asm_out_file
, ":tlsgd_lo12:");
6713 case SYMBOL_SMALL_TLSDESC
:
6714 asm_fprintf (asm_out_file
, ":tlsdesc_lo12:");
6717 case SYMBOL_SMALL_TLSIE
:
6718 asm_fprintf (asm_out_file
, ":gottprel_lo12:");
6721 case SYMBOL_TLSLE12
:
6722 asm_fprintf (asm_out_file
, ":tprel_lo12:");
6725 case SYMBOL_TLSLE24
:
6726 asm_fprintf (asm_out_file
, ":tprel_lo12_nc:");
6729 case SYMBOL_TINY_GOT
:
6730 asm_fprintf (asm_out_file
, ":got:");
6733 case SYMBOL_TINY_TLSIE
:
6734 asm_fprintf (asm_out_file
, ":gottprel:");
6740 output_addr_const (asm_out_file
, x
);
6744 switch (aarch64_classify_symbolic_expression (x
))
6746 case SYMBOL_TLSLE24
:
6747 asm_fprintf (asm_out_file
, ":tprel_hi12:");
6752 output_addr_const (asm_out_file
, x
);
6757 HOST_WIDE_INT cond_code
;
6759 if (!CONST_INT_P (x
))
6761 output_operand_lossage ("invalid operand for '%%%c'", code
);
6765 cond_code
= INTVAL (x
);
6766 gcc_assert (cond_code
>= 0 && cond_code
<= AARCH64_NV
);
6767 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
]);
6774 machine_mode mode
= GET_MODE (x
);
6776 if (GET_CODE (x
) != MEM
6777 || (code
== 'y' && maybe_ne (GET_MODE_SIZE (mode
), 16)))
6779 output_operand_lossage ("invalid operand for '%%%c'", code
);
6784 /* LDP/STP which uses a single double-width memory operand.
6785 Adjust the mode to appear like a typical LDP/STP.
6786 Currently this is supported for 16-byte accesses only. */
6789 if (!aarch64_print_ldpstp_address (f
, mode
, XEXP (x
, 0)))
6790 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
6795 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
6800 /* Print address 'x' of a memory access with mode 'mode'.
6801 'op' is the context required by aarch64_classify_address. It can either be
6802 MEM for a normal memory access or PARALLEL for LDP/STP. */
6804 aarch64_print_address_internal (FILE *f
, machine_mode mode
, rtx x
,
6805 aarch64_addr_query_type type
)
6807 struct aarch64_address_info addr
;
6810 /* Check all addresses are Pmode - including ILP32. */
6811 gcc_assert (GET_MODE (x
) == Pmode
);
6813 if (aarch64_classify_address (&addr
, x
, mode
, true, type
))
6816 case ADDRESS_REG_IMM
:
6817 if (known_eq (addr
.const_offset
, 0))
6818 asm_fprintf (f
, "[%s]", reg_names
[REGNO (addr
.base
)]);
6819 else if (aarch64_sve_data_mode_p (mode
))
6822 = exact_div (addr
.const_offset
,
6823 BYTES_PER_SVE_VECTOR
).to_constant ();
6824 asm_fprintf (f
, "[%s, #%wd, mul vl]",
6825 reg_names
[REGNO (addr
.base
)], vnum
);
6827 else if (aarch64_sve_pred_mode_p (mode
))
6830 = exact_div (addr
.const_offset
,
6831 BYTES_PER_SVE_PRED
).to_constant ();
6832 asm_fprintf (f
, "[%s, #%wd, mul vl]",
6833 reg_names
[REGNO (addr
.base
)], vnum
);
6836 asm_fprintf (f
, "[%s, %wd]", reg_names
[REGNO (addr
.base
)],
6837 INTVAL (addr
.offset
));
6840 case ADDRESS_REG_REG
:
6841 if (addr
.shift
== 0)
6842 asm_fprintf (f
, "[%s, %s]", reg_names
[REGNO (addr
.base
)],
6843 reg_names
[REGNO (addr
.offset
)]);
6845 asm_fprintf (f
, "[%s, %s, lsl %u]", reg_names
[REGNO (addr
.base
)],
6846 reg_names
[REGNO (addr
.offset
)], addr
.shift
);
6849 case ADDRESS_REG_UXTW
:
6850 if (addr
.shift
== 0)
6851 asm_fprintf (f
, "[%s, w%d, uxtw]", reg_names
[REGNO (addr
.base
)],
6852 REGNO (addr
.offset
) - R0_REGNUM
);
6854 asm_fprintf (f
, "[%s, w%d, uxtw %u]", reg_names
[REGNO (addr
.base
)],
6855 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
6858 case ADDRESS_REG_SXTW
:
6859 if (addr
.shift
== 0)
6860 asm_fprintf (f
, "[%s, w%d, sxtw]", reg_names
[REGNO (addr
.base
)],
6861 REGNO (addr
.offset
) - R0_REGNUM
);
6863 asm_fprintf (f
, "[%s, w%d, sxtw %u]", reg_names
[REGNO (addr
.base
)],
6864 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
6867 case ADDRESS_REG_WB
:
6868 /* Writeback is only supported for fixed-width modes. */
6869 size
= GET_MODE_SIZE (mode
).to_constant ();
6870 switch (GET_CODE (x
))
6873 asm_fprintf (f
, "[%s, %d]!", reg_names
[REGNO (addr
.base
)], size
);
6876 asm_fprintf (f
, "[%s], %d", reg_names
[REGNO (addr
.base
)], size
);
6879 asm_fprintf (f
, "[%s, -%d]!", reg_names
[REGNO (addr
.base
)], size
);
6882 asm_fprintf (f
, "[%s], -%d", reg_names
[REGNO (addr
.base
)], size
);
6885 asm_fprintf (f
, "[%s, %wd]!", reg_names
[REGNO (addr
.base
)],
6886 INTVAL (addr
.offset
));
6889 asm_fprintf (f
, "[%s], %wd", reg_names
[REGNO (addr
.base
)],
6890 INTVAL (addr
.offset
));
6897 case ADDRESS_LO_SUM
:
6898 asm_fprintf (f
, "[%s, #:lo12:", reg_names
[REGNO (addr
.base
)]);
6899 output_addr_const (f
, addr
.offset
);
6900 asm_fprintf (f
, "]");
6903 case ADDRESS_SYMBOLIC
:
6904 output_addr_const (f
, x
);
6911 /* Print address 'x' of a LDP/STP with mode 'mode'. */
6913 aarch64_print_ldpstp_address (FILE *f
, machine_mode mode
, rtx x
)
6915 return aarch64_print_address_internal (f
, mode
, x
, ADDR_QUERY_LDP_STP
);
6918 /* Print address 'x' of a memory access with mode 'mode'. */
6920 aarch64_print_operand_address (FILE *f
, machine_mode mode
, rtx x
)
6922 if (!aarch64_print_address_internal (f
, mode
, x
, ADDR_QUERY_ANY
))
6923 output_addr_const (f
, x
);
6927 aarch64_label_mentioned_p (rtx x
)
6932 if (GET_CODE (x
) == LABEL_REF
)
6935 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
6936 referencing instruction, but they are constant offsets, not
6938 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
6941 fmt
= GET_RTX_FORMAT (GET_CODE (x
));
6942 for (i
= GET_RTX_LENGTH (GET_CODE (x
)) - 1; i
>= 0; i
--)
6948 for (j
= XVECLEN (x
, i
) - 1; j
>= 0; j
--)
6949 if (aarch64_label_mentioned_p (XVECEXP (x
, i
, j
)))
6952 else if (fmt
[i
] == 'e' && aarch64_label_mentioned_p (XEXP (x
, i
)))
6959 /* Implement REGNO_REG_CLASS. */
6962 aarch64_regno_regclass (unsigned regno
)
6964 if (GP_REGNUM_P (regno
))
6965 return GENERAL_REGS
;
6967 if (regno
== SP_REGNUM
)
6970 if (regno
== FRAME_POINTER_REGNUM
6971 || regno
== ARG_POINTER_REGNUM
)
6972 return POINTER_REGS
;
6974 if (FP_REGNUM_P (regno
))
6975 return FP_LO_REGNUM_P (regno
) ? FP_LO_REGS
: FP_REGS
;
6977 if (PR_REGNUM_P (regno
))
6978 return PR_LO_REGNUM_P (regno
) ? PR_LO_REGS
: PR_HI_REGS
;
/* OFFSET is an address offset for mode MODE, which has SIZE bytes.
   If OFFSET is out of range, return an offset of an anchor point
   that is in range.  Return 0 otherwise.  */

static HOST_WIDE_INT
aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
                       machine_mode mode)
{
  /* Does it look like we'll need a 16-byte load/store-pair operation?  */
  if (size > 16)
    return (offset + 0x400) & ~0x7f0;

  /* For offsets that aren't a multiple of the access size, the limit is
     -256...255.  */
  if (offset & (size - 1))
    {
      /* BLKmode typically uses LDP of X-registers.  */
      if (mode == BLKmode)
        return (offset + 512) & ~0x3ff;
      return (offset + 0x100) & ~0x1ff;
    }

  /* Small negative offsets are supported.  */
  if (IN_RANGE (offset, -256, 0))
    return 0;

  if (mode == TImode || mode == TFmode)
    return (offset + 0x100) & ~0x1ff;

  /* Use 12-bit offset by access size.  */
  return offset & (~0xfff * size);
}
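/* Worked example for the function above (numbers chosen only for
   illustration): a 4-byte access at offset 0x21008 reaches the final
   return, and 0x21008 & (~0xfff * 4) == 0x20000, so the caller re-bases
   the address at an anchor 0x20000 bytes in and keeps a residual offset of
   0x1008, which fits the scaled unsigned 12-bit immediate of a 32-bit
   LDR/STR (0x1008 / 4 == 1026 <= 4095).  */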
7017 aarch64_legitimize_address (rtx x
, rtx
/* orig_x */, machine_mode mode
)
7019 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7020 where mask is selected by alignment and size of the offset.
7021 We try to pick as large a range for the offset as possible to
7022 maximize the chance of a CSE. However, for aligned addresses
7023 we limit the range to 4k so that structures with different sized
7024 elements are likely to use the same base. We need to be careful
7025 not to split a CONST for some forms of address expression, otherwise
7026 it will generate sub-optimal code. */
7028 if (GET_CODE (x
) == PLUS
&& CONST_INT_P (XEXP (x
, 1)))
7030 rtx base
= XEXP (x
, 0);
7031 rtx offset_rtx
= XEXP (x
, 1);
7032 HOST_WIDE_INT offset
= INTVAL (offset_rtx
);
7034 if (GET_CODE (base
) == PLUS
)
7036 rtx op0
= XEXP (base
, 0);
7037 rtx op1
= XEXP (base
, 1);
7039 /* Force any scaling into a temp for CSE. */
7040 op0
= force_reg (Pmode
, op0
);
7041 op1
= force_reg (Pmode
, op1
);
7043 /* Let the pointer register be in op0. */
7044 if (REG_POINTER (op1
))
7045 std::swap (op0
, op1
);
7047 /* If the pointer is virtual or frame related, then we know that
7048 virtual register instantiation or register elimination is going
7049 to apply a second constant. We want the two constants folded
7050 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
7051 if (virt_or_elim_regno_p (REGNO (op0
)))
7053 base
= expand_binop (Pmode
, add_optab
, op0
, offset_rtx
,
7054 NULL_RTX
, true, OPTAB_DIRECT
);
7055 return gen_rtx_PLUS (Pmode
, base
, op1
);
7058 /* Otherwise, in order to encourage CSE (and thence loop strength
7059 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
7060 base
= expand_binop (Pmode
, add_optab
, op0
, op1
,
7061 NULL_RTX
, true, OPTAB_DIRECT
);
7062 x
= gen_rtx_PLUS (Pmode
, base
, offset_rtx
);
7066 if (GET_MODE_SIZE (mode
).is_constant (&size
))
7068 HOST_WIDE_INT base_offset
= aarch64_anchor_offset (offset
, size
,
7070 if (base_offset
!= 0)
7072 base
= plus_constant (Pmode
, base
, base_offset
);
7073 base
= force_operand (base
, NULL_RTX
);
7074 return plus_constant (Pmode
, base
, offset
- base_offset
);
7082 /* Return the reload icode required for a constant pool in mode. */
7083 static enum insn_code
7084 aarch64_constant_pool_reload_icode (machine_mode mode
)
7089 return CODE_FOR_aarch64_reload_movcpsfdi
;
7092 return CODE_FOR_aarch64_reload_movcpdfdi
;
7095 return CODE_FOR_aarch64_reload_movcptfdi
;
7098 return CODE_FOR_aarch64_reload_movcpv8qidi
;
7101 return CODE_FOR_aarch64_reload_movcpv16qidi
;
7104 return CODE_FOR_aarch64_reload_movcpv4hidi
;
7107 return CODE_FOR_aarch64_reload_movcpv8hidi
;
7110 return CODE_FOR_aarch64_reload_movcpv2sidi
;
7113 return CODE_FOR_aarch64_reload_movcpv4sidi
;
7116 return CODE_FOR_aarch64_reload_movcpv2didi
;
7119 return CODE_FOR_aarch64_reload_movcpv2dfdi
;
7128 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED
, rtx x
,
7131 secondary_reload_info
*sri
)
7133 if (BYTES_BIG_ENDIAN
7134 && reg_class_subset_p (rclass
, FP_REGS
)
7135 && (MEM_P (x
) || (REG_P (x
) && !HARD_REGISTER_P (x
)))
7136 && aarch64_sve_data_mode_p (mode
))
7138 sri
->icode
= CODE_FOR_aarch64_sve_reload_be
;
7142 /* If we have to disable direct literal pool loads and stores because the
7143 function is too big, then we need a scratch register. */
7144 if (MEM_P (x
) && GET_CODE (x
) == SYMBOL_REF
&& CONSTANT_POOL_ADDRESS_P (x
)
7145 && (SCALAR_FLOAT_MODE_P (GET_MODE (x
))
7146 || targetm
.vector_mode_supported_p (GET_MODE (x
)))
7147 && !aarch64_pcrelative_literal_loads
)
7149 sri
->icode
= aarch64_constant_pool_reload_icode (mode
);
7153 /* Without the TARGET_SIMD instructions we cannot move a Q register
7154 to a Q register directly. We need a scratch. */
7155 if (REG_P (x
) && (mode
== TFmode
|| mode
== TImode
) && mode
== GET_MODE (x
)
7156 && FP_REGNUM_P (REGNO (x
)) && !TARGET_SIMD
7157 && reg_class_subset_p (rclass
, FP_REGS
))
7160 sri
->icode
= CODE_FOR_aarch64_reload_movtf
;
7161 else if (mode
== TImode
)
7162 sri
->icode
= CODE_FOR_aarch64_reload_movti
;
7166 /* A TFmode or TImode memory access should be handled via an FP_REGS
7167 because AArch64 has richer addressing modes for LDR/STR instructions
7168 than LDP/STP instructions. */
7169 if (TARGET_FLOAT
&& rclass
== GENERAL_REGS
7170 && known_eq (GET_MODE_SIZE (mode
), 16) && MEM_P (x
))
7173 if (rclass
== FP_REGS
&& (mode
== TImode
|| mode
== TFmode
) && CONSTANT_P(x
))
7174 return GENERAL_REGS
;
bool
aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
{
  gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);

  /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
     can only eliminate to HARD_FRAME_POINTER_REGNUM.  */
  if (frame_pointer_needed)
    return to == HARD_FRAME_POINTER_REGNUM;

  return true;
}
7192 aarch64_initial_elimination_offset (unsigned from
, unsigned to
)
7194 aarch64_layout_frame ();
7196 if (to
== HARD_FRAME_POINTER_REGNUM
)
7198 if (from
== ARG_POINTER_REGNUM
)
7199 return cfun
->machine
->frame
.hard_fp_offset
;
7201 if (from
== FRAME_POINTER_REGNUM
)
7202 return cfun
->machine
->frame
.hard_fp_offset
7203 - cfun
->machine
->frame
.locals_offset
;
7206 if (to
== STACK_POINTER_REGNUM
)
7208 if (from
== FRAME_POINTER_REGNUM
)
7209 return cfun
->machine
->frame
.frame_size
7210 - cfun
->machine
->frame
.locals_offset
;
7213 return cfun
->machine
->frame
.frame_size
;
/* Implement RETURN_ADDR_RTX.  We do not support moving back to a
   previous frame.  */

rtx
aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
{
  if (count != 0)
    return const0_rtx;
  return get_hard_reg_initial_val (Pmode, LR_REGNUM);
}
static void
aarch64_asm_trampoline_template (FILE *f)
{
  if (TARGET_ILP32)
    {
      asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
      asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
    }
  else
    {
      asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
      asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
    }
  asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
  assemble_aligned_integer (4, const0_rtx);
  assemble_aligned_integer (POINTER_BYTES, const0_rtx);
  assemble_aligned_integer (POINTER_BYTES, const0_rtx);
}
static void
aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
{
  rtx fnaddr, mem, a_tramp;
  const int tramp_code_sz = 16;

  /* Don't need to copy the trailing D-words, we fill those in below.  */
  emit_block_move (m_tramp, assemble_trampoline_template (),
                   GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
  mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
  fnaddr = XEXP (DECL_RTL (fndecl), 0);
  if (GET_MODE (fnaddr) != ptr_mode)
    fnaddr = convert_memory_address (ptr_mode, fnaddr);
  emit_move_insn (mem, fnaddr);

  mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
  emit_move_insn (mem, chain_value);

  /* XXX We should really define a "clear_cache" pattern and use
     gen_clear_cache().  */
  a_tramp = XEXP (m_tramp, 0);
  emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
                     LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
                     plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
                     ptr_mode);
}
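/* For reference, the trampoline built by the two functions above has the
   following layout in the LP64 case (a sketch; the offsets follow from
   tramp_code_sz == 16 and POINTER_BYTES == 8, and x17/x18 are IP1 and the
   static chain register in this port):

     offset  0:  ldr  x17, .+16    // load the target function's address
     offset  4:  ldr  x18, .+20    // load the static chain value
     offset  8:  br   x17
     offset 12:  4 bytes of zero padding
     offset 16:  <address of the nested function, filled in by
                  aarch64_trampoline_init>
     offset 24:  <static chain value>  */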
7274 static unsigned char
7275 aarch64_class_max_nregs (reg_class_t regclass
, machine_mode mode
)
7277 /* ??? Logically we should only need to provide a value when
7278 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7279 can hold MODE, but at the moment we need to handle all modes.
7280 Just ignore any runtime parts for registers that can't store them. */
7281 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
7285 case CALLER_SAVE_REGS
:
7289 case POINTER_AND_FP_REGS
:
7292 if (aarch64_sve_data_mode_p (mode
)
7293 && constant_multiple_p (GET_MODE_SIZE (mode
),
7294 BYTES_PER_SVE_VECTOR
, &nregs
))
7296 return (aarch64_vector_data_mode_p (mode
)
7297 ? CEIL (lowest_size
, UNITS_PER_VREG
)
7298 : CEIL (lowest_size
, UNITS_PER_WORD
));
7315 aarch64_preferred_reload_class (rtx x
, reg_class_t regclass
)
7317 if (regclass
== POINTER_REGS
)
7318 return GENERAL_REGS
;
7320 if (regclass
== STACK_REG
)
7323 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x
)), POINTER_REGS
))
  /* Register elimination can result in a request for
     SP+constant->FP_REGS.  We cannot support such operations which
     use SP as source and an FP_REG as destination, so reject them
     right away.  */
7333 if (! reg_class_subset_p (regclass
, GENERAL_REGS
) && GET_CODE (x
) == PLUS
)
7335 rtx lhs
= XEXP (x
, 0);
7337 /* Look through a possible SUBREG introduced by ILP32. */
7338 if (GET_CODE (lhs
) == SUBREG
)
7339 lhs
= SUBREG_REG (lhs
);
7341 gcc_assert (REG_P (lhs
));
7342 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs
)),
void
aarch64_asm_output_labelref (FILE* f, const char *name)
{
  asm_fprintf (f, "%U%s", name);
}
7357 aarch64_elf_asm_constructor (rtx symbol
, int priority
)
7359 if (priority
== DEFAULT_INIT_PRIORITY
)
7360 default_ctor_section_asm_out_constructor (symbol
, priority
);
7364 /* While priority is known to be in range [0, 65535], so 18 bytes
7365 would be enough, the compiler might not know that. To avoid
7366 -Wformat-truncation false positive, use a larger size. */
7368 snprintf (buf
, sizeof (buf
), ".init_array.%.5u", priority
);
7369 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
7370 switch_to_section (s
);
7371 assemble_align (POINTER_SIZE
);
7372 assemble_aligned_integer (POINTER_BYTES
, symbol
);
7377 aarch64_elf_asm_destructor (rtx symbol
, int priority
)
7379 if (priority
== DEFAULT_INIT_PRIORITY
)
7380 default_dtor_section_asm_out_destructor (symbol
, priority
);
7384 /* While priority is known to be in range [0, 65535], so 18 bytes
7385 would be enough, the compiler might not know that. To avoid
7386 -Wformat-truncation false positive, use a larger size. */
7388 snprintf (buf
, sizeof (buf
), ".fini_array.%.5u", priority
);
7389 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
7390 switch_to_section (s
);
7391 assemble_align (POINTER_SIZE
);
7392 assemble_aligned_integer (POINTER_BYTES
, symbol
);
7397 aarch64_output_casesi (rtx
*operands
)
7401 rtx diff_vec
= PATTERN (NEXT_INSN (as_a
<rtx_insn
*> (operands
[2])));
7403 static const char *const patterns
[4][2] =
7406 "ldrb\t%w3, [%0,%w1,uxtw]",
7407 "add\t%3, %4, %w3, sxtb #2"
7410 "ldrh\t%w3, [%0,%w1,uxtw #1]",
7411 "add\t%3, %4, %w3, sxth #2"
7414 "ldr\t%w3, [%0,%w1,uxtw #2]",
7415 "add\t%3, %4, %w3, sxtw #2"
7417 /* We assume that DImode is only generated when not optimizing and
7418 that we don't really need 64-bit address offsets. That would
7419 imply an object file with 8GB of code in a single function! */
7421 "ldr\t%w3, [%0,%w1,uxtw #2]",
7422 "add\t%3, %4, %w3, sxtw #2"
7426 gcc_assert (GET_CODE (diff_vec
) == ADDR_DIFF_VEC
);
7428 scalar_int_mode mode
= as_a
<scalar_int_mode
> (GET_MODE (diff_vec
));
7429 index
= exact_log2 (GET_MODE_SIZE (mode
));
7431 gcc_assert (index
>= 0 && index
<= 3);
  /* Need to implement table size reduction, by changing the code below.  */
7434 output_asm_insn (patterns
[index
][0], operands
);
7435 ASM_GENERATE_INTERNAL_LABEL (label
, "Lrtx", CODE_LABEL_NUMBER (operands
[2]));
7436 snprintf (buf
, sizeof (buf
),
7437 "adr\t%%4, %s", targetm
.strip_name_encoding (label
));
7438 output_asm_insn (buf
, operands
);
7439 output_asm_insn (patterns
[index
][1], operands
);
7440 output_asm_insn ("br\t%3", operands
);
7441 assemble_label (asm_out_file
, label
);
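/* For illustration (register numbers and the label name are made up), with
   a HImode dispatch table the routine above emits a sequence of the form

     ldrh  w3, [x0, w1, uxtw #1]
     adr   x4, .Lrtx4
     add   x3, x4, w3, sxth #2
     br    x3
   .Lrtx4:

   i.e. the table entry is loaded, sign-extended, shifted left by two and
   added to the address of the label emitted just after the branch.  */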
/* Return size in bits of an arithmetic operand which is shifted/scaled and
   masked such that it is suitable for a UXTB, UXTH, or UXTW extend
   operator.  */

int
aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
{
  if (shift >= 0 && shift <= 3)
    {
      int size;
      for (size = 8; size <= 32; size *= 2)
        {
          HOST_WIDE_INT bits = ((HOST_WIDE_INT) 1U << size) - 1;
          if (mask == bits << shift)
            return size;
        }
    }
  return 0;
}
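/* Example for the helper above (values picked for illustration): with
   SHIFT == 2 and MASK == 0x3fc the size-8 iteration matches, because
   0xff << 2 == 0x3fc, so the function returns 8 and the operand is
   suitable for a "uxtb #2" extended-register form.  */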
7466 /* Constant pools are per function only when PC relative
7467 literal loads are true or we are in the large memory
7471 aarch64_can_use_per_function_literal_pools_p (void)
7473 return (aarch64_pcrelative_literal_loads
7474 || aarch64_cmodel
== AARCH64_CMODEL_LARGE
);
7478 aarch64_use_blocks_for_constant_p (machine_mode
, const_rtx
)
  /* Fixme:: In an ideal world this would work similarly
     to the logic in aarch64_select_rtx_section, but this
     breaks bootstrap in gccgo.  For now we work around
     this by returning false here.  */
7487 /* Select appropriate section for constants depending
7488 on where we place literal pools. */
7491 aarch64_select_rtx_section (machine_mode mode
,
7493 unsigned HOST_WIDE_INT align
)
7495 if (aarch64_can_use_per_function_literal_pools_p ())
7496 return function_section (current_function_decl
);
7498 return default_elf_select_rtx_section (mode
, x
, align
);
7501 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
7503 aarch64_asm_output_pool_epilogue (FILE *f
, const char *, tree
,
7504 HOST_WIDE_INT offset
)
7506 /* When using per-function literal pools, we must ensure that any code
7507 section is aligned to the minimal instruction length, lest we get
7508 errors from the assembler re "unaligned instructions". */
7509 if ((offset
& 3) && aarch64_can_use_per_function_literal_pools_p ())
7510 ASM_OUTPUT_ALIGN (f
, 2);
7515 /* Helper function for rtx cost calculation. Strip a shift expression
7516 from X. Returns the inner operand if successful, or the original
7517 expression on failure. */
7519 aarch64_strip_shift (rtx x
)
7523 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7524 we can convert both to ROR during final output. */
7525 if ((GET_CODE (op
) == ASHIFT
7526 || GET_CODE (op
) == ASHIFTRT
7527 || GET_CODE (op
) == LSHIFTRT
7528 || GET_CODE (op
) == ROTATERT
7529 || GET_CODE (op
) == ROTATE
)
7530 && CONST_INT_P (XEXP (op
, 1)))
7531 return XEXP (op
, 0);
7533 if (GET_CODE (op
) == MULT
7534 && CONST_INT_P (XEXP (op
, 1))
7535 && ((unsigned) exact_log2 (INTVAL (XEXP (op
, 1)))) < 64)
7536 return XEXP (op
, 0);
7541 /* Helper function for rtx cost calculation. Strip an extend
7542 expression from X. Returns the inner operand if successful, or the
7543 original expression on failure. We deal with a number of possible
7544 canonicalization variations here. If STRIP_SHIFT is true, then
7545 we can strip off a shift also. */
7547 aarch64_strip_extend (rtx x
, bool strip_shift
)
7549 scalar_int_mode mode
;
7552 if (!is_a
<scalar_int_mode
> (GET_MODE (op
), &mode
))
7555 /* Zero and sign extraction of a widened value. */
7556 if ((GET_CODE (op
) == ZERO_EXTRACT
|| GET_CODE (op
) == SIGN_EXTRACT
)
7557 && XEXP (op
, 2) == const0_rtx
7558 && GET_CODE (XEXP (op
, 0)) == MULT
7559 && aarch64_is_extend_from_extract (mode
, XEXP (XEXP (op
, 0), 1),
7561 return XEXP (XEXP (op
, 0), 0);
7563 /* It can also be represented (for zero-extend) as an AND with an
7565 if (GET_CODE (op
) == AND
7566 && GET_CODE (XEXP (op
, 0)) == MULT
7567 && CONST_INT_P (XEXP (XEXP (op
, 0), 1))
7568 && CONST_INT_P (XEXP (op
, 1))
7569 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op
, 0), 1))),
7570 INTVAL (XEXP (op
, 1))) != 0)
7571 return XEXP (XEXP (op
, 0), 0);
7573 /* Now handle extended register, as this may also have an optional
7574 left shift by 1..4. */
7576 && GET_CODE (op
) == ASHIFT
7577 && CONST_INT_P (XEXP (op
, 1))
7578 && ((unsigned HOST_WIDE_INT
) INTVAL (XEXP (op
, 1))) <= 4)
7581 if (GET_CODE (op
) == ZERO_EXTEND
7582 || GET_CODE (op
) == SIGN_EXTEND
)
7591 /* Return true iff CODE is a shift supported in combination
7592 with arithmetic instructions. */
7595 aarch64_shift_p (enum rtx_code code
)
7597 return code
== ASHIFT
|| code
== ASHIFTRT
|| code
== LSHIFTRT
;
7601 /* Return true iff X is a cheap shift without a sign extend. */
7604 aarch64_cheap_mult_shift_p (rtx x
)
7611 if (!(aarch64_tune_params
.extra_tuning_flags
7612 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
))
7615 if (GET_CODE (op0
) == SIGN_EXTEND
)
7618 if (GET_CODE (x
) == ASHIFT
&& CONST_INT_P (op1
)
7619 && UINTVAL (op1
) <= 4)
7622 if (GET_CODE (x
) != MULT
|| !CONST_INT_P (op1
))
7625 HOST_WIDE_INT l2
= exact_log2 (INTVAL (op1
));
7627 if (l2
> 0 && l2
<= 4)
7633 /* Helper function for rtx cost calculation. Calculate the cost of
7634 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7635 Return the calculated cost of the expression, recursing manually in to
7636 operands where needed. */
7639 aarch64_rtx_mult_cost (rtx x
, enum rtx_code code
, int outer
, bool speed
)
7642 const struct cpu_cost_table
*extra_cost
7643 = aarch64_tune_params
.insn_extra_cost
;
7645 bool compound_p
= (outer
== PLUS
|| outer
== MINUS
);
7646 machine_mode mode
= GET_MODE (x
);
7648 gcc_checking_assert (code
== MULT
);
7653 if (VECTOR_MODE_P (mode
))
7654 mode
= GET_MODE_INNER (mode
);
7656 /* Integer multiply/fma. */
7657 if (GET_MODE_CLASS (mode
) == MODE_INT
)
7659 /* The multiply will be canonicalized as a shift, cost it as such. */
7660 if (aarch64_shift_p (GET_CODE (x
))
7661 || (CONST_INT_P (op1
)
7662 && exact_log2 (INTVAL (op1
)) > 0))
7664 bool is_extend
= GET_CODE (op0
) == ZERO_EXTEND
7665 || GET_CODE (op0
) == SIGN_EXTEND
;
7670 /* If the shift is considered cheap,
7671 then don't add any cost. */
7672 if (aarch64_cheap_mult_shift_p (x
))
7674 else if (REG_P (op1
))
7675 /* ARITH + shift-by-register. */
7676 cost
+= extra_cost
->alu
.arith_shift_reg
;
7678 /* ARITH + extended register. We don't have a cost field
7679 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
7680 cost
+= extra_cost
->alu
.extend_arith
;
7682 /* ARITH + shift-by-immediate. */
7683 cost
+= extra_cost
->alu
.arith_shift
;
7686 /* LSL (immediate). */
7687 cost
+= extra_cost
->alu
.shift
;
7690 /* Strip extends as we will have costed them in the case above. */
7692 op0
= aarch64_strip_extend (op0
, true);
7694 cost
+= rtx_cost (op0
, VOIDmode
, code
, 0, speed
);
7699 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
7700 compound and let the below cases handle it. After all, MNEG is a
7701 special-case alias of MSUB. */
7702 if (GET_CODE (op0
) == NEG
)
7704 op0
= XEXP (op0
, 0);
7708 /* Integer multiplies or FMAs have zero/sign extending variants. */
7709 if ((GET_CODE (op0
) == ZERO_EXTEND
7710 && GET_CODE (op1
) == ZERO_EXTEND
)
7711 || (GET_CODE (op0
) == SIGN_EXTEND
7712 && GET_CODE (op1
) == SIGN_EXTEND
))
7714 cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, MULT
, 0, speed
);
7715 cost
+= rtx_cost (XEXP (op1
, 0), VOIDmode
, MULT
, 1, speed
);
7720 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
7721 cost
+= extra_cost
->mult
[0].extend_add
;
7723 /* MUL/SMULL/UMULL. */
7724 cost
+= extra_cost
->mult
[0].extend
;
7730 /* This is either an integer multiply or a MADD. In both cases
7731 we want to recurse and cost the operands. */
7732 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
7733 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
7739 cost
+= extra_cost
->mult
[mode
== DImode
].add
;
7742 cost
+= extra_cost
->mult
[mode
== DImode
].simple
;
7751 /* Floating-point FMA/FMUL can also support negations of the
7752 operands, unless the rounding mode is upward or downward in
7753 which case FNMUL is different than FMUL with operand negation. */
7754 bool neg0
= GET_CODE (op0
) == NEG
;
7755 bool neg1
= GET_CODE (op1
) == NEG
;
7756 if (compound_p
|| !flag_rounding_math
|| (neg0
&& neg1
))
7759 op0
= XEXP (op0
, 0);
7761 op1
= XEXP (op1
, 0);
7765 /* FMADD/FNMADD/FNMSUB/FMSUB. */
7766 cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
7769 cost
+= extra_cost
->fp
[mode
== DFmode
].mult
;
7772 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
7773 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
7779 aarch64_address_cost (rtx x
,
7781 addr_space_t as ATTRIBUTE_UNUSED
,
7784 enum rtx_code c
= GET_CODE (x
);
7785 const struct cpu_addrcost_table
*addr_cost
= aarch64_tune_params
.addr_cost
;
7786 struct aarch64_address_info info
;
7790 if (!aarch64_classify_address (&info
, x
, mode
, false))
7792 if (GET_CODE (x
) == CONST
|| GET_CODE (x
) == SYMBOL_REF
)
7794 /* This is a CONST or SYMBOL ref which will be split
7795 in a different way depending on the code model in use.
7796 Cost it through the generic infrastructure. */
7797 int cost_symbol_ref
= rtx_cost (x
, Pmode
, MEM
, 1, speed
);
7798 /* Divide through by the cost of one instruction to
7799 bring it to the same units as the address costs. */
7800 cost_symbol_ref
/= COSTS_N_INSNS (1);
7801 /* The cost is then the cost of preparing the address,
7802 followed by an immediate (possibly 0) offset. */
7803 return cost_symbol_ref
+ addr_cost
->imm_offset
;
7807 /* This is most likely a jump table from a case
7809 return addr_cost
->register_offset
;
7815 case ADDRESS_LO_SUM
:
7816 case ADDRESS_SYMBOLIC
:
7817 case ADDRESS_REG_IMM
:
7818 cost
+= addr_cost
->imm_offset
;
7821 case ADDRESS_REG_WB
:
7822 if (c
== PRE_INC
|| c
== PRE_DEC
|| c
== PRE_MODIFY
)
7823 cost
+= addr_cost
->pre_modify
;
7824 else if (c
== POST_INC
|| c
== POST_DEC
|| c
== POST_MODIFY
)
7825 cost
+= addr_cost
->post_modify
;
7831 case ADDRESS_REG_REG
:
7832 cost
+= addr_cost
->register_offset
;
7835 case ADDRESS_REG_SXTW
:
7836 cost
+= addr_cost
->register_sextend
;
7839 case ADDRESS_REG_UXTW
:
7840 cost
+= addr_cost
->register_zextend
;
7850 /* For the sake of calculating the cost of the shifted register
7851 component, we can treat same sized modes in the same way. */
7852 if (known_eq (GET_MODE_BITSIZE (mode
), 16))
7853 cost
+= addr_cost
->addr_scale_costs
.hi
;
7854 else if (known_eq (GET_MODE_BITSIZE (mode
), 32))
7855 cost
+= addr_cost
->addr_scale_costs
.si
;
7856 else if (known_eq (GET_MODE_BITSIZE (mode
), 64))
7857 cost
+= addr_cost
->addr_scale_costs
.di
;
7859 /* We can't tell, or this is a 128-bit vector. */
7860 cost
+= addr_cost
->addr_scale_costs
.ti
;
/* Return the cost of a branch.  If SPEED_P is true then the compiler is
   optimizing for speed.  If PREDICTABLE_P is true then the branch is
   predicted to be taken.  */

int
aarch64_branch_cost (bool speed_p, bool predictable_p)
{
  /* When optimizing for speed, use the cost of unpredictable branches.  */
  const struct cpu_branch_cost *branch_costs =
    aarch64_tune_params.branch_costs;

  if (!speed_p || predictable_p)
    return branch_costs->predictable;
  else
    return branch_costs->unpredictable;
}
7883 /* Return true if the RTX X in mode MODE is a zero or sign extract
7884 usable in an ADD or SUB (extended register) instruction. */
7886 aarch64_rtx_arith_op_extract_p (rtx x
, scalar_int_mode mode
)
7888 /* Catch add with a sign extract.
7889 This is add_<optab><mode>_multp2. */
7890 if (GET_CODE (x
) == SIGN_EXTRACT
7891 || GET_CODE (x
) == ZERO_EXTRACT
)
7893 rtx op0
= XEXP (x
, 0);
7894 rtx op1
= XEXP (x
, 1);
7895 rtx op2
= XEXP (x
, 2);
7897 if (GET_CODE (op0
) == MULT
7898 && CONST_INT_P (op1
)
7899 && op2
== const0_rtx
7900 && CONST_INT_P (XEXP (op0
, 1))
7901 && aarch64_is_extend_from_extract (mode
,
7908 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
7910 else if (GET_CODE (x
) == SIGN_EXTEND
7911 || GET_CODE (x
) == ZERO_EXTEND
)
7912 return REG_P (XEXP (x
, 0));
7918 aarch64_frint_unspec_p (unsigned int u
)
7936 /* Return true iff X is an rtx that will match an extr instruction
7937 i.e. as described in the *extr<mode>5_insn family of patterns.
7938 OP0 and OP1 will be set to the operands of the shifts involved
7939 on success and will be NULL_RTX otherwise. */
7942 aarch64_extr_rtx_p (rtx x
, rtx
*res_op0
, rtx
*res_op1
)
7945 scalar_int_mode mode
;
7946 if (!is_a
<scalar_int_mode
> (GET_MODE (x
), &mode
))
7949 *res_op0
= NULL_RTX
;
7950 *res_op1
= NULL_RTX
;
7952 if (GET_CODE (x
) != IOR
)
7958 if ((GET_CODE (op0
) == ASHIFT
&& GET_CODE (op1
) == LSHIFTRT
)
7959 || (GET_CODE (op1
) == ASHIFT
&& GET_CODE (op0
) == LSHIFTRT
))
7961 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
7962 if (GET_CODE (op1
) == ASHIFT
)
7963 std::swap (op0
, op1
);
7965 if (!CONST_INT_P (XEXP (op0
, 1)) || !CONST_INT_P (XEXP (op1
, 1)))
7968 unsigned HOST_WIDE_INT shft_amnt_0
= UINTVAL (XEXP (op0
, 1));
7969 unsigned HOST_WIDE_INT shft_amnt_1
= UINTVAL (XEXP (op1
, 1));
7971 if (shft_amnt_0
< GET_MODE_BITSIZE (mode
)
7972 && shft_amnt_0
+ shft_amnt_1
== GET_MODE_BITSIZE (mode
))
7974 *res_op0
= XEXP (op0
, 0);
7975 *res_op1
= XEXP (op1
, 0);
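/* Illustrative match for the predicate above (SImode, register names made
   up, shift amounts chosen so that they sum to 32):

     (ior:SI (ashift:SI (reg x) (const_int 10))
             (lshiftrt:SI (reg y) (const_int 22)))

   is accepted with *RES_OP0 = x and *RES_OP1 = y, and corresponds to
   "extr w0, wx, wy, #22", which extracts 32 bits starting at bit 22 of the
   double-width value formed by wx:wy.  */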
7983 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
7984 storing it in *COST. Result is true if the total cost of the operation
7985 has now been calculated. */
7987 aarch64_if_then_else_costs (rtx op0
, rtx op1
, rtx op2
, int *cost
, bool speed
)
7991 enum rtx_code cmpcode
;
7993 if (COMPARISON_P (op0
))
7995 inner
= XEXP (op0
, 0);
7996 comparator
= XEXP (op0
, 1);
7997 cmpcode
= GET_CODE (op0
);
8002 comparator
= const0_rtx
;
8006 if (GET_CODE (op1
) == PC
|| GET_CODE (op2
) == PC
)
8008 /* Conditional branch. */
8009 if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
8013 if (cmpcode
== NE
|| cmpcode
== EQ
)
8015 if (comparator
== const0_rtx
)
8017 /* TBZ/TBNZ/CBZ/CBNZ. */
8018 if (GET_CODE (inner
) == ZERO_EXTRACT
)
8020 *cost
+= rtx_cost (XEXP (inner
, 0), VOIDmode
,
8021 ZERO_EXTRACT
, 0, speed
);
8024 *cost
+= rtx_cost (inner
, VOIDmode
, cmpcode
, 0, speed
);
8029 else if (cmpcode
== LT
|| cmpcode
== GE
)
8032 if (comparator
== const0_rtx
)
8037 else if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
8040 if (GET_CODE (op1
) == COMPARE
)
8042 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
8043 if (XEXP (op1
, 1) == const0_rtx
)
8047 machine_mode mode
= GET_MODE (XEXP (op1
, 0));
8048 const struct cpu_cost_table
*extra_cost
8049 = aarch64_tune_params
.insn_extra_cost
;
8051 if (GET_MODE_CLASS (mode
) == MODE_INT
)
8052 *cost
+= extra_cost
->alu
.arith
;
8054 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
8059 /* It's a conditional operation based on the status flags,
8060 so it must be some flavor of CSEL. */
8062 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
8063 if (GET_CODE (op1
) == NEG
8064 || GET_CODE (op1
) == NOT
8065 || (GET_CODE (op1
) == PLUS
&& XEXP (op1
, 1) == const1_rtx
))
8066 op1
= XEXP (op1
, 0);
8067 else if (GET_CODE (op1
) == ZERO_EXTEND
&& GET_CODE (op2
) == ZERO_EXTEND
)
8069 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
8070 op1
= XEXP (op1
, 0);
8071 op2
= XEXP (op2
, 0);
8074 *cost
+= rtx_cost (op1
, VOIDmode
, IF_THEN_ELSE
, 1, speed
);
8075 *cost
+= rtx_cost (op2
, VOIDmode
, IF_THEN_ELSE
, 2, speed
);
8079 /* We don't know what this is, cost all operands. */
8083 /* Check whether X is a bitfield operation of the form shift + extend that
8084 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
8085 operand to which the bitfield operation is applied. Otherwise return
8089 aarch64_extend_bitfield_pattern_p (rtx x
)
8091 rtx_code outer_code
= GET_CODE (x
);
8092 machine_mode outer_mode
= GET_MODE (x
);
8094 if (outer_code
!= ZERO_EXTEND
&& outer_code
!= SIGN_EXTEND
8095 && outer_mode
!= SImode
&& outer_mode
!= DImode
)
8098 rtx inner
= XEXP (x
, 0);
8099 rtx_code inner_code
= GET_CODE (inner
);
8100 machine_mode inner_mode
= GET_MODE (inner
);
8106 if (CONST_INT_P (XEXP (inner
, 1))
8107 && (inner_mode
== QImode
|| inner_mode
== HImode
))
8108 op
= XEXP (inner
, 0);
8111 if (outer_code
== ZERO_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
8112 && (inner_mode
== QImode
|| inner_mode
== HImode
))
8113 op
= XEXP (inner
, 0);
8116 if (outer_code
== SIGN_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
8117 && (inner_mode
== QImode
|| inner_mode
== HImode
))
8118 op
= XEXP (inner
, 0);
/* Return true if the mask and a shift amount from an RTX of the form
   (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
   mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */

static bool
aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
                                    rtx shft_amnt)
{
  return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
         && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
         && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
         && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
}
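/* Example of a combination accepted by the predicate above (SImode, values
   chosen for illustration): with a shift amount of 8 and mask 0x00ffff00,
   (0x00ffff00 >> 8) + 1 == 0x10000 is a power of two and the low 8 bits of
   the mask are clear, so (x << 8) & 0x00ffff00 can be implemented as
   "ubfiz w0, w1, #8, #16".  */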
8141 /* Calculate the cost of calculating X, storing it in *COST. Result
8142 is true if the total cost of the operation has now been calculated. */
8144 aarch64_rtx_costs (rtx x
, machine_mode mode
, int outer ATTRIBUTE_UNUSED
,
8145 int param ATTRIBUTE_UNUSED
, int *cost
, bool speed
)
8148 const struct cpu_cost_table
*extra_cost
8149 = aarch64_tune_params
.insn_extra_cost
;
8150 int code
= GET_CODE (x
);
8151 scalar_int_mode int_mode
;
8153 /* By default, assume that everything has equivalent cost to the
8154 cheapest instruction. Any additional costs are applied as a delta
8155 above this default. */
8156 *cost
= COSTS_N_INSNS (1);
8161 /* The cost depends entirely on the operands to SET. */
8166 switch (GET_CODE (op0
))
8171 rtx address
= XEXP (op0
, 0);
8172 if (VECTOR_MODE_P (mode
))
8173 *cost
+= extra_cost
->ldst
.storev
;
8174 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
8175 *cost
+= extra_cost
->ldst
.store
;
8176 else if (mode
== SFmode
)
8177 *cost
+= extra_cost
->ldst
.storef
;
8178 else if (mode
== DFmode
)
8179 *cost
+= extra_cost
->ldst
.stored
;
8182 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
8186 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
8190 if (! REG_P (SUBREG_REG (op0
)))
8191 *cost
+= rtx_cost (SUBREG_REG (op0
), VOIDmode
, SET
, 0, speed
);
8195 /* The cost is one per vector-register copied. */
8196 if (VECTOR_MODE_P (GET_MODE (op0
)) && REG_P (op1
))
8198 int nregs
= aarch64_hard_regno_nregs (V0_REGNUM
, GET_MODE (op0
));
8199 *cost
= COSTS_N_INSNS (nregs
);
8201 /* const0_rtx is in general free, but we will use an
8202 instruction to set a register to 0. */
8203 else if (REG_P (op1
) || op1
== const0_rtx
)
8205 /* The cost is 1 per register copied. */
8206 int nregs
= aarch64_hard_regno_nregs (R0_REGNUM
, GET_MODE (op0
));
8207 *cost
= COSTS_N_INSNS (nregs
);
8210 /* Cost is just the cost of the RHS of the set. */
8211 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
8216 /* Bit-field insertion. Strip any redundant widening of
8217 the RHS to meet the width of the target. */
8218 if (GET_CODE (op1
) == SUBREG
)
8219 op1
= SUBREG_REG (op1
);
8220 if ((GET_CODE (op1
) == ZERO_EXTEND
8221 || GET_CODE (op1
) == SIGN_EXTEND
)
8222 && CONST_INT_P (XEXP (op0
, 1))
8223 && is_a
<scalar_int_mode
> (GET_MODE (XEXP (op1
, 0)), &int_mode
)
8224 && GET_MODE_BITSIZE (int_mode
) >= INTVAL (XEXP (op0
, 1)))
8225 op1
= XEXP (op1
, 0);
8227 if (CONST_INT_P (op1
))
8229 /* MOV immediate is assumed to always be cheap. */
8230 *cost
= COSTS_N_INSNS (1);
8236 *cost
+= extra_cost
->alu
.bfi
;
8237 *cost
+= rtx_cost (op1
, VOIDmode
, (enum rtx_code
) code
, 1, speed
);
8243 /* We can't make sense of this, assume default cost. */
8244 *cost
= COSTS_N_INSNS (1);
8250 /* If an instruction can incorporate a constant within the
8251 instruction, the instruction's expression avoids calling
8252 rtx_cost() on the constant. If rtx_cost() is called on a
8253 constant, then it is usually because the constant must be
8254 moved into a register by one or more instructions.
8256 The exception is constant 0, which can be expressed
8257 as XZR/WZR and is therefore free. The exception to this is
8258 if we have (set (reg) (const0_rtx)) in which case we must cost
8259 the move. However, we can catch that when we cost the SET, so
8260 we don't need to consider that here. */
8261 if (x
== const0_rtx
)
8265 /* To an approximation, building any other constant is
8266 proportionally expensive to the number of instructions
8267 required to build that constant. This is true whether we
8268 are compiling for SPEED or otherwise. */
8269 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
8270 int_mode
= word_mode
;
8271 *cost
= COSTS_N_INSNS (aarch64_internal_mov_immediate
8272 (NULL_RTX
, x
, false, int_mode
));
8278 /* First determine number of instructions to do the move
8279 as an integer constant. */
8280 if (!aarch64_float_const_representable_p (x
)
8281 && !aarch64_can_const_movi_rtx_p (x
, mode
)
8282 && aarch64_float_const_rtx_p (x
))
8284 unsigned HOST_WIDE_INT ival
;
8285 bool succeed
= aarch64_reinterpret_float_as_int (x
, &ival
);
8286 gcc_assert (succeed
);
8288 scalar_int_mode imode
= (mode
== HFmode
8290 : int_mode_for_mode (mode
).require ());
8291 int ncost
= aarch64_internal_mov_immediate
8292 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
8293 *cost
+= COSTS_N_INSNS (ncost
);
8299 /* mov[df,sf]_aarch64. */
8300 if (aarch64_float_const_representable_p (x
))
8301 /* FMOV (scalar immediate). */
8302 *cost
+= extra_cost
->fp
[mode
== DFmode
].fpconst
;
8303 else if (!aarch64_float_const_zero_rtx_p (x
))
8305 /* This will be a load from memory. */
8307 *cost
+= extra_cost
->ldst
.loadd
;
8309 *cost
+= extra_cost
->ldst
.loadf
;
8312 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8313 or MOV v0.s[0], wzr - neither of which are modeled by the
8314 cost tables. Just use the default cost. */
8324 /* For loads we want the base cost of a load, plus an
8325 approximation for the additional cost of the addressing
8327 rtx address
= XEXP (x
, 0);
8328 if (VECTOR_MODE_P (mode
))
8329 *cost
+= extra_cost
->ldst
.loadv
;
8330 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
8331 *cost
+= extra_cost
->ldst
.load
;
8332 else if (mode
== SFmode
)
8333 *cost
+= extra_cost
->ldst
.loadf
;
8334 else if (mode
== DFmode
)
8335 *cost
+= extra_cost
->ldst
.loadd
;
8338 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
8347 if (VECTOR_MODE_P (mode
))
8352 *cost
+= extra_cost
->vect
.alu
;
8357 if (GET_MODE_CLASS (mode
) == MODE_INT
)
8359 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
8360 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
8363 *cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, NEG
, 0, speed
);
8367 /* Cost this as SUB wzr, X. */
8368 op0
= CONST0_RTX (mode
);
8373 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
8375 /* Support (neg(fma...)) as a single instruction only if
8376 sign of zeros is unimportant. This matches the decision
8377 making in aarch64.md. */
8378 if (GET_CODE (op0
) == FMA
&& !HONOR_SIGNED_ZEROS (GET_MODE (op0
)))
8381 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
8384 if (GET_CODE (op0
) == MULT
)
8387 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
8392 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
8402 if (VECTOR_MODE_P (mode
))
8403 *cost
+= extra_cost
->vect
.alu
;
8405 *cost
+= extra_cost
->alu
.clz
;
8414 if (op1
== const0_rtx
8415 && GET_CODE (op0
) == AND
)
8418 mode
= GET_MODE (op0
);
8422 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
)
      /* TODO: A write to the CC flags possibly costs extra, this
         needs encoding in the cost tables.  */

      mode = GET_MODE (op0);

      if (GET_CODE (op0) == AND)

      if (GET_CODE (op0) == PLUS)
        /* ADDS (and CMN alias).  */

      if (GET_CODE (op0) == MINUS)

      if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
          && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
          && CONST_INT_P (XEXP (op0, 2)))

          /* COMPARE of ZERO_EXTRACT form of TST-immediate.
             Handle it here directly rather than going to cost_logic
             since we know the immediate generated for the TST is valid
             so we can avoid creating an intermediate rtx for it only
             for costing purposes.  */

          *cost += extra_cost->alu.logical;

          *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
                             ZERO_EXTRACT, 0, speed);

      if (GET_CODE (op1) == NEG)

          *cost += extra_cost->alu.arith;

          *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
          *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);

         Compare can freely swap the order of operands, and
         canonicalization puts the more complex operation first.
         But the integer MINUS logic expects the shift/extend
         operation in op1.  */

          || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))

      if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)

          *cost += extra_cost->fp[mode == DFmode].compare;

          if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))

              *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
              /* FCMP supports constant 0.0 for no extra cost.  */

      if (VECTOR_MODE_P (mode))

          /* Vector compare.  */

          *cost += extra_cost->vect.alu;

          if (aarch64_float_const_zero_rtx_p (op1))

              /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra

      *cost += rtx_cost (op0, mode, MINUS, 0, speed);

      /* Detect valid immediates.  */
      if ((GET_MODE_CLASS (mode) == MODE_INT
           || (GET_MODE_CLASS (mode) == MODE_CC
               && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
          && CONST_INT_P (op1)
          && aarch64_uimm12_shift (INTVAL (op1)))

          /* SUB(S) (immediate).  */
          *cost += extra_cost->alu.arith;

      /* Look for SUB (extended register).  */
      if (is_a <scalar_int_mode> (mode, &int_mode)
          && aarch64_rtx_arith_op_extract_p (op1, int_mode))

          *cost += extra_cost->alu.extend_arith;

          op1 = aarch64_strip_extend (op1, true);
          *cost += rtx_cost (op1, VOIDmode,
                             (enum rtx_code) GET_CODE (op1), 0, speed);

      rtx new_op1 = aarch64_strip_extend (op1, false);

      /* Cost this as an FMA-alike operation.  */
      if ((GET_CODE (new_op1) == MULT
           || aarch64_shift_p (GET_CODE (new_op1)))

          *cost += aarch64_rtx_mult_cost (new_op1, MULT,
                                          (enum rtx_code) code,

      *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);

      if (VECTOR_MODE_P (mode))

          *cost += extra_cost->vect.alu;

      else if (GET_MODE_CLASS (mode) == MODE_INT)

          *cost += extra_cost->alu.arith;

      else if (GET_MODE_CLASS (mode) == MODE_FLOAT)

          *cost += extra_cost->fp[mode == DFmode].addsub;

      if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
          || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)

          *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
          *cost += rtx_cost (op1, mode, PLUS, 1, speed);

      if (GET_MODE_CLASS (mode) == MODE_INT
          && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
              || aarch64_sve_addvl_addpl_immediate (op1, mode)))

          *cost += rtx_cost (op0, mode, PLUS, 0, speed);

              /* ADD (immediate).  */
              *cost += extra_cost->alu.arith;

      *cost += rtx_cost (op1, mode, PLUS, 1, speed);

      /* Look for ADD (extended register).  */
      if (is_a <scalar_int_mode> (mode, &int_mode)
          && aarch64_rtx_arith_op_extract_p (op0, int_mode))

          *cost += extra_cost->alu.extend_arith;

          op0 = aarch64_strip_extend (op0, true);
          *cost += rtx_cost (op0, VOIDmode,
                             (enum rtx_code) GET_CODE (op0), 0, speed);

      /* Strip any extend, leave shifts behind as we will
         cost them through mult_cost.  */
      new_op0 = aarch64_strip_extend (op0, false);

      if (GET_CODE (new_op0) == MULT
          || aarch64_shift_p (GET_CODE (new_op0)))

          *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,

      *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);

      if (VECTOR_MODE_P (mode))

          *cost += extra_cost->vect.alu;

      else if (GET_MODE_CLASS (mode) == MODE_INT)

          *cost += extra_cost->alu.arith;

      else if (GET_MODE_CLASS (mode) == MODE_FLOAT)

          *cost += extra_cost->fp[mode == DFmode].addsub;
      *cost = COSTS_N_INSNS (1);

      if (VECTOR_MODE_P (mode))
        *cost += extra_cost->vect.alu;

      *cost += extra_cost->alu.rev;

      if (aarch_rev16_p (x))

          *cost = COSTS_N_INSNS (1);

          if (VECTOR_MODE_P (mode))
            *cost += extra_cost->vect.alu;

          *cost += extra_cost->alu.rev;

      if (aarch64_extr_rtx_p (x, &op0, &op1))

          *cost += rtx_cost (op0, mode, IOR, 0, speed);
          *cost += rtx_cost (op1, mode, IOR, 1, speed);

          *cost += extra_cost->alu.shift;

      if (VECTOR_MODE_P (mode))

          *cost += extra_cost->vect.alu;

          && GET_CODE (op0) == MULT
          && CONST_INT_P (XEXP (op0, 1))
          && CONST_INT_P (op1)
          && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),

          /* This is a UBFM/SBFM.  */
          *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);

            *cost += extra_cost->alu.bfx;

      if (is_int_mode (mode, &int_mode))

          if (CONST_INT_P (op1))

              /* We have a mask + shift version of a UBFIZ
                 i.e. the *andim_ashift<mode>_bfiz pattern.  */
              if (GET_CODE (op0) == ASHIFT
                  && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,

                  *cost += rtx_cost (XEXP (op0, 0), int_mode,
                                     (enum rtx_code) code, 0, speed);

                    *cost += extra_cost->alu.bfx;

              else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))

                  /* We possibly get the immediate for free, this is not

                  *cost += rtx_cost (op0, int_mode,
                                     (enum rtx_code) code, 0, speed);

                    *cost += extra_cost->alu.logical;

          /* Handle ORN, EON, or BIC.  */
          if (GET_CODE (op0) == NOT)
            op0 = XEXP (op0, 0);

          new_op0 = aarch64_strip_shift (op0);

          /* If we had a shift on op0 then this is a logical-shift-
             by-register/immediate operation.  Otherwise, this is just
             a logical operation.  */

                  /* Shift by immediate.  */
                  if (CONST_INT_P (XEXP (op0, 1)))
                    *cost += extra_cost->alu.log_shift;

                    *cost += extra_cost->alu.log_shift_reg;

                *cost += extra_cost->alu.logical;

          /* In both cases we want to cost both operands.  */
          *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,

          *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,

      op0 = aarch64_strip_shift (x);

      if (VECTOR_MODE_P (mode))

          *cost += extra_cost->vect.alu;

          /* MVN-shifted-reg.  */

          *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);

            *cost += extra_cost->alu.log_shift;

      /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
         Handle the second form here taking care that 'a' in the above can

      else if (GET_CODE (op0) == XOR)

          rtx newop0 = XEXP (op0, 0);
          rtx newop1 = XEXP (op0, 1);
          rtx op0_stripped = aarch64_strip_shift (newop0);

          *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
          *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);

              if (op0_stripped != newop0)
                *cost += extra_cost->alu.log_shift;

                *cost += extra_cost->alu.logical;

          *cost += extra_cost->alu.logical;
      /* If a value is written in SI mode, then zero extended to DI
         mode, the operation will in general be free as a write to
         a 'w' register implicitly zeroes the upper bits of an 'x'
         register.  However, if this is

           (set (reg) (zero_extend (reg)))

         we must cost the explicit register move.  */

          && GET_MODE (op0) == SImode

          int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);

          /* If OP_COST is non-zero, then the cost of the zero extend
             is effectively the cost of the inner operation.  Otherwise
             we have a MOV instruction and we take the cost from the MOV
             itself.  This is true independently of whether we are
             optimizing for space or time.  */

      else if (MEM_P (op0))

          /* All loads can zero extend to any size for free.  */
          *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);

      op0 = aarch64_extend_bitfield_pattern_p (x);

          *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);

            *cost += extra_cost->alu.bfx;

          if (VECTOR_MODE_P (mode))

              *cost += extra_cost->vect.alu;

              /* We generate an AND instead of UXTB/UXTH.  */
              *cost += extra_cost->alu.logical;

      if (MEM_P (XEXP (x, 0)))

              rtx address = XEXP (XEXP (x, 0), 0);
              *cost += extra_cost->ldst.load_sign_extend;

                COSTS_N_INSNS (aarch64_address_cost (address, mode,

      op0 = aarch64_extend_bitfield_pattern_p (x);

          *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);

            *cost += extra_cost->alu.bfx;

          if (VECTOR_MODE_P (mode))
            *cost += extra_cost->vect.alu;

            *cost += extra_cost->alu.extend;

      if (CONST_INT_P (op1))

          if (VECTOR_MODE_P (mode))

              /* Vector shift (immediate).  */
              *cost += extra_cost->vect.alu;

              /* LSL (immediate), UBMF, UBFIZ and friends.  These are all

                *cost += extra_cost->alu.shift;

          /* We can incorporate zero/sign extend for free.  */
          if (GET_CODE (op0) == ZERO_EXTEND
              || GET_CODE (op0) == SIGN_EXTEND)
            op0 = XEXP (op0, 0);

          *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);

          if (VECTOR_MODE_P (mode))

              /* Vector shift (register).  */
              *cost += extra_cost->vect.alu;

                *cost += extra_cost->alu.shift_reg;

          if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
              && CONST_INT_P (XEXP (op1, 1))
              && known_eq (INTVAL (XEXP (op1, 1)),
                           GET_MODE_BITSIZE (mode) - 1))

              *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
              /* We already demanded XEXP (op1, 0) to be REG_P, so
                 don't recurse into it.  */

      return false;  /* All arguments need to be in registers.  */

      if (CONST_INT_P (op1))

          /* ASR (immediate) and friends.  */

              if (VECTOR_MODE_P (mode))
                *cost += extra_cost->vect.alu;

                *cost += extra_cost->alu.shift;

          *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);

          if (VECTOR_MODE_P (mode))

              /* Vector shift (register).  */
              *cost += extra_cost->vect.alu;

              /* ASR (register) and friends.  */
              *cost += extra_cost->alu.shift_reg;

          if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
              && CONST_INT_P (XEXP (op1, 1))
              && known_eq (INTVAL (XEXP (op1, 1)),
                           GET_MODE_BITSIZE (mode) - 1))

              *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
              /* We already demanded XEXP (op1, 0) to be REG_P, so
                 don't recurse into it.  */

      return false;  /* All arguments need to be in registers.  */
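      /* In both of the variable-shift cases above, masking the shift amount
         with GET_MODE_BITSIZE (mode) - 1 (e.g. x << (y & 31) for SImode)
         matches what the AArch64 variable-shift instructions effectively do
         anyway, so the AND is folded into the shift and only OP0 needs to
         be costed.  */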
      if (aarch64_cmodel == AARCH64_CMODEL_LARGE
          || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)

            *cost += extra_cost->ldst.load;

      else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
               || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)

          /* ADRP, followed by ADD.  */
          *cost += COSTS_N_INSNS (1);

            *cost += 2 * extra_cost->alu.arith;

      else if (aarch64_cmodel == AARCH64_CMODEL_TINY
               || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)

            *cost += extra_cost->alu.arith;

          /* One extra load instruction, after accessing the GOT.  */
          *cost += COSTS_N_INSNS (1);

            *cost += extra_cost->ldst.load;

          /* ADRP/ADD (immediate).  */
          *cost += extra_cost->alu.arith;

          if (VECTOR_MODE_P (mode))
            *cost += extra_cost->vect.alu;

            *cost += extra_cost->alu.bfx;

      /* We can trust that the immediates used will be correct (there
         are no by-register forms), so we need only cost op0.  */
      *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);

      *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
      /* aarch64_rtx_mult_cost always handles recursion to its

      /* We can expand signed mod by power of 2 using a NEGS, two parallel
         ANDs and a CSNEG.  Assume here that CSNEG is the same as the cost of
         an unconditional negate.  This case should only ever be reached through
         the set_smod_pow2_cheap check in expmed.c.  */
      if (CONST_INT_P (XEXP (x, 1))
          && exact_log2 (INTVAL (XEXP (x, 1))) > 0
          && (mode == SImode || mode == DImode))

          /* We expand to 4 instructions.  Reset the baseline.  */
          *cost = COSTS_N_INSNS (4);

            *cost += 2 * extra_cost->alu.logical
                     + 2 * extra_cost->alu.arith;

      /* Slightly prefer UMOD over SMOD.  */
      if (VECTOR_MODE_P (mode))
        *cost += extra_cost->vect.alu;
      else if (GET_MODE_CLASS (mode) == MODE_INT)
        *cost += (extra_cost->mult[mode == DImode].add
                  + extra_cost->mult[mode == DImode].idiv
                  + (code == MOD ? 1 : 0));

      return false;  /* All arguments need to be in registers.  */

          if (VECTOR_MODE_P (mode))
            *cost += extra_cost->vect.alu;
          else if (GET_MODE_CLASS (mode) == MODE_INT)
            /* There is no integer SQRT, so only DIV and UDIV can get

            *cost += (extra_cost->mult[mode == DImode].idiv
                      /* Slightly prefer UDIV over SDIV.  */
                      + (code == DIV ? 1 : 0));

            *cost += extra_cost->fp[mode == DFmode].div;

      return false;  /* All arguments need to be in registers.  */

      return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
                                         XEXP (x, 2), cost, speed);
      return false;  /* All arguments must be in registers.  */

          if (VECTOR_MODE_P (mode))
            *cost += extra_cost->vect.alu;

            *cost += extra_cost->fp[mode == DFmode].fma;

      /* FMSUB, FNMADD, and FNMSUB are free.  */
      if (GET_CODE (op0) == NEG)
        op0 = XEXP (op0, 0);

      if (GET_CODE (op2) == NEG)
        op2 = XEXP (op2, 0);

      /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
         and the by-element operand as operand 0.  */
      if (GET_CODE (op1) == NEG)
        op1 = XEXP (op1, 0);

      /* Catch vector-by-element operations.  The by-element operand can
         either be (vec_duplicate (vec_select (x))) or just
         (vec_select (x)), depending on whether we are multiplying by
         a vector or a scalar.

         Canonicalization is not very good in these cases, FMA4 will put the
         by-element operand as operand 0, FNMA4 will have it as operand 1.  */
      if (GET_CODE (op0) == VEC_DUPLICATE)
        op0 = XEXP (op0, 0);
      else if (GET_CODE (op1) == VEC_DUPLICATE)
        op1 = XEXP (op1, 0);

      if (GET_CODE (op0) == VEC_SELECT)
        op0 = XEXP (op0, 0);
      else if (GET_CODE (op1) == VEC_SELECT)
        op1 = XEXP (op1, 0);

      /* If the remaining parameters are not registers,
         get the cost to put them into registers.  */
      *cost += rtx_cost (op0, mode, FMA, 0, speed);
      *cost += rtx_cost (op1, mode, FMA, 1, speed);
      *cost += rtx_cost (op2, mode, FMA, 2, speed);

    case UNSIGNED_FLOAT:

        *cost += extra_cost->fp[mode == DFmode].fromint;

      if (VECTOR_MODE_P (mode))

          /* Vector truncate.  */
          *cost += extra_cost->vect.alu;

        *cost += extra_cost->fp[mode == DFmode].widen;

    case FLOAT_TRUNCATE:

      if (VECTOR_MODE_P (mode))

          /* Vector conversion.  */
          *cost += extra_cost->vect.alu;

        *cost += extra_cost->fp[mode == DFmode].narrow;

      /* Strip the rounding part.  They will all be implemented
         by the fcvt* family of instructions anyway.  */
      if (GET_CODE (x) == UNSPEC)

          unsigned int uns_code = XINT (x, 1);

          if (uns_code == UNSPEC_FRINTA
              || uns_code == UNSPEC_FRINTM
              || uns_code == UNSPEC_FRINTN
              || uns_code == UNSPEC_FRINTP
              || uns_code == UNSPEC_FRINTZ)
            x = XVECEXP (x, 0, 0);

          if (VECTOR_MODE_P (mode))
            *cost += extra_cost->vect.alu;

            *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;

      /* We can combine fmul by a power of 2 followed by a fcvt into a single
         fixed-point fcvt.  */
      if (GET_CODE (x) == MULT
          && ((VECTOR_MODE_P (mode)
               && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
              || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))

          *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,

      *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);

      if (VECTOR_MODE_P (mode))

          *cost += extra_cost->vect.alu;

      else if (GET_MODE_CLASS (mode) == MODE_FLOAT)

          /* FABD, which is analogous to FADD.  */
          if (GET_CODE (op0) == MINUS)

              *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
              *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);

              *cost += extra_cost->fp[mode == DFmode].addsub;

          /* Simple FABS is analogous to FNEG.  */
            *cost += extra_cost->fp[mode == DFmode].neg;

          /* Integer ABS will either be split to
             two arithmetic instructions, or will be an ABS
             (scalar), which we don't model.  */
          *cost = COSTS_N_INSNS (2);

            *cost += 2 * extra_cost->alu.arith;

      if (VECTOR_MODE_P (mode))
        *cost += extra_cost->vect.alu;

          /* FMAXNM/FMINNM/FMAX/FMIN.
             TODO: This may not be accurate for all implementations, but
             we do not model this in the cost tables.  */
          *cost += extra_cost->fp[mode == DFmode].addsub;

      /* The floating point round to integer frint* instructions.  */
      if (aarch64_frint_unspec_p (XINT (x, 1)))

            *cost += extra_cost->fp[mode == DFmode].roundint;

      if (XINT (x, 1) == UNSPEC_RBIT)

            *cost += extra_cost->alu.rev;
      /* Decompose <su>muldi3_highpart.  */
      if (/* (truncate:DI

          && GET_MODE (XEXP (x, 0)) == TImode
          && GET_CODE (XEXP (x, 0)) == LSHIFTRT

          && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
          /* (ANY_EXTEND:TI (reg:DI))
             (ANY_EXTEND:TI (reg:DI))) */
          && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
               && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
              || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
                  && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
          && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
          && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
          /* (const_int 64) */
          && CONST_INT_P (XEXP (XEXP (x, 0), 1))
          && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)

          *cost += extra_cost->mult[mode == DImode].extend;
          *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
                             mode, MULT, 0, speed);
          *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
                             mode, MULT, 1, speed);
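      /* For reference, the condition above is matching RTL roughly of the
         form

           (truncate:DI
             (lshiftrt:TI
               (mult:TI (any_extend:TI (reg:DI)) (any_extend:TI (reg:DI)))
               (const_int 64)))

         i.e. the high half of a widening 64x64->128 multiply, which maps
         to a single [SU]MULH instruction.  */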
      && flag_aarch64_verbose_cost)

             "\nFailed to cost RTX. Assuming default cost.\n");

/* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
   calculated for X.  This cost is stored in *COST.  Returns true
   if the total cost of X was calculated.  */

aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
                           int param, int *cost, bool speed)

  bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);

      && flag_aarch64_verbose_cost)

      print_rtl_single (dump_file, x);
      fprintf (dump_file, "\n%s cost: %d (%s)\n",
               speed ? "Hot" : "Cold",
               *cost, result ? "final" : "partial");
aarch64_register_move_cost (machine_mode mode,
                            reg_class_t from_i, reg_class_t to_i)

  enum reg_class from = (enum reg_class) from_i;
  enum reg_class to = (enum reg_class) to_i;
  const struct cpu_regmove_cost *regmove_cost
    = aarch64_tune_params.regmove_cost;

  /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
  if (to == CALLER_SAVE_REGS || to == POINTER_REGS)

  if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
    from = GENERAL_REGS;

  /* Moving between GPR and stack cost is the same as GP2GP.  */
  if ((from == GENERAL_REGS && to == STACK_REG)
      || (to == GENERAL_REGS && from == STACK_REG))
    return regmove_cost->GP2GP;

  /* To/From the stack register, we move via the gprs.  */
  if (to == STACK_REG || from == STACK_REG)
    return aarch64_register_move_cost (mode, from, GENERAL_REGS)
           + aarch64_register_move_cost (mode, GENERAL_REGS, to);

  if (known_eq (GET_MODE_SIZE (mode), 16))

      /* 128-bit operations on general registers require 2 instructions.  */
      if (from == GENERAL_REGS && to == GENERAL_REGS)
        return regmove_cost->GP2GP * 2;
      else if (from == GENERAL_REGS)
        return regmove_cost->GP2FP * 2;
      else if (to == GENERAL_REGS)
        return regmove_cost->FP2GP * 2;

      /* When AdvSIMD instructions are disabled it is not possible to move
         a 128-bit value directly between Q registers.  This is handled in
         secondary reload.  A general register is used as a scratch to move
         the upper DI value and the lower DI value is moved directly,
         hence the cost is the sum of three moves.  */

        return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;

      return regmove_cost->FP2FP;

  if (from == GENERAL_REGS && to == GENERAL_REGS)
    return regmove_cost->GP2GP;
  else if (from == GENERAL_REGS)
    return regmove_cost->GP2FP;
  else if (to == GENERAL_REGS)
    return regmove_cost->FP2GP;

  return regmove_cost->FP2FP;
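/* As an illustration (the numbers are purely made up): on a core whose
   cpu_regmove_cost has GP2FP = 5, a 128-bit move from GENERAL_REGS into the
   FP/SIMD registers is costed as 2 * 5 = 10, reflecting the two 64-bit
   transfers needed, while an FP-to-FP move of the same value keeps the
   plain FP2FP cost when AdvSIMD is available.  */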
aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
                          reg_class_t rclass ATTRIBUTE_UNUSED,
                          bool in ATTRIBUTE_UNUSED)

  return aarch64_tune_params.memmov_cost;

/* Return true if it is safe and beneficial to use the approximate rsqrt optabs
   to optimize 1.0/sqrt.  */

use_rsqrt_p (machine_mode mode)

  return (!flag_trapping_math
          && flag_unsafe_math_optimizations
          && ((aarch64_tune_params.approx_modes->recip_sqrt
               & AARCH64_APPROX_MODE (mode))
              || flag_mrecip_low_precision_sqrt));

/* Function to decide when to use the approximate reciprocal square root

aarch64_builtin_reciprocal (tree fndecl)

  machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));

  if (!use_rsqrt_p (mode))

  return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));

typedef rtx (*rsqrte_type) (rtx, rtx);

/* Select reciprocal square root initial estimate insn depending on machine

get_rsqrte_type (machine_mode mode)

    case E_DFmode: return gen_aarch64_rsqrtedf;
    case E_SFmode: return gen_aarch64_rsqrtesf;
    case E_V2DFmode: return gen_aarch64_rsqrtev2df;
    case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
    case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
    default: gcc_unreachable ();

typedef rtx (*rsqrts_type) (rtx, rtx, rtx);

/* Select reciprocal square root series step insn depending on machine mode.  */

get_rsqrts_type (machine_mode mode)

    case E_DFmode: return gen_aarch64_rsqrtsdf;
    case E_SFmode: return gen_aarch64_rsqrtssf;
    case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
    case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
    case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
    default: gcc_unreachable ();
/* Emit instruction sequence to compute either the approximate square root
   or its approximate reciprocal, depending on the flag RECP, and return
   whether the sequence was emitted or not.  */

aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)

  machine_mode mode = GET_MODE (dst);

  if (GET_MODE_INNER (mode) == HFmode)

      if (!(flag_mlow_precision_sqrt
            || (aarch64_tune_params.approx_modes->sqrt
                & AARCH64_APPROX_MODE (mode))))

      if (flag_finite_math_only
          || flag_trapping_math
          || !flag_unsafe_math_optimizations
          || optimize_function_for_size_p (cfun))

    /* Caller assumes we cannot fail.  */
    gcc_assert (use_rsqrt_p (mode));

  machine_mode mmsk = mode_for_int_vector (mode).require ();
  rtx xmsk = gen_reg_rtx (mmsk);

    /* When calculating the approximate square root, compare the
       argument with 0.0 and create a mask.  */
    emit_insn (gen_rtx_SET (xmsk,
                            gen_rtx_EQ (mmsk, src,
                                        CONST0_RTX (mode)))));

  /* Estimate the approximate reciprocal square root.  */
  rtx xdst = gen_reg_rtx (mode);
  emit_insn ((*get_rsqrte_type (mode)) (xdst, src));

  /* Iterate over the series twice for SF and thrice for DF.  */
  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;

  /* Optionally iterate over the series once less for faster performance
     while sacrificing the accuracy.  */
  if ((recp && flag_mrecip_low_precision_sqrt)
      || (!recp && flag_mlow_precision_sqrt))

  /* Iterate over the series to calculate the approximate reciprocal square

  rtx x1 = gen_reg_rtx (mode);
  while (iterations--)

      rtx x2 = gen_reg_rtx (mode);
      emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));

      emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));

        emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));

      /* Qualify the approximate reciprocal square root when the argument is
         0.0 by squashing the intermediary result to 0.0.  */
      rtx xtmp = gen_reg_rtx (mmsk);
      emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
                                        gen_rtx_SUBREG (mmsk, xdst, 0)));
      emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));

      /* Calculate the approximate square root.  */
      emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));

  /* Finalize the approximation.  */
  emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
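/* For reference, the loop above is a standard Newton-Raphson refinement of
   1/sqrt(d): starting from the FRSQRTE estimate x0, each FRSQRTS step
   yields (3 - d * x * x) / 2, so the update is
   x_{n+1} = x_n * (3 - d * x_n^2) / 2, which roughly doubles the number of
   correct bits per iteration (hence two steps for SF and three for DF).  */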
typedef rtx (*recpe_type) (rtx, rtx);

/* Select reciprocal initial estimate insn depending on machine mode.  */

get_recpe_type (machine_mode mode)

    case E_SFmode: return (gen_aarch64_frecpesf);
    case E_V2SFmode: return (gen_aarch64_frecpev2sf);
    case E_V4SFmode: return (gen_aarch64_frecpev4sf);
    case E_DFmode: return (gen_aarch64_frecpedf);
    case E_V2DFmode: return (gen_aarch64_frecpev2df);
    default: gcc_unreachable ();

typedef rtx (*recps_type) (rtx, rtx, rtx);

/* Select reciprocal series step insn depending on machine mode.  */

get_recps_type (machine_mode mode)

    case E_SFmode: return (gen_aarch64_frecpssf);
    case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
    case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
    case E_DFmode: return (gen_aarch64_frecpsdf);
    case E_V2DFmode: return (gen_aarch64_frecpsv2df);
    default: gcc_unreachable ();

/* Emit the instruction sequence to compute the approximation for the division
   of NUM by DEN in QUO and return whether the sequence was emitted or not.  */

aarch64_emit_approx_div (rtx quo, rtx num, rtx den)

  machine_mode mode = GET_MODE (quo);

  if (GET_MODE_INNER (mode) == HFmode)

  bool use_approx_division_p = (flag_mlow_precision_div
                                || (aarch64_tune_params.approx_modes->division
                                    & AARCH64_APPROX_MODE (mode)));

  if (!flag_finite_math_only
      || flag_trapping_math
      || !flag_unsafe_math_optimizations
      || optimize_function_for_size_p (cfun)
      || !use_approx_division_p)

  if (!TARGET_SIMD && VECTOR_MODE_P (mode))

  /* Estimate the approximate reciprocal.  */
  rtx xrcp = gen_reg_rtx (mode);
  emit_insn ((*get_recpe_type (mode)) (xrcp, den));

  /* Iterate over the series twice for SF and thrice for DF.  */
  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;

  /* Optionally iterate over the series once less for faster performance,
     while sacrificing the accuracy.  */
  if (flag_mlow_precision_div)

  /* Iterate over the series to calculate the approximate reciprocal.  */
  rtx xtmp = gen_reg_rtx (mode);
  while (iterations--)

      emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));

        emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));

  if (num != CONST1_RTX (mode))

      /* As the approximate reciprocal of DEN is already calculated, only
         calculate the approximate division when NUM is not 1.0.  */
      rtx xnum = force_reg (mode, num);
      emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));

  /* Finalize the approximation.  */
  emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
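/* For reference, this is the Newton-Raphson iteration for a reciprocal:
   each FRECPS step yields (2 - den * x), so the update is
   x_{n+1} = x_n * (2 - den * x_n), which converges quadratically towards
   1/den; the final multiply by NUM then gives the approximate quotient.  */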
/* Return the number of instructions that can be issued per cycle.  */

aarch64_sched_issue_rate (void)

  return aarch64_tune_params.issue_rate;

aarch64_sched_first_cycle_multipass_dfa_lookahead (void)

  int issue_rate = aarch64_sched_issue_rate ();

  return issue_rate > 1 && !sched_fusion ? issue_rate : 0;

/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
   autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
   has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */

aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,

  return autopref_multipass_dfa_lookahead_guard (insn, ready_index);

/* Vectorizer cost model target hooks.  */

/* Implement targetm.vectorize.builtin_vectorization_cost.  */

aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
                                    int misalign ATTRIBUTE_UNUSED)

  const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;

  if (vectype != NULL)
    fp = FLOAT_TYPE_P (vectype);

  switch (type_of_cost)

        return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;

        return costs->scalar_load_cost;

        return costs->scalar_store_cost;

        return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;

        return costs->vec_align_load_cost;

        return costs->vec_store_cost;

        return costs->vec_to_scalar_cost;

        return costs->scalar_to_vec_cost;

      case unaligned_load:
      case vector_gather_load:
        return costs->vec_unalign_load_cost;

      case unaligned_store:
      case vector_scatter_store:
        return costs->vec_unalign_store_cost;

      case cond_branch_taken:
        return costs->cond_taken_branch_cost;

      case cond_branch_not_taken:
        return costs->cond_not_taken_branch_cost;

        return costs->vec_permute_cost;

      case vec_promote_demote:
        return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;

        elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
        return elements / 2 + 1;
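/* As a worked example of the final arm above (which covers building a
   vector from scalar elements): a vector type with 4 subparts is costed as
   4 / 2 + 1 = 3 vector statements, and one with 2 subparts as 2.  */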
/* Implement targetm.vectorize.add_stmt_cost.  */

aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
                       struct _stmt_vec_info *stmt_info, int misalign,
                       enum vect_cost_model_location where)

  unsigned *cost = (unsigned *) data;
  unsigned retval = 0;

  if (flag_vect_cost_model)

      tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;

        aarch64_builtin_vectorization_cost (kind, vectype, misalign);

      /* Statements in an inner loop relative to the loop being
         vectorized are weighted more heavily.  The value here is
         arbitrary and could potentially be improved with analysis.  */
      if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
        count *= 50;  /* FIXME  */

      retval = (unsigned) (count * stmt_cost);
      cost[where] += retval;
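      /* So a statement with a per-statement cost of, say, 2 that occurs once
         in an inner loop of the loop being vectorized contributes
         50 * 2 = 100 to the vect_body bucket, while the same statement
         outside an inner loop would contribute just 2.  */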
static void initialize_aarch64_code_model (struct gcc_options *);

/* Parse the TO_PARSE string and put the architecture struct that it
   selects into RES and the architectural features into ISA_FLAGS.
   Return an aarch64_parse_opt_result describing the parse result.
   If there is an error parsing, RES and ISA_FLAGS are left unchanged.  */

static enum aarch64_parse_opt_result
aarch64_parse_arch (const char *to_parse, const struct processor **res,
                    unsigned long *isa_flags)

  const struct processor *arch;
  char *str = (char *) alloca (strlen (to_parse) + 1);

  strcpy (str, to_parse);

  ext = strchr (str, '+');

        return AARCH64_PARSE_MISSING_ARG;

  /* Loop through the list of supported ARCHes to find a match.  */
  for (arch = all_architectures; arch->name != NULL; arch++)

      if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)

          unsigned long isa_temp = arch->flags;

              /* TO_PARSE string contains at least one extension.  */
              enum aarch64_parse_opt_result ext_res
                = aarch64_parse_extension (ext, &isa_temp);

              if (ext_res != AARCH64_PARSE_OK)

          /* Extension parsing was successful.  Confirm the result
             arch and ISA flags.  */

          *isa_flags = isa_temp;
          return AARCH64_PARSE_OK;

  /* ARCH name not found in list.  */
  return AARCH64_PARSE_INVALID_ARG;

/* Parse the TO_PARSE string and put the result tuning in RES and the
   architecture flags in ISA_FLAGS.  Return an aarch64_parse_opt_result
   describing the parse result.  If there is an error parsing, RES and
   ISA_FLAGS are left unchanged.  */

static enum aarch64_parse_opt_result
aarch64_parse_cpu (const char *to_parse, const struct processor **res,
                   unsigned long *isa_flags)

  const struct processor *cpu;
  char *str = (char *) alloca (strlen (to_parse) + 1);

  strcpy (str, to_parse);

  ext = strchr (str, '+');

        return AARCH64_PARSE_MISSING_ARG;

  /* Loop through the list of supported CPUs to find a match.  */
  for (cpu = all_cores; cpu->name != NULL; cpu++)

      if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)

          unsigned long isa_temp = cpu->flags;

              /* TO_PARSE string contains at least one extension.  */
              enum aarch64_parse_opt_result ext_res
                = aarch64_parse_extension (ext, &isa_temp);

              if (ext_res != AARCH64_PARSE_OK)

          /* Extension parsing was successful.  Confirm the result
             cpu and ISA flags.  */

          *isa_flags = isa_temp;
          return AARCH64_PARSE_OK;

  /* CPU name not found in list.  */
  return AARCH64_PARSE_INVALID_ARG;

/* Parse the TO_PARSE string and put the cpu it selects into RES.
   Return an aarch64_parse_opt_result describing the parse result.
   If the parsing fails the RES does not change.  */

static enum aarch64_parse_opt_result
aarch64_parse_tune (const char *to_parse, const struct processor **res)

  const struct processor *cpu;
  char *str = (char *) alloca (strlen (to_parse) + 1);

  strcpy (str, to_parse);

  /* Loop through the list of supported CPUs to find a match.  */
  for (cpu = all_cores; cpu->name != NULL; cpu++)

      if (strcmp (cpu->name, str) == 0)

          return AARCH64_PARSE_OK;

  /* CPU name not found in list.  */
  return AARCH64_PARSE_INVALID_ARG;

/* Parse TOKEN, which has length LENGTH to see if it is an option
   described in FLAG.  If it is, return the index bit for that fusion type.
   If not, error (printing OPTION_NAME) and return zero.  */

static unsigned int
aarch64_parse_one_option_token (const char *token,
                                const struct aarch64_flag_desc *flag,
                                const char *option_name)

  for (; flag->name != NULL; flag++)

      if (length == strlen (flag->name)
          && !strncmp (flag->name, token, length))

  error ("unknown flag passed in -moverride=%s (%s)", option_name, token);

/* Parse OPTION which is a comma-separated list of flags to enable.
   FLAGS gives the list of flags we understand, INITIAL_STATE gives any
   default state we inherit from the CPU tuning structures.  OPTION_NAME
   gives the top-level option we are parsing in the -moverride string,
   for use in error messages.  */

static unsigned int
aarch64_parse_boolean_options (const char *option,
                               const struct aarch64_flag_desc *flags,
                               unsigned int initial_state,
                               const char *option_name)

  const char separator = '.';
  const char* specs = option;
  const char* ntoken = option;
  unsigned int found_flags = initial_state;

  while ((ntoken = strchr (specs, separator)))

      size_t token_length = ntoken - specs;
      unsigned token_ops = aarch64_parse_one_option_token (specs,

      /* If we find "none" (or, for simplicity's sake, an error) anywhere
         in the token stream, reset the supported operations.  So:

           adrp+add.cmp+branch.none.adrp+add

         would have the result of turning on only adrp+add fusion.  */

      found_flags |= token_ops;

  /* We ended with a comma, print something.  */

      error ("%s string ill-formed\n", option_name);

  /* We still have one more token to parse.  */
  size_t token_length = strlen (specs);
  unsigned token_ops = aarch64_parse_one_option_token (specs,

  found_flags |= token_ops;
  return found_flags;
/* Support for overriding instruction fusion.  */

aarch64_parse_fuse_string (const char *fuse_string,
                           struct tune_params *tune)

  tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
                                                     aarch64_fusible_pairs,

/* Support for overriding other tuning flags.  */

aarch64_parse_tune_string (const char *tune_string,
                           struct tune_params *tune)

  tune->extra_tuning_flags
    = aarch64_parse_boolean_options (tune_string,
                                     aarch64_tuning_flags,
                                     tune->extra_tuning_flags,

/* Parse TOKEN, which has length LENGTH to see if it is a tuning option
   we understand.  If it is, extract the option string and handoff to
   the appropriate function.  */

aarch64_parse_one_override_token (const char* token,
                                  struct tune_params *tune)

  const struct aarch64_tuning_override_function *fn
    = aarch64_tuning_override_functions;

  const char *option_part = strchr (token, '=');

      error ("tuning string missing in option (%s)", token);

  /* Get the length of the option name.  */
  length = option_part - token;
  /* Skip the '=' to get to the option string.  */

  for (; fn->name != NULL; fn++)

      if (!strncmp (fn->name, token, length))

          fn->parse_override (option_part, tune);

  error ("unknown tuning option (%s)", token);

/* A checking mechanism for the implementation of the tls size.  */

initialize_aarch64_tls_size (struct gcc_options *opts)

  if (aarch64_tls_size == 0)
    aarch64_tls_size = 24;

  switch (opts->x_aarch64_cmodel_var)

    case AARCH64_CMODEL_TINY:
      /* Both the default and maximum TLS size allowed under tiny is 1M which
         needs two instructions to address, so we clamp the size to 24.  */
      if (aarch64_tls_size > 24)
        aarch64_tls_size = 24;

    case AARCH64_CMODEL_SMALL:
      /* The maximum TLS size allowed under small is 4G.  */
      if (aarch64_tls_size > 32)
        aarch64_tls_size = 32;

    case AARCH64_CMODEL_LARGE:
      /* The maximum TLS size allowed under large is 16E.
         FIXME: 16E should be 64bit, we only support 48bit offset now.  */
      if (aarch64_tls_size > 48)
        aarch64_tls_size = 48;

      gcc_unreachable ();

/* Parse STRING looking for options in the format:
     string    :: option:string
     option    :: name=substring
     substring :: defined by option.  */

aarch64_parse_override_string (const char* input_string,
                               struct tune_params* tune)

  const char separator = ':';
  size_t string_length = strlen (input_string) + 1;
  char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
  char *string = string_root;
  strncpy (string, input_string, string_length);
  string[string_length - 1] = '\0';

  char* ntoken = string;

  while ((ntoken = strchr (string, separator)))

      size_t token_length = ntoken - string;
      /* Make this substring look like a string.  */

      aarch64_parse_one_override_token (string, token_length, tune);

  /* One last option to parse.  */
  aarch64_parse_one_override_token (string, strlen (string), tune);
  free (string_root);
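/* As an illustrative example (the particular option names here are only for
   exposition), -moverride=tune=rename_fma_regs:fuse=adrp+add.cmp+branch is
   split on ':' into the name=value tokens "tune=rename_fma_regs" and
   "fuse=adrp+add.cmp+branch", which are then handed off to
   aarch64_parse_tune_string and aarch64_parse_fuse_string respectively.  */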
aarch64_override_options_after_change_1 (struct gcc_options *opts)

  /* PR 70044: We have to be careful about being called multiple times for the
     same function.  This means all changes should be repeatable.  */

  /* If the frame pointer is enabled, set it to a special value that behaves
     similar to frame pointer omission.  If we don't do this all leaf functions
     will get a frame pointer even if flag_omit_leaf_frame_pointer is set.
     If flag_omit_frame_pointer has this special value, we must force the
     frame pointer if not in a leaf function.  We also need to force it in a
     leaf function if flag_omit_frame_pointer is not set or if LR is used.  */
  if (opts->x_flag_omit_frame_pointer == 0)
    opts->x_flag_omit_frame_pointer = 2;

  /* If not optimizing for size, set the default
     alignment to what the target wants.  */
  if (!opts->x_optimize_size)

      if (opts->x_align_loops <= 0)
        opts->x_align_loops = aarch64_tune_params.loop_align;
      if (opts->x_align_jumps <= 0)
        opts->x_align_jumps = aarch64_tune_params.jump_align;
      if (opts->x_align_functions <= 0)
        opts->x_align_functions = aarch64_tune_params.function_align;

  /* We default to no pc-relative literal loads.  */

  aarch64_pcrelative_literal_loads = false;

  /* If -mpc-relative-literal-loads is set on the command line, this
     implies that the user asked for PC relative literal loads.  */
  if (opts->x_pcrelative_literal_loads == 1)
    aarch64_pcrelative_literal_loads = true;

  /* In the tiny memory model it makes no sense to disallow PC relative
     literal pool loads.  */
  if (aarch64_cmodel == AARCH64_CMODEL_TINY
      || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
    aarch64_pcrelative_literal_loads = true;

  /* When enabling the lower precision Newton series for the square root, also
     enable it for the reciprocal square root, since the latter is an
     intermediary step for the former.  */
  if (flag_mlow_precision_sqrt)
    flag_mrecip_low_precision_sqrt = true;

/* 'Unpack' up the internal tuning structs and update the options
    in OPTS.  The caller must have set up selected_tune and selected_arch
    as all the other target-specific codegen decisions are
    derived from them.  */

aarch64_override_options_internal (struct gcc_options *opts)

  aarch64_tune_flags = selected_tune->flags;
  aarch64_tune = selected_tune->sched_core;
  /* Make a copy of the tuning parameters attached to the core, which
     we may later overwrite.  */
  aarch64_tune_params = *(selected_tune->tune);
  aarch64_architecture_version = selected_arch->architecture_version;

  if (opts->x_aarch64_override_tune_string)
    aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
                                   &aarch64_tune_params);

  /* This target defaults to strict volatile bitfields.  */
  if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
    opts->x_flag_strict_volatile_bitfields = 1;

  initialize_aarch64_code_model (opts);
  initialize_aarch64_tls_size (opts);

  int queue_depth = 0;
  switch (aarch64_tune_params.autoprefetcher_model)

    case tune_params::AUTOPREFETCHER_OFF:

    case tune_params::AUTOPREFETCHER_WEAK:

    case tune_params::AUTOPREFETCHER_STRONG:
      queue_depth = max_insn_queue_index + 1;

      gcc_unreachable ();

  /* We don't mind passing in global_options_set here as we don't use
     the *options_set structs anyway.  */
  maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
                         opts->x_param_values,
                         global_options_set.x_param_values);

  /* Set up parameters to be used in prefetching algorithm.  Do not
     override the defaults unless we are tuning for a core we have
     researched values for.  */
  if (aarch64_tune_params.prefetch->num_slots > 0)
    maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
                           aarch64_tune_params.prefetch->num_slots,
                           opts->x_param_values,
                           global_options_set.x_param_values);
  if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
    maybe_set_param_value (PARAM_L1_CACHE_SIZE,
                           aarch64_tune_params.prefetch->l1_cache_size,
                           opts->x_param_values,
                           global_options_set.x_param_values);
  if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
    maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
                           aarch64_tune_params.prefetch->l1_cache_line_size,
                           opts->x_param_values,
                           global_options_set.x_param_values);
  if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
    maybe_set_param_value (PARAM_L2_CACHE_SIZE,
                           aarch64_tune_params.prefetch->l2_cache_size,
                           opts->x_param_values,
                           global_options_set.x_param_values);

  /* Use the alternative scheduling-pressure algorithm by default.  */
  maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
                         opts->x_param_values,
                         global_options_set.x_param_values);

  /* Enable sw prefetching at specified optimization level for
     CPUS that have prefetch.  Lower optimization level threshold by 1
     when profiling is enabled.  */
  if (opts->x_flag_prefetch_loop_arrays < 0
      && !opts->x_optimize_size
      && aarch64_tune_params.prefetch->default_opt_level >= 0
      && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
    opts->x_flag_prefetch_loop_arrays = 1;

  aarch64_override_options_after_change_1 (opts);
/* Print a hint with a suggestion for a core or architecture name that
   most closely resembles what the user passed in STR.  ARCH is true if
   the user is asking for an architecture name.  ARCH is false if the user
   is asking for a core name.  */

aarch64_print_hint_for_core_or_arch (const char *str, bool arch)

  auto_vec<const char *> candidates;
  const struct processor *entry = arch ? all_architectures : all_cores;
  for (; entry->name != NULL; entry++)
    candidates.safe_push (entry->name);

  const char *hint = candidates_list_and_hint (str, s, candidates);

    inform (input_location, "valid arguments are: %s;"
            " did you mean %qs?", s, hint);

/* Print a hint with a suggestion for a core name that most closely resembles
   what the user passed in STR.  */

aarch64_print_hint_for_core (const char *str)

  aarch64_print_hint_for_core_or_arch (str, false);

/* Print a hint with a suggestion for an architecture name that most closely
   resembles what the user passed in STR.  */

aarch64_print_hint_for_arch (const char *str)

  aarch64_print_hint_for_core_or_arch (str, true);

/* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
   specified in STR and throw errors if appropriate.  Put the results if
   they are valid in RES and ISA_FLAGS.  Return whether the option is

aarch64_validate_mcpu (const char *str, const struct processor **res,
                       unsigned long *isa_flags)

  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_cpu (str, res, isa_flags);

  if (parse_res == AARCH64_PARSE_OK)

    case AARCH64_PARSE_MISSING_ARG:
      error ("missing cpu name in %<-mcpu=%s%>", str);

    case AARCH64_PARSE_INVALID_ARG:
      error ("unknown value %qs for -mcpu", str);
      aarch64_print_hint_for_core (str);

    case AARCH64_PARSE_INVALID_FEATURE:
      error ("invalid feature modifier in %<-mcpu=%s%>", str);

      gcc_unreachable ();

/* Validate a command-line -march option.  Parse the arch and extensions
   (if any) specified in STR and throw errors if appropriate.  Put the
   results, if they are valid, in RES and ISA_FLAGS.  Return whether the
   option is valid.  */

aarch64_validate_march (const char *str, const struct processor **res,
                        unsigned long *isa_flags)

  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_arch (str, res, isa_flags);

  if (parse_res == AARCH64_PARSE_OK)

    case AARCH64_PARSE_MISSING_ARG:
      error ("missing arch name in %<-march=%s%>", str);

    case AARCH64_PARSE_INVALID_ARG:
      error ("unknown value %qs for -march", str);
      aarch64_print_hint_for_arch (str);

    case AARCH64_PARSE_INVALID_FEATURE:
      error ("invalid feature modifier in %<-march=%s%>", str);

      gcc_unreachable ();

/* Validate a command-line -mtune option.  Parse the cpu
   specified in STR and throw errors if appropriate.  Put the
   result, if it is valid, in RES.  Return whether the option is

aarch64_validate_mtune (const char *str, const struct processor **res)

  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_tune (str, res);

  if (parse_res == AARCH64_PARSE_OK)

    case AARCH64_PARSE_MISSING_ARG:
      error ("missing cpu name in %<-mtune=%s%>", str);

    case AARCH64_PARSE_INVALID_ARG:
      error ("unknown value %qs for -mtune", str);
      aarch64_print_hint_for_core (str);

      gcc_unreachable ();

/* Return the CPU corresponding to the enum CPU.
   If it doesn't specify a cpu, return the default.  */

static const struct processor *
aarch64_get_tune_cpu (enum aarch64_processor cpu)

  if (cpu != aarch64_none)
    return &all_cores[cpu];

  /* The & 0x3f is to extract the bottom 6 bits that encode the
     default cpu as selected by the --with-cpu GCC configure option

     ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
     flags mechanism should be reworked to make it more sane.  */
  return &all_cores[TARGET_CPU_DEFAULT & 0x3f];

/* Return the architecture corresponding to the enum ARCH.
   If it doesn't specify a valid architecture, return the default.  */

static const struct processor *
aarch64_get_arch (enum aarch64_arch arch)

  if (arch != aarch64_no_arch)
    return &all_architectures[arch];

  const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];

  return &all_architectures[cpu->arch];

/* Return the VG value associated with -msve-vector-bits= value VALUE.  */

aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)

  /* For now generate vector-length agnostic code for -msve-vector-bits=128.
     This ensures we can clearly distinguish SVE and Advanced SIMD modes when
     deciding which .md file patterns to use and when deciding whether
     something is a legitimate address or constant.  */
  if (value == SVE_SCALABLE || value == SVE_128)
    return poly_uint16 (2, 2);

  return (int) value / 64;
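/* For example, -msve-vector-bits=256 gives a VG of 256 / 64 = 4 (the number
   of 64-bit granules in an SVE vector), while both "scalable" and 128 keep
   the length-agnostic poly_uint16 (2, 2) encoding above.  */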
/* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
   and is used to parse the -m{cpu,tune,arch} strings and setup the initial
   tuning structs.  In particular it must set selected_tune and
   aarch64_isa_flags that define the available ISA features and tuning
   decisions.  It must also set selected_arch as this will be used to
   output the .arch asm tags for each function.  */

aarch64_override_options (void)

  unsigned long cpu_isa = 0;
  unsigned long arch_isa = 0;
  aarch64_isa_flags = 0;

  bool valid_cpu = true;
  bool valid_tune = true;
  bool valid_arch = true;

  selected_cpu = NULL;
  selected_arch = NULL;
  selected_tune = NULL;

  /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
     If either of -march or -mtune is given, they override their
     respective component of -mcpu.  */
  if (aarch64_cpu_string)
    valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,

  if (aarch64_arch_string)
    valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,

  if (aarch64_tune_string)
    valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);

  /* If the user did not specify a processor, choose the default
     one for them.  This will be the CPU set during configuration using
     --with-cpu, otherwise it is "generic".  */

      selected_cpu = &all_cores[selected_arch->ident];
      aarch64_isa_flags = arch_isa;
      explicit_arch = selected_arch->arch;

      /* Get default configure-time CPU.  */
      selected_cpu = aarch64_get_tune_cpu (aarch64_none);
      aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
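      /* TARGET_CPU_DEFAULT packs the configure-time default: the low 6 bits
         hold the aarch64_processor index (extracted with & 0x3f in
         aarch64_get_tune_cpu above) and the remaining bits hold that CPU's
         default ISA flags, hence the >> 6 here.  */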
  explicit_tune_core = selected_tune->ident;

  /* If both -mcpu and -march are specified check that they are architecturally
     compatible, warn if they're not and prefer the -march ISA flags.  */
  else if (selected_arch)

      if (selected_arch->arch != selected_cpu->arch)

          warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
                   all_architectures[selected_cpu->arch].name,
                   selected_arch->name);

      aarch64_isa_flags = arch_isa;
      explicit_arch = selected_arch->arch;
      explicit_tune_core = selected_tune ? selected_tune->ident
                                         : selected_cpu->ident;

      /* -mcpu but no -march.  */
      aarch64_isa_flags = cpu_isa;
      explicit_tune_core = selected_tune ? selected_tune->ident
                                         : selected_cpu->ident;
      gcc_assert (selected_cpu);
      selected_arch = &all_architectures[selected_cpu->arch];
      explicit_arch = selected_arch->arch;

  /* Set the arch as well as we will need it when outputting
     the .arch directive in assembly.  */
  if (!selected_arch)

      gcc_assert (selected_cpu);
      selected_arch = &all_architectures[selected_cpu->arch];

  if (!selected_tune)
    selected_tune = selected_cpu;

#ifndef HAVE_AS_MABI_OPTION
  /* The compiler may have been configured with 2.23.* binutils, which does
     not have support for ILP32.  */

    error ("assembler does not support -mabi=ilp32");

  /* Convert -msve-vector-bits to a VG count.  */
  aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);

  if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
    sorry ("return address signing is only supported for -mabi=lp64");

  /* Make sure we properly set up the explicit options.  */
  if ((aarch64_cpu_string && valid_cpu)
      || (aarch64_tune_string && valid_tune))
    gcc_assert (explicit_tune_core != aarch64_none);

  if ((aarch64_cpu_string && valid_cpu)
      || (aarch64_arch_string && valid_arch))
    gcc_assert (explicit_arch != aarch64_no_arch);

  aarch64_override_options_internal (&global_options);

  /* Save these options as the default ones in case we push and pop them later
     while processing functions with potential target attributes.  */
  target_option_default_node = target_option_current_node
    = build_target_option_node (&global_options);
/* Implement targetm.override_options_after_change.  */

aarch64_override_options_after_change (void)

  aarch64_override_options_after_change_1 (&global_options);

static struct machine_function *
aarch64_init_machine_status (void)

  struct machine_function *machine;
  machine = ggc_cleared_alloc<machine_function> ();

aarch64_init_expanders (void)

  init_machine_status = aarch64_init_machine_status;

/* A checking mechanism for the implementation of the various code models.  */

initialize_aarch64_code_model (struct gcc_options *opts)

  if (opts->x_flag_pic)

      switch (opts->x_aarch64_cmodel_var)

        case AARCH64_CMODEL_TINY:
          aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;

        case AARCH64_CMODEL_SMALL:
#ifdef HAVE_AS_SMALL_PIC_RELOCS
          aarch64_cmodel = (flag_pic == 2
                            ? AARCH64_CMODEL_SMALL_PIC
                            : AARCH64_CMODEL_SMALL_SPIC);

          aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;

        case AARCH64_CMODEL_LARGE:
          sorry ("code model %qs with -f%s", "large",
                 opts->x_flag_pic > 1 ? "PIC" : "pic");

          gcc_unreachable ();

    aarch64_cmodel = opts->x_aarch64_cmodel_var;

/* Implement TARGET_OPTION_SAVE.  */

aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)

  ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;

/* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
   using the information saved in PTR.  */

aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)

  opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
  selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
  opts->x_explicit_arch = ptr->x_explicit_arch;
  selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
  opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;

  aarch64_override_options_internal (opts);

/* Implement TARGET_OPTION_PRINT.  */

aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)

  const struct processor *cpu
    = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
  unsigned long isa_flags = ptr->x_aarch64_isa_flags;
  const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
  std::string extension
    = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);

  fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
  fprintf (file, "%*sselected arch = %s%s\n", indent, "",
           arch->name, extension.c_str ());

static GTY(()) tree aarch64_previous_fndecl;

aarch64_reset_previous_fndecl (void)

  aarch64_previous_fndecl = NULL;

/* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
   Used by aarch64_set_current_function and aarch64_pragma_target_parse to
   make sure optab availability predicates are recomputed when necessary.  */

aarch64_save_restore_target_globals (tree new_tree)

  if (TREE_TARGET_GLOBALS (new_tree))
    restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
  else if (new_tree == target_option_default_node)
    restore_target_globals (&default_target_globals);

    TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();

/* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
   like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
   of the function, if such exists.  This function may be called multiple
   times on a single function so use aarch64_previous_fndecl to avoid
   setting up identical state.  */

aarch64_set_current_function (tree fndecl)

  if (!fndecl || fndecl == aarch64_previous_fndecl)

  tree old_tree = (aarch64_previous_fndecl
                   ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)

  tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);

  /* If current function has no attributes but the previous one did,
     use the default node.  */
  if (!new_tree && old_tree)
    new_tree = target_option_default_node;

  /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
     the default have been handled by aarch64_save_restore_target_globals from
     aarch64_pragma_target_parse.  */
  if (old_tree == new_tree)

  aarch64_previous_fndecl = fndecl;

  /* First set the target options.  */
  cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));

  aarch64_save_restore_target_globals (new_tree);
/* Enum describing the various ways we can handle attributes.
   In many cases we can reuse the generic option handling machinery.  */

enum aarch64_attr_opt_type
{
  aarch64_attr_mask,    /* Attribute should set a bit in target_flags.  */
  aarch64_attr_bool,    /* Attribute sets or unsets a boolean variable.  */
  aarch64_attr_enum,    /* Attribute sets an enum variable.  */
  aarch64_attr_custom   /* Attribute requires a custom handling function.  */
};

/* All the information needed to handle a target attribute.
   NAME is the name of the attribute.
   ATTR_TYPE specifies the type of behavior of the attribute as described
   in the definition of enum aarch64_attr_opt_type.
   ALLOW_NEG is true if the attribute supports a "no-" form.
   HANDLER is the function that takes the attribute string as an argument.
   It is needed only when the ATTR_TYPE is aarch64_attr_custom.
   OPT_NUM is the enum specifying the option that the attribute modifies.
   This is needed for attributes that mirror the behavior of a command-line
   option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
   aarch64_attr_enum.  */

struct aarch64_attribute_info
{
  const char *name;
  enum aarch64_attr_opt_type attr_type;
  bool allow_neg;
  bool (*handler) (const char *);
  enum opt_code opt_num;
};
/* Handle the ARCH_STR argument to the arch= target attribute.  */

static bool
aarch64_handle_attr_arch (const char *str)
{
  const struct processor *tmp_arch = NULL;
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);

  if (parse_res == AARCH64_PARSE_OK)
    {
      gcc_assert (tmp_arch);
      selected_arch = tmp_arch;
      explicit_arch = selected_arch->arch;
      return true;
    }

  switch (parse_res)
    {
      case AARCH64_PARSE_MISSING_ARG:
        error ("missing name in %<target(\"arch=\")%> pragma or attribute");
        break;
      case AARCH64_PARSE_INVALID_ARG:
        error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
        aarch64_print_hint_for_arch (str);
        break;
      case AARCH64_PARSE_INVALID_FEATURE:
        error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
        break;
      default:
        gcc_unreachable ();
    }

  return false;
}
/* Handle the argument CPU_STR to the cpu= target attribute.  */

static bool
aarch64_handle_attr_cpu (const char *str)
{
  const struct processor *tmp_cpu = NULL;
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);

  if (parse_res == AARCH64_PARSE_OK)
    {
      gcc_assert (tmp_cpu);
      selected_tune = tmp_cpu;
      explicit_tune_core = selected_tune->ident;

      selected_arch = &all_architectures[tmp_cpu->arch];
      explicit_arch = selected_arch->arch;
      return true;
    }

  switch (parse_res)
    {
      case AARCH64_PARSE_MISSING_ARG:
        error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
        break;
      case AARCH64_PARSE_INVALID_ARG:
        error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
        aarch64_print_hint_for_core (str);
        break;
      case AARCH64_PARSE_INVALID_FEATURE:
        error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
        break;
      default:
        gcc_unreachable ();
    }

  return false;
}
/* Handle the argument STR to the tune= target attribute.  */

static bool
aarch64_handle_attr_tune (const char *str)
{
  const struct processor *tmp_tune = NULL;
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_tune (str, &tmp_tune);

  if (parse_res == AARCH64_PARSE_OK)
    {
      gcc_assert (tmp_tune);
      selected_tune = tmp_tune;
      explicit_tune_core = selected_tune->ident;
      return true;
    }

  switch (parse_res)
    {
      case AARCH64_PARSE_INVALID_ARG:
        error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
        aarch64_print_hint_for_core (str);
        break;
      default:
        gcc_unreachable ();
    }

  return false;
}
/* Parse an architecture extensions target attribute string specified in STR.
   For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
   if successful.  Update aarch64_isa_flags to reflect the ISA features
   that were requested.  */

static bool
aarch64_handle_attr_isa_flags (char *str)
{
  enum aarch64_parse_opt_result parse_res;
  unsigned long isa_flags = aarch64_isa_flags;

  /* We allow "+nothing" in the beginning to clear out all architectural
     features if the user wants to handpick specific features.  */
  if (strncmp ("+nothing", str, 8) == 0)
    {
      isa_flags = 0;
      str += 8;
    }

  parse_res = aarch64_parse_extension (str, &isa_flags);

  if (parse_res == AARCH64_PARSE_OK)
    {
      aarch64_isa_flags = isa_flags;
      return true;
    }

  switch (parse_res)
    {
      case AARCH64_PARSE_MISSING_ARG:
        error ("missing value in %<target()%> pragma or attribute");
        break;

      case AARCH64_PARSE_INVALID_FEATURE:
        error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
        break;

      default:
        gcc_unreachable ();
    }

  return false;
}
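/* For example, __attribute__ ((target ("+nothing+simd"))) clears the
   architectural feature bits via the "+nothing" prefix handled above and
   then turns back on just the features named after it, while
   __attribute__ ((target ("+crc"))) adds CRC on top of whatever is already
   in aarch64_isa_flags.  */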
/* The target attributes that we support.  On top of these we also support just
   ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
   handled explicitly in aarch64_process_one_target_attr.  */

static const struct aarch64_attribute_info aarch64_attributes[] =
{
  { "general-regs-only", aarch64_attr_mask, false, NULL,
     OPT_mgeneral_regs_only },
  { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
     OPT_mfix_cortex_a53_835769 },
  { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
     OPT_mfix_cortex_a53_843419 },
  { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
  { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
  { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
     OPT_momit_leaf_frame_pointer },
  { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
  { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
     OPT_march_ },
  { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
  { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
     OPT_mtune_ },
  { "sign-return-address", aarch64_attr_enum, false, NULL,
     OPT_msign_return_address_ },
  { NULL, aarch64_attr_custom, false, NULL, OPT____ }
};
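/* For example, the { "strict-align", aarch64_attr_mask, false, NULL,
   OPT_mstrict_align } entry above means that
   __attribute__ ((target ("strict-align"))) behaves like -mstrict-align:
   it takes no argument, has no "no-" form, and the relevant bit is applied
   to target_flags via aarch64_handle_option.  The custom entries ("arch",
   "cpu", "tune") instead expect an argument,
   e.g. __attribute__ ((target ("arch=armv8.1-a"))).  */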
/* Parse ARG_STR which contains the definition of one target attribute.
   Show appropriate errors if any or return true if the attribute is valid.  */

static bool
aarch64_process_one_target_attr (char *arg_str)
{
  bool invert = false;

  size_t len = strlen (arg_str);

  if (len == 0)
    {
      error ("malformed %<target()%> pragma or attribute");
      return false;
    }

  char *str_to_check = (char *) alloca (len + 1);
  strcpy (str_to_check, arg_str);

  /* Skip leading whitespace.  */
  while (*str_to_check == ' ' || *str_to_check == '\t')
    str_to_check++;

  /* We have something like __attribute__ ((target ("+fp+nosimd"))).
     It is easier to detect and handle it explicitly here rather than going
     through the machinery for the rest of the target attributes in this
     function.  */
  if (*str_to_check == '+')
    return aarch64_handle_attr_isa_flags (str_to_check);

  if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
    {
      invert = true;
      str_to_check += 3;
    }
  char *arg = strchr (str_to_check, '=');

  /* If we found opt=foo then terminate STR_TO_CHECK at the '='
     and point ARG to "foo".  */
  if (arg)
    {
      *arg = '\0';
      arg++;
    }
  const struct aarch64_attribute_info *p_attr;
  bool found = false;
  for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
    {
      /* If the names don't match up, or the user has given an argument
         to an attribute that doesn't accept one, or didn't give an argument
         to an attribute that expects one, fail to match.  */
      if (strcmp (str_to_check, p_attr->name) != 0)
        continue;

      found = true;
      bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
                             || p_attr->attr_type == aarch64_attr_enum;

      if (attr_need_arg_p ^ (arg != NULL))
        {
          error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
          return false;
        }

      /* If the name matches but the attribute does not allow "no-" versions
         then we can't match.  */
      if (invert && !p_attr->allow_neg)
        {
          error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
          return false;
        }

      switch (p_attr->attr_type)
        {
        /* Has a custom handler registered.
           For example, cpu=, arch=, tune=.  */
          case aarch64_attr_custom:
            gcc_assert (p_attr->handler);
            if (!p_attr->handler (arg))
              return false;
            break;

          /* Either set or unset a boolean option.  */
          case aarch64_attr_bool:
            {
              struct cl_decoded_option decoded;

              generate_option (p_attr->opt_num, NULL, !invert,
                               CL_TARGET, &decoded);
              aarch64_handle_option (&global_options, &global_options_set,
                                     &decoded, input_location);
              break;
            }
          /* Set or unset a bit in the target_flags.  aarch64_handle_option
             should know what mask to apply given the option number.  */
          case aarch64_attr_mask:
            {
              struct cl_decoded_option decoded;
              /* We only need to specify the option number.
                 aarch64_handle_option will know which mask to apply.  */
              decoded.opt_index = p_attr->opt_num;
              decoded.value = !invert;
              aarch64_handle_option (&global_options, &global_options_set,
                                     &decoded, input_location);
              break;
            }
          /* Use the option setting machinery to set an option to an enum.  */
          case aarch64_attr_enum:
            {
              gcc_assert (arg);
              bool valid;
              int value;
              valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
                                             &value, CL_TARGET);
              if (valid)
                set_option (&global_options, NULL, p_attr->opt_num, value,
                            NULL, DK_UNSPECIFIED, input_location,
                            global_dc);
              else
                error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
              break;
            }
          default:
            gcc_unreachable ();
        }
    }

  /* If we reached here we either have found an attribute and validated
     it or didn't match any.  If we matched an attribute but its arguments
     were malformed we will have returned false already.  */
  return found;
}
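/* Walking through one attribute string for illustration:
   "no-omit-leaf-frame-pointer" sets INVERT and strips the "no-" prefix,
   then matches the aarch64_attr_bool entry for "omit-leaf-frame-pointer";
   "arch=armv8.1-a" is split at '=' so ARG points at "armv8.1-a" and is
   handed to the aarch64_handle_attr_arch custom handler; a leading '+'
   as in "+crc" bypasses the table entirely via
   aarch64_handle_attr_isa_flags.  */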
/* Count how many times the character C appears in
   NULL-terminated string STR.  */

static unsigned int
num_occurences_in_str (char c, char *str)
{
  unsigned int res = 0;
  while (*str != '\0')
    {
      if (*str == c)
        res++;
      str++;
    }

  return res;
}
/* Parse the tree in ARGS that contains the target attribute information
   and update the global target options space.  */

bool
aarch64_process_target_attr (tree args)
{
  if (TREE_CODE (args) == TREE_LIST)
    {
      do
        {
          tree head = TREE_VALUE (args);
          if (head)
            {
              if (!aarch64_process_target_attr (head))
                return false;
            }
          args = TREE_CHAIN (args);
        } while (args);

      return true;
    }

  if (TREE_CODE (args) != STRING_CST)
    {
      error ("attribute %<target%> argument not a string");
      return false;
    }

  size_t len = strlen (TREE_STRING_POINTER (args));
  char *str_to_check = (char *) alloca (len + 1);
  strcpy (str_to_check, TREE_STRING_POINTER (args));

  if (len == 0)
    {
      error ("malformed %<target()%> pragma or attribute");
      return false;
    }

  /* Used to catch empty spaces between commas i.e.
     attribute ((target ("attr1,,attr2"))).  */
  unsigned int num_commas = num_occurences_in_str (',', str_to_check);

  /* Handle multiple target attributes separated by ','.  */
  char *token = strtok (str_to_check, ",");

  unsigned int num_attrs = 0;
  while (token)
    {
      num_attrs++;
      if (!aarch64_process_one_target_attr (token))
        {
          error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
          return false;
        }

      token = strtok (NULL, ",");
    }

  if (num_attrs != num_commas + 1)
    {
      error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
      return false;
    }

  return true;
}
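/* For example, __attribute__ ((target ("arch=armv8.1-a,strict-align")))
   arrives here as a single STRING_CST; strtok yields the two tokens and
   NUM_ATTRS becomes 2, matching NUM_COMMAS + 1.  A string such as
   "attr1,,attr2" produces only two tokens for two commas, so the final
   check rejects it as malformed.  */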
/* Implement TARGET_OPTION_VALID_ATTRIBUTE_P.  This is used to
   process attribute ((target ("..."))).  */

static bool
aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
{
  struct cl_target_option cur_target;
  bool ret;
  tree old_optimize;
  tree new_target, new_optimize;
  tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);

  /* If what we're processing is the current pragma string then the
     target option node is already stored in target_option_current_node
     by aarch64_pragma_target_parse in aarch64-c.c.  Use that to avoid
     having to re-parse the string.  This is especially useful to keep
     arm_neon.h compile times down since that header contains a lot
     of intrinsics enclosed in pragmas.  */
  if (!existing_target && args == current_target_pragma)
    {
      DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
      return true;
    }
  tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);

  old_optimize = build_optimization_node (&global_options);
  func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);

  /* If the function changed the optimization levels as well as setting
     target options, start with the optimizations specified.  */
  if (func_optimize && func_optimize != old_optimize)
    cl_optimization_restore (&global_options,
                             TREE_OPTIMIZATION (func_optimize));

  /* Save the current target options to restore at the end.  */
  cl_target_option_save (&cur_target, &global_options);

  /* If fndecl already has some target attributes applied to it, unpack
     them so that we add this attribute on top of them, rather than
     overwriting them.  */
  if (existing_target)
    {
      struct cl_target_option *existing_options
        = TREE_TARGET_OPTION (existing_target);

      if (existing_options)
        cl_target_option_restore (&global_options, existing_options);
    }
  else
    cl_target_option_restore (&global_options,
                              TREE_TARGET_OPTION (target_option_current_node));

  ret = aarch64_process_target_attr (args);

  /* Set up any additional state.  */
  if (ret)
    {
      aarch64_override_options_internal (&global_options);
      /* Initialize SIMD builtins if we haven't already.
         Set current_target_pragma to NULL for the duration so that
         the builtin initialization code doesn't try to tag the functions
         being built with the attributes specified by any current pragma, thus
         going into an infinite recursion.  */
      if (TARGET_SIMD)
        {
          tree saved_current_target_pragma = current_target_pragma;
          current_target_pragma = NULL;
          aarch64_init_simd_builtins ();
          current_target_pragma = saved_current_target_pragma;
        }
      new_target = build_target_option_node (&global_options);
    }
  else
    new_target = NULL;

  new_optimize = build_optimization_node (&global_options);

  if (fndecl && ret)
    {
      DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;

      if (old_optimize != new_optimize)
        DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
    }

  cl_target_option_restore (&global_options, &cur_target);

  if (old_optimize != new_optimize)
    cl_optimization_restore (&global_options,
                             TREE_OPTIMIZATION (old_optimize));
  return ret;
}
/* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
   tri-bool options (yes, no, don't care) and the default value is
   DEF, determine whether to reject inlining.  */

static bool
aarch64_tribools_ok_for_inlining_p (int caller, int callee,
                                    int dont_care, int def)
{
  /* If the callee doesn't care, always allow inlining.  */
  if (callee == dont_care)
    return true;

  /* If the caller doesn't care, always allow inlining.  */
  if (caller == dont_care)
    return true;

  /* Otherwise, allow inlining if either the callee and caller values
     agree, or if the callee is using the default value.  */
  return (callee == caller || callee == def);
}
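/* A worked example of the rule above: in the erratum checks below DONT_CARE
   is 2, so a callee built without an explicit setting always inlines; if the
   callee explicitly enables a workaround, the caller explicitly disables it,
   and enabling it is not the default, then the callee value agrees with
   neither the caller nor the default and inlining is rejected.  */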
/* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
   to inline CALLEE into CALLER based on target-specific info.
   Make sure that the caller and callee have compatible architectural
   features.  Then go through the other possible target attributes
   and see if they can block inlining.  Try not to reject always_inline
   callees unless they are incompatible architecturally.  */

static bool
aarch64_can_inline_p (tree caller, tree callee)
{
  tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
  tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);

  /* If callee has no option attributes, then it is ok to inline.  */
  if (!callee_tree)
    return true;

  struct cl_target_option *caller_opts
    = TREE_TARGET_OPTION (caller_tree ? caller_tree
                          : target_option_default_node);

  struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);

  /* Callee's ISA flags should be a subset of the caller's.  */
  if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
      != callee_opts->x_aarch64_isa_flags)
    return false;

  /* Allow non-strict aligned functions inlining into strict
     aligned ones.  */
  if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
       != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
      && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
           && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
    return false;

  bool always_inline = lookup_attribute ("always_inline",
                                         DECL_ATTRIBUTES (callee));

  /* If the architectural features match up and the callee is always_inline
     then the other attributes don't matter.  */
  if (always_inline)
    return true;

  if (caller_opts->x_aarch64_cmodel_var
      != callee_opts->x_aarch64_cmodel_var)
    return false;

  if (caller_opts->x_aarch64_tls_dialect
      != callee_opts->x_aarch64_tls_dialect)
    return false;

  /* Honour explicit requests to workaround errata.  */
  if (!aarch64_tribools_ok_for_inlining_p (
          caller_opts->x_aarch64_fix_a53_err835769,
          callee_opts->x_aarch64_fix_a53_err835769,
          2, TARGET_FIX_ERR_A53_835769_DEFAULT))
    return false;

  if (!aarch64_tribools_ok_for_inlining_p (
          caller_opts->x_aarch64_fix_a53_err843419,
          callee_opts->x_aarch64_fix_a53_err843419,
          2, TARGET_FIX_ERR_A53_843419))
    return false;

  /* If the user explicitly specified -momit-leaf-frame-pointer for the
     caller and callee and they don't match up, reject inlining.  */
  if (!aarch64_tribools_ok_for_inlining_p (
          caller_opts->x_flag_omit_leaf_frame_pointer,
          callee_opts->x_flag_omit_leaf_frame_pointer,
          2, 1))
    return false;

  /* If the callee has specific tuning overrides, respect them.  */
  if (callee_opts->x_aarch64_override_tune_string != NULL
      && caller_opts->x_aarch64_override_tune_string == NULL)
    return false;

  /* If the user specified tuning override strings for the
     caller and callee and they don't match up, reject inlining.
     We just do a string compare here, we don't analyze the meaning
     of the string, as it would be too costly for little gain.  */
  if (callee_opts->x_aarch64_override_tune_string
      && caller_opts->x_aarch64_override_tune_string
      && (strcmp (callee_opts->x_aarch64_override_tune_string,
                  caller_opts->x_aarch64_override_tune_string) != 0))
    return false;

  return true;
}
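/* For example, a callee marked __attribute__ ((target ("+crc"))) only
   inlines into callers whose ISA flags already include CRC, because the
   subset test above fails otherwise; marking the callee always_inline
   relaxes the later checks (cmodel, tls-dialect, errata, tuning) but not
   the architectural ones.  */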
/* Return true if SYMBOL_REF X binds locally.  */

static bool
aarch64_symbol_binds_local_p (const_rtx x)
{
  return (SYMBOL_REF_DECL (x)
          ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
          : SYMBOL_REF_LOCAL_P (x));
}

/* Return true if SYMBOL_REF X is thread local.  */
static bool
aarch64_tls_symbol_p (rtx x)
{
  if (! TARGET_HAVE_TLS)
    return false;

  if (GET_CODE (x) != SYMBOL_REF)
    return false;

  return SYMBOL_REF_TLS_MODEL (x) != 0;
}
/* Classify a TLS symbol into one of the TLS kinds.  */
enum aarch64_symbol_type
aarch64_classify_tls_symbol (rtx x)
{
  enum tls_model tls_kind = tls_symbolic_operand_type (x);

  switch (tls_kind)
    {
    case TLS_MODEL_GLOBAL_DYNAMIC:
    case TLS_MODEL_LOCAL_DYNAMIC:
      return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;

    case TLS_MODEL_INITIAL_EXEC:
      switch (aarch64_cmodel)
        {
        case AARCH64_CMODEL_TINY:
        case AARCH64_CMODEL_TINY_PIC:
          return SYMBOL_TINY_TLSIE;
        default:
          return SYMBOL_SMALL_TLSIE;
        }

    case TLS_MODEL_LOCAL_EXEC:
      if (aarch64_tls_size == 12)
        return SYMBOL_TLSLE12;
      else if (aarch64_tls_size == 24)
        return SYMBOL_TLSLE24;
      else if (aarch64_tls_size == 32)
        return SYMBOL_TLSLE32;
      else if (aarch64_tls_size == 48)
        return SYMBOL_TLSLE48;
      else
        gcc_unreachable ();

    case TLS_MODEL_EMULATED:
    case TLS_MODEL_NONE:
      return SYMBOL_FORCE_TO_MEM;

    default:
      gcc_unreachable ();
    }
}
/* Return the correct method for accessing X + OFFSET, where X is either
   a SYMBOL_REF or LABEL_REF.  */

enum aarch64_symbol_type
aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
{
  if (GET_CODE (x) == LABEL_REF)
    {
      switch (aarch64_cmodel)
        {
        case AARCH64_CMODEL_LARGE:
          return SYMBOL_FORCE_TO_MEM;

        case AARCH64_CMODEL_TINY_PIC:
        case AARCH64_CMODEL_TINY:
          return SYMBOL_TINY_ABSOLUTE;

        case AARCH64_CMODEL_SMALL_SPIC:
        case AARCH64_CMODEL_SMALL_PIC:
        case AARCH64_CMODEL_SMALL:
          return SYMBOL_SMALL_ABSOLUTE;

        default:
          gcc_unreachable ();
        }
    }

  if (GET_CODE (x) == SYMBOL_REF)
    {
      if (aarch64_tls_symbol_p (x))
        return aarch64_classify_tls_symbol (x);

      switch (aarch64_cmodel)
        {
        case AARCH64_CMODEL_TINY:
          /* When we retrieve symbol + offset address, we have to make sure
             the offset does not cause overflow of the final address.  But
             we have no way of knowing the address of symbol at compile time
             so we can't accurately say if the distance between the PC and
             symbol + offset is outside the addressable range of +/-1M in the
             TINY code model.  So we rely on images not being greater than
             1M and cap the offset at 1M and anything beyond 1M will have to
             be loaded using an alternative mechanism.  Furthermore if the
             symbol is a weak reference to something that isn't known to
             resolve to a symbol in this module, then force to memory.  */
          if ((SYMBOL_REF_WEAK (x)
               && !aarch64_symbol_binds_local_p (x))
              || !IN_RANGE (offset, -1048575, 1048575))
            return SYMBOL_FORCE_TO_MEM;
          return SYMBOL_TINY_ABSOLUTE;

        case AARCH64_CMODEL_SMALL:
          /* Same reasoning as the tiny code model, but the offset cap here
             is 4G.  */
          if ((SYMBOL_REF_WEAK (x)
               && !aarch64_symbol_binds_local_p (x))
              || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
                            HOST_WIDE_INT_C (4294967264)))
            return SYMBOL_FORCE_TO_MEM;
          return SYMBOL_SMALL_ABSOLUTE;

        case AARCH64_CMODEL_TINY_PIC:
          if (!aarch64_symbol_binds_local_p (x))
            return SYMBOL_TINY_GOT;
          return SYMBOL_TINY_ABSOLUTE;

        case AARCH64_CMODEL_SMALL_SPIC:
        case AARCH64_CMODEL_SMALL_PIC:
          if (!aarch64_symbol_binds_local_p (x))
            return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
                    ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
          return SYMBOL_SMALL_ABSOLUTE;

        case AARCH64_CMODEL_LARGE:
          /* This is alright even in PIC code as the constant
             pool reference is always PC relative and within
             the same translation unit.  */
          if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
            return SYMBOL_SMALL_ABSOLUTE;
          else
            return SYMBOL_FORCE_TO_MEM;

        default:
          gcc_unreachable ();
        }
    }

  /* By default push everything into the constant pool.  */
  return SYMBOL_FORCE_TO_MEM;
}
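/* For illustration, under the small PIC code models a symbol that binds
   locally is still SYMBOL_SMALL_ABSOLUTE (materialised with adrp plus a
   :lo12: relocation), whereas a preemptible one goes through the GOT:
   SYMBOL_SMALL_GOT_28K for -fpic (AARCH64_CMODEL_SMALL_SPIC) and
   SYMBOL_SMALL_GOT_4G for -fPIC (AARCH64_CMODEL_SMALL_PIC).  */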
bool
aarch64_constant_address_p (rtx x)
{
  return (CONSTANT_P (x) && memory_address_p (DImode, x));
}

bool
aarch64_legitimate_pic_operand_p (rtx x)
{
  if (GET_CODE (x) == SYMBOL_REF
      || (GET_CODE (x) == CONST
          && GET_CODE (XEXP (x, 0)) == PLUS
          && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
    return false;

  return true;
}
/* Implement TARGET_LEGITIMATE_CONSTANT_P hook.  Return true for constants
   that should be rematerialized rather than spilled.  */

static bool
aarch64_legitimate_constant_p (machine_mode mode, rtx x)
{
  /* Support CSE and rematerialization of common constants.  */
  if (CONST_INT_P (x) || CONST_DOUBLE_P (x) || GET_CODE (x) == CONST_VECTOR)
    return true;

  /* Do not allow vector struct mode constants for Advanced SIMD.
     We could support 0 and -1 easily, but they need support in
     aarch64-simd.md.  */
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
    return false;

  /* Do not allow wide int constants - this requires support in movti.  */
  if (CONST_WIDE_INT_P (x))
    return false;

  /* Only accept variable-length vector constants if they can be
     handled directly.

     ??? It would be possible to handle rematerialization of other
     constants via secondary reloads.  */
  if (vec_flags & VEC_ANY_SVE)
    return aarch64_simd_valid_immediate (x, NULL);

  if (GET_CODE (x) == HIGH)
    x = XEXP (x, 0);

  /* Accept polynomial constants that can be calculated by using the
     destination of a move as the sole temporary.  Constants that
     require a second temporary cannot be rematerialized (they can't be
     forced to memory and also aren't legitimate constants).  */
  poly_int64 offset;
  if (poly_int_rtx_p (x, &offset))
    return aarch64_offset_temporaries (false, offset) <= 1;

  /* If an offset is being added to something else, we need to allow the
     base to be moved into the destination register, meaning that there
     are no free temporaries for the offset.  */
  x = strip_offset (x, &offset);
  if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
    return false;

  /* Do not allow const (plus (anchor_symbol, const_int)).  */
  if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
    return false;

  /* Treat symbols as constants.  Avoid TLS symbols as they are complex,
     so spilling them is better than rematerialization.  */
  if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
    return true;

  /* Label references are always constant.  */
  if (GET_CODE (x) == LABEL_REF)
    return true;

  return false;
}
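/* For example, a plain SYMBOL_REF to a non-TLS global is kept as a
   legitimate constant here so CSE can rematerialise the address
   computation, whereas a TLS symbol or a CONST_WIDE_INT is spilled
   instead, per the checks above.  */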
static rtx
aarch64_load_tp (rtx target)
{
  if (!target
      || GET_MODE (target) != Pmode
      || !register_operand (target, Pmode))
    target = gen_reg_rtx (Pmode);

  /* Can return in any reg.  */
  emit_insn (gen_aarch64_load_tp_hard (target));
  return target;
}
11709 /* On AAPCS systems, this is the "struct __va_list". */
11710 static GTY(()) tree va_list_type
;
11712 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
11713 Return the type to use as __builtin_va_list.
11715 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
11727 aarch64_build_builtin_va_list (void)
11730 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
11732 /* Create the type. */
11733 va_list_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
11734 /* Give it the required name. */
11735 va_list_name
= build_decl (BUILTINS_LOCATION
,
11737 get_identifier ("__va_list"),
11739 DECL_ARTIFICIAL (va_list_name
) = 1;
11740 TYPE_NAME (va_list_type
) = va_list_name
;
11741 TYPE_STUB_DECL (va_list_type
) = va_list_name
;
11743 /* Create the fields. */
11744 f_stack
= build_decl (BUILTINS_LOCATION
,
11745 FIELD_DECL
, get_identifier ("__stack"),
11747 f_grtop
= build_decl (BUILTINS_LOCATION
,
11748 FIELD_DECL
, get_identifier ("__gr_top"),
11750 f_vrtop
= build_decl (BUILTINS_LOCATION
,
11751 FIELD_DECL
, get_identifier ("__vr_top"),
11753 f_groff
= build_decl (BUILTINS_LOCATION
,
11754 FIELD_DECL
, get_identifier ("__gr_offs"),
11755 integer_type_node
);
11756 f_vroff
= build_decl (BUILTINS_LOCATION
,
11757 FIELD_DECL
, get_identifier ("__vr_offs"),
11758 integer_type_node
);
11760 /* Tell tree-stdarg pass about our internal offset fields.
11761 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
11762 purpose to identify whether the code is updating va_list internal
11763 offset fields through irregular way. */
11764 va_list_gpr_counter_field
= f_groff
;
11765 va_list_fpr_counter_field
= f_vroff
;
11767 DECL_ARTIFICIAL (f_stack
) = 1;
11768 DECL_ARTIFICIAL (f_grtop
) = 1;
11769 DECL_ARTIFICIAL (f_vrtop
) = 1;
11770 DECL_ARTIFICIAL (f_groff
) = 1;
11771 DECL_ARTIFICIAL (f_vroff
) = 1;
11773 DECL_FIELD_CONTEXT (f_stack
) = va_list_type
;
11774 DECL_FIELD_CONTEXT (f_grtop
) = va_list_type
;
11775 DECL_FIELD_CONTEXT (f_vrtop
) = va_list_type
;
11776 DECL_FIELD_CONTEXT (f_groff
) = va_list_type
;
11777 DECL_FIELD_CONTEXT (f_vroff
) = va_list_type
;
11779 TYPE_FIELDS (va_list_type
) = f_stack
;
11780 DECL_CHAIN (f_stack
) = f_grtop
;
11781 DECL_CHAIN (f_grtop
) = f_vrtop
;
11782 DECL_CHAIN (f_vrtop
) = f_groff
;
11783 DECL_CHAIN (f_groff
) = f_vroff
;
11785 /* Compute its layout. */
11786 layout_type (va_list_type
);
11788 return va_list_type
;
11791 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
11793 aarch64_expand_builtin_va_start (tree valist
, rtx nextarg ATTRIBUTE_UNUSED
)
11795 const CUMULATIVE_ARGS
*cum
;
11796 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
11797 tree stack
, grtop
, vrtop
, groff
, vroff
;
11799 int gr_save_area_size
= cfun
->va_list_gpr_size
;
11800 int vr_save_area_size
= cfun
->va_list_fpr_size
;
11803 cum
= &crtl
->args
.info
;
11804 if (cfun
->va_list_gpr_size
)
11805 gr_save_area_size
= MIN ((NUM_ARG_REGS
- cum
->aapcs_ncrn
) * UNITS_PER_WORD
,
11806 cfun
->va_list_gpr_size
);
11807 if (cfun
->va_list_fpr_size
)
11808 vr_save_area_size
= MIN ((NUM_FP_ARG_REGS
- cum
->aapcs_nvrn
)
11809 * UNITS_PER_VREG
, cfun
->va_list_fpr_size
);
11813 gcc_assert (cum
->aapcs_nvrn
== 0);
11814 vr_save_area_size
= 0;
11817 f_stack
= TYPE_FIELDS (va_list_type_node
);
11818 f_grtop
= DECL_CHAIN (f_stack
);
11819 f_vrtop
= DECL_CHAIN (f_grtop
);
11820 f_groff
= DECL_CHAIN (f_vrtop
);
11821 f_vroff
= DECL_CHAIN (f_groff
);
11823 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), valist
, f_stack
,
11825 grtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
), valist
, f_grtop
,
11827 vrtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
), valist
, f_vrtop
,
11829 groff
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
), valist
, f_groff
,
11831 vroff
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
), valist
, f_vroff
,
11834 /* Emit code to initialize STACK, which points to the next varargs stack
11835 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
11836 by named arguments. STACK is 8-byte aligned. */
11837 t
= make_tree (TREE_TYPE (stack
), virtual_incoming_args_rtx
);
11838 if (cum
->aapcs_stack_size
> 0)
11839 t
= fold_build_pointer_plus_hwi (t
, cum
->aapcs_stack_size
* UNITS_PER_WORD
);
11840 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), stack
, t
);
11841 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
11843 /* Emit code to initialize GRTOP, the top of the GR save area.
11844 virtual_incoming_args_rtx should have been 16 byte aligned. */
11845 t
= make_tree (TREE_TYPE (grtop
), virtual_incoming_args_rtx
);
11846 t
= build2 (MODIFY_EXPR
, TREE_TYPE (grtop
), grtop
, t
);
11847 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
11849 /* Emit code to initialize VRTOP, the top of the VR save area.
11850 This address is gr_save_area_bytes below GRTOP, rounded
11851 down to the next 16-byte boundary. */
11852 t
= make_tree (TREE_TYPE (vrtop
), virtual_incoming_args_rtx
);
11853 vr_offset
= ROUND_UP (gr_save_area_size
,
11854 STACK_BOUNDARY
/ BITS_PER_UNIT
);
11857 t
= fold_build_pointer_plus_hwi (t
, -vr_offset
);
11858 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vrtop
), vrtop
, t
);
11859 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
11861 /* Emit code to initialize GROFF, the offset from GRTOP of the
11862 next GPR argument. */
11863 t
= build2 (MODIFY_EXPR
, TREE_TYPE (groff
), groff
,
11864 build_int_cst (TREE_TYPE (groff
), -gr_save_area_size
));
11865 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
11867 /* Likewise emit code to initialize VROFF, the offset from FTOP
11868 of the next VR argument. */
11869 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vroff
), vroff
,
11870 build_int_cst (TREE_TYPE (vroff
), -vr_save_area_size
));
11871 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
11874 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
11877 aarch64_gimplify_va_arg_expr (tree valist
, tree type
, gimple_seq
*pre_p
,
11878 gimple_seq
*post_p ATTRIBUTE_UNUSED
)
11882 bool is_ha
; /* is HFA or HVA. */
11883 bool dw_align
; /* double-word align. */
11884 machine_mode ag_mode
= VOIDmode
;
11888 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
11889 tree stack
, f_top
, f_off
, off
, arg
, roundup
, on_stack
;
11890 HOST_WIDE_INT size
, rsize
, adjust
, align
;
11891 tree t
, u
, cond1
, cond2
;
11893 indirect_p
= pass_by_reference (NULL
, TYPE_MODE (type
), type
, false);
11895 type
= build_pointer_type (type
);
11897 mode
= TYPE_MODE (type
);
11899 f_stack
= TYPE_FIELDS (va_list_type_node
);
11900 f_grtop
= DECL_CHAIN (f_stack
);
11901 f_vrtop
= DECL_CHAIN (f_grtop
);
11902 f_groff
= DECL_CHAIN (f_vrtop
);
11903 f_vroff
= DECL_CHAIN (f_groff
);
11905 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), unshare_expr (valist
),
11906 f_stack
, NULL_TREE
);
11907 size
= int_size_in_bytes (type
);
11908 align
= aarch64_function_arg_alignment (mode
, type
) / BITS_PER_UNIT
;
11912 if (aarch64_vfp_is_call_or_return_candidate (mode
,
11918 /* No frontends can create types with variable-sized modes, so we
11919 shouldn't be asked to pass or return them. */
11920 unsigned int ag_size
= GET_MODE_SIZE (ag_mode
).to_constant ();
11922 /* TYPE passed in fp/simd registers. */
11924 aarch64_err_no_fpadvsimd (mode
, "varargs");
11926 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
),
11927 unshare_expr (valist
), f_vrtop
, NULL_TREE
);
11928 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
),
11929 unshare_expr (valist
), f_vroff
, NULL_TREE
);
11931 rsize
= nregs
* UNITS_PER_VREG
;
11935 if (BYTES_BIG_ENDIAN
&& ag_size
< UNITS_PER_VREG
)
11936 adjust
= UNITS_PER_VREG
- ag_size
;
11938 else if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
11939 && size
< UNITS_PER_VREG
)
11941 adjust
= UNITS_PER_VREG
- size
;
11946 /* TYPE passed in general registers. */
11947 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
),
11948 unshare_expr (valist
), f_grtop
, NULL_TREE
);
11949 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
),
11950 unshare_expr (valist
), f_groff
, NULL_TREE
);
11951 rsize
= ROUND_UP (size
, UNITS_PER_WORD
);
11952 nregs
= rsize
/ UNITS_PER_WORD
;
11957 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
11958 && size
< UNITS_PER_WORD
)
11960 adjust
= UNITS_PER_WORD
- size
;
11964 /* Get a local temporary for the field value. */
11965 off
= get_initialized_tmp_var (f_off
, pre_p
, NULL
);
11967 /* Emit code to branch if off >= 0. */
11968 t
= build2 (GE_EXPR
, boolean_type_node
, off
,
11969 build_int_cst (TREE_TYPE (off
), 0));
11970 cond1
= build3 (COND_EXPR
, ptr_type_node
, t
, NULL_TREE
, NULL_TREE
);
11974 /* Emit: offs = (offs + 15) & -16. */
11975 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
11976 build_int_cst (TREE_TYPE (off
), 15));
11977 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (off
), t
,
11978 build_int_cst (TREE_TYPE (off
), -16));
11979 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (off
), off
, t
);
11984 /* Update ap.__[g|v]r_offs */
11985 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
11986 build_int_cst (TREE_TYPE (off
), rsize
));
11987 t
= build2 (MODIFY_EXPR
, TREE_TYPE (f_off
), unshare_expr (f_off
), t
);
11991 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
11993 /* [cond2] if (ap.__[g|v]r_offs > 0) */
11994 u
= build2 (GT_EXPR
, boolean_type_node
, unshare_expr (f_off
),
11995 build_int_cst (TREE_TYPE (f_off
), 0));
11996 cond2
= build3 (COND_EXPR
, ptr_type_node
, u
, NULL_TREE
, NULL_TREE
);
11998 /* String up: make sure the assignment happens before the use. */
11999 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (cond2
), t
, cond2
);
12000 COND_EXPR_ELSE (cond1
) = t
;
12002 /* Prepare the trees handling the argument that is passed on the stack;
12003 the top level node will store in ON_STACK. */
12004 arg
= get_initialized_tmp_var (stack
, pre_p
, NULL
);
12007 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
12008 t
= fold_convert (intDI_type_node
, arg
);
12009 t
= build2 (PLUS_EXPR
, TREE_TYPE (t
), t
,
12010 build_int_cst (TREE_TYPE (t
), 15));
12011 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
12012 build_int_cst (TREE_TYPE (t
), -16));
12013 t
= fold_convert (TREE_TYPE (arg
), t
);
12014 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (arg
), arg
, t
);
12018 /* Advance ap.__stack */
12019 t
= fold_convert (intDI_type_node
, arg
);
12020 t
= build2 (PLUS_EXPR
, TREE_TYPE (t
), t
,
12021 build_int_cst (TREE_TYPE (t
), size
+ 7));
12022 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
12023 build_int_cst (TREE_TYPE (t
), -8));
12024 t
= fold_convert (TREE_TYPE (arg
), t
);
12025 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), unshare_expr (stack
), t
);
12026 /* String up roundup and advance. */
12028 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
12029 /* String up with arg */
12030 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), t
, arg
);
12031 /* Big-endianness related address adjustment. */
12032 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
12033 && size
< UNITS_PER_WORD
)
12035 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (arg
), arg
,
12036 size_int (UNITS_PER_WORD
- size
));
12037 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), on_stack
, t
);
12040 COND_EXPR_THEN (cond1
) = unshare_expr (on_stack
);
12041 COND_EXPR_THEN (cond2
) = unshare_expr (on_stack
);
12043 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
12046 t
= build2 (PREINCREMENT_EXPR
, TREE_TYPE (off
), off
,
12047 build_int_cst (TREE_TYPE (off
), adjust
));
12049 t
= fold_convert (sizetype
, t
);
12050 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (f_top
), f_top
, t
);
12054 /* type ha; // treat as "struct {ftype field[n];}"
12055 ... [computing offs]
12056 for (i = 0; i <nregs; ++i, offs += 16)
12057 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12060 tree tmp_ha
, field_t
, field_ptr_t
;
12062 /* Declare a local variable. */
12063 tmp_ha
= create_tmp_var_raw (type
, "ha");
12064 gimple_add_tmp_var (tmp_ha
);
12066 /* Establish the base type. */
12070 field_t
= float_type_node
;
12071 field_ptr_t
= float_ptr_type_node
;
12074 field_t
= double_type_node
;
12075 field_ptr_t
= double_ptr_type_node
;
12078 field_t
= long_double_type_node
;
12079 field_ptr_t
= long_double_ptr_type_node
;
12082 field_t
= aarch64_fp16_type_node
;
12083 field_ptr_t
= aarch64_fp16_ptr_type_node
;
12088 tree innertype
= make_signed_type (GET_MODE_PRECISION (SImode
));
12089 field_t
= build_vector_type_for_mode (innertype
, ag_mode
);
12090 field_ptr_t
= build_pointer_type (field_t
);
12097 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
12098 tmp_ha
= build1 (ADDR_EXPR
, field_ptr_t
, tmp_ha
);
12100 t
= fold_convert (field_ptr_t
, addr
);
12101 t
= build2 (MODIFY_EXPR
, field_t
,
12102 build1 (INDIRECT_REF
, field_t
, tmp_ha
),
12103 build1 (INDIRECT_REF
, field_t
, t
));
12105 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
12106 for (i
= 1; i
< nregs
; ++i
)
12108 addr
= fold_build_pointer_plus_hwi (addr
, UNITS_PER_VREG
);
12109 u
= fold_convert (field_ptr_t
, addr
);
12110 u
= build2 (MODIFY_EXPR
, field_t
,
12111 build2 (MEM_REF
, field_t
, tmp_ha
,
12112 build_int_cst (field_ptr_t
,
12114 int_size_in_bytes (field_t
)))),
12115 build1 (INDIRECT_REF
, field_t
, u
));
12116 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), t
, u
);
12119 u
= fold_convert (TREE_TYPE (f_top
), tmp_ha
);
12120 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (f_top
), t
, u
);
12123 COND_EXPR_ELSE (cond2
) = t
;
12124 addr
= fold_convert (build_pointer_type (type
), cond1
);
12125 addr
= build_va_arg_indirect_ref (addr
);
12128 addr
= build_va_arg_indirect_ref (addr
);
12133 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
12136 aarch64_setup_incoming_varargs (cumulative_args_t cum_v
, machine_mode mode
,
12137 tree type
, int *pretend_size ATTRIBUTE_UNUSED
,
12140 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
12141 CUMULATIVE_ARGS local_cum
;
12142 int gr_saved
= cfun
->va_list_gpr_size
;
12143 int vr_saved
= cfun
->va_list_fpr_size
;
12145 /* The caller has advanced CUM up to, but not beyond, the last named
12146 argument. Advance a local copy of CUM past the last "real" named
12147 argument, to find out how many registers are left over. */
12149 aarch64_function_arg_advance (pack_cumulative_args(&local_cum
), mode
, type
, true);
12151 /* Found out how many registers we need to save.
12152 Honor tree-stdvar analysis results. */
12153 if (cfun
->va_list_gpr_size
)
12154 gr_saved
= MIN (NUM_ARG_REGS
- local_cum
.aapcs_ncrn
,
12155 cfun
->va_list_gpr_size
/ UNITS_PER_WORD
);
12156 if (cfun
->va_list_fpr_size
)
12157 vr_saved
= MIN (NUM_FP_ARG_REGS
- local_cum
.aapcs_nvrn
,
12158 cfun
->va_list_fpr_size
/ UNITS_PER_VREG
);
12162 gcc_assert (local_cum
.aapcs_nvrn
== 0);
12172 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
12173 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
,
12174 - gr_saved
* UNITS_PER_WORD
);
12175 mem
= gen_frame_mem (BLKmode
, ptr
);
12176 set_mem_alias_set (mem
, get_varargs_alias_set ());
12178 move_block_from_reg (local_cum
.aapcs_ncrn
+ R0_REGNUM
,
12183 /* We can't use move_block_from_reg, because it will use
12184 the wrong mode, storing D regs only. */
12185 machine_mode mode
= TImode
;
12186 int off
, i
, vr_start
;
12188 /* Set OFF to the offset from virtual_incoming_args_rtx of
12189 the first vector register. The VR save area lies below
12190 the GR one, and is aligned to 16 bytes. */
12191 off
= -ROUND_UP (gr_saved
* UNITS_PER_WORD
,
12192 STACK_BOUNDARY
/ BITS_PER_UNIT
);
12193 off
-= vr_saved
* UNITS_PER_VREG
;
12195 vr_start
= V0_REGNUM
+ local_cum
.aapcs_nvrn
;
12196 for (i
= 0; i
< vr_saved
; ++i
)
12200 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
, off
);
12201 mem
= gen_frame_mem (mode
, ptr
);
12202 set_mem_alias_set (mem
, get_varargs_alias_set ());
12203 aarch64_emit_move (mem
, gen_rtx_REG (mode
, vr_start
+ i
));
12204 off
+= UNITS_PER_VREG
;
12209 /* We don't save the size into *PRETEND_SIZE because we want to avoid
12210 any complication of having crtl->args.pretend_args_size changed. */
12211 cfun
->machine
->frame
.saved_varargs_size
12212 = (ROUND_UP (gr_saved
* UNITS_PER_WORD
,
12213 STACK_BOUNDARY
/ BITS_PER_UNIT
)
12214 + vr_saved
* UNITS_PER_VREG
);
static void
aarch64_conditional_register_usage (void)
{
  int i;
  if (!TARGET_FLOAT)
    {
      for (i = V0_REGNUM; i <= V31_REGNUM; i++)
        {
          fixed_regs[i] = 1;
          call_used_regs[i] = 1;
        }
    }
  if (!TARGET_SVE)
    for (i = P0_REGNUM; i <= P15_REGNUM; i++)
      {
        fixed_regs[i] = 1;
        call_used_regs[i] = 1;
      }
}
12237 /* Walk down the type tree of TYPE counting consecutive base elements.
12238 If *MODEP is VOIDmode, then set it to the first valid floating point
12239 type. If a non-floating point type is found, or if a floating point
12240 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12241 otherwise return the count in the sub-tree. */
12243 aapcs_vfp_sub_candidate (const_tree type
, machine_mode
*modep
)
12246 HOST_WIDE_INT size
;
12248 switch (TREE_CODE (type
))
12251 mode
= TYPE_MODE (type
);
12252 if (mode
!= DFmode
&& mode
!= SFmode
12253 && mode
!= TFmode
&& mode
!= HFmode
)
12256 if (*modep
== VOIDmode
)
12259 if (*modep
== mode
)
12265 mode
= TYPE_MODE (TREE_TYPE (type
));
12266 if (mode
!= DFmode
&& mode
!= SFmode
12267 && mode
!= TFmode
&& mode
!= HFmode
)
12270 if (*modep
== VOIDmode
)
12273 if (*modep
== mode
)
12279 /* Use V2SImode and V4SImode as representatives of all 64-bit
12280 and 128-bit vector types. */
12281 size
= int_size_in_bytes (type
);
12294 if (*modep
== VOIDmode
)
12297 /* Vector modes are considered to be opaque: two vectors are
12298 equivalent for the purposes of being homogeneous aggregates
12299 if they are the same size. */
12300 if (*modep
== mode
)
12308 tree index
= TYPE_DOMAIN (type
);
12310 /* Can't handle incomplete types nor sizes that are not
12312 if (!COMPLETE_TYPE_P (type
)
12313 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
12316 count
= aapcs_vfp_sub_candidate (TREE_TYPE (type
), modep
);
12319 || !TYPE_MAX_VALUE (index
)
12320 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index
))
12321 || !TYPE_MIN_VALUE (index
)
12322 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index
))
12326 count
*= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index
))
12327 - tree_to_uhwi (TYPE_MIN_VALUE (index
)));
12329 /* There must be no padding. */
12330 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
12331 count
* GET_MODE_BITSIZE (*modep
)))
12343 /* Can't handle incomplete types nor sizes that are not
12345 if (!COMPLETE_TYPE_P (type
)
12346 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
12349 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
12351 if (TREE_CODE (field
) != FIELD_DECL
)
12354 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
12357 count
+= sub_count
;
12360 /* There must be no padding. */
12361 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
12362 count
* GET_MODE_BITSIZE (*modep
)))
12369 case QUAL_UNION_TYPE
:
12371 /* These aren't very interesting except in a degenerate case. */
12376 /* Can't handle incomplete types nor sizes that are not
12378 if (!COMPLETE_TYPE_P (type
)
12379 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
12382 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
12384 if (TREE_CODE (field
) != FIELD_DECL
)
12387 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
12390 count
= count
> sub_count
? count
: sub_count
;
12393 /* There must be no padding. */
12394 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
12395 count
* GET_MODE_BITSIZE (*modep
)))
/* Return TRUE if the type, as described by TYPE and MODE, is a short vector
   type as described in AAPCS64 \S 4.1.2.

   See the comment above aarch64_composite_type_p for the notes on MODE.  */

static bool
aarch64_short_vector_p (const_tree type,
                        machine_mode mode)
{
  poly_int64 size = -1;

  if (type && TREE_CODE (type) == VECTOR_TYPE)
    size = int_size_in_bytes (type);
  else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
           || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
    size = GET_MODE_SIZE (mode);

  return known_eq (size, 8) || known_eq (size, 16);
}

/* Return TRUE if the type, as described by TYPE and MODE, is a composite
   type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
   array types.  The C99 floating-point complex types are also considered
   as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
   types, which are GCC extensions and out of the scope of AAPCS64, are
   treated as composite types here as well.

   Note that MODE itself is not sufficient in determining whether a type
   is such a composite type or not.  This is because
   stor-layout.c:compute_record_mode may have already changed the MODE
   (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
   structure with only one field may have its MODE set to the mode of the
   field.  Also an integer mode whose size matches the size of the
   RECORD_TYPE type may be used to substitute the original mode
   (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
   solely relied on.  */

static bool
aarch64_composite_type_p (const_tree type,
                          machine_mode mode)
{
  if (aarch64_short_vector_p (type, mode))
    return false;

  if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
    return true;

  if (mode == BLKmode
      || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
      || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
    return true;

  return false;
}
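/* Two quick examples of the distinction: a 16-byte vector such as int32x4_t
   is a short vector (8 or 16 bytes) and therefore not composite, while
   struct { float x, y; } is an aggregate and classifies as composite even
   though compute_record_mode may have given it a non-BLKmode mode.  */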
/* Return TRUE if an argument, whose type is described by TYPE and MODE,
   shall be passed or returned in simd/fp register(s) (providing these
   parameter passing registers are available).

   Upon successful return, *COUNT returns the number of needed registers,
   *BASE_MODE returns the mode of the individual register and when IS_HA
   is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
   floating-point aggregate or a homogeneous short-vector aggregate.  */

static bool
aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
                                         const_tree type,
                                         machine_mode *base_mode,
                                         int *count,
                                         bool *is_ha)
{
  machine_mode new_mode = VOIDmode;
  bool composite_p = aarch64_composite_type_p (type, mode);

  if (is_ha != NULL) *is_ha = false;

  if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
      || aarch64_short_vector_p (type, mode))
    {
      *count = 1;
      new_mode = mode;
    }
  else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
    {
      if (is_ha != NULL) *is_ha = true;
      *count = 2;
      new_mode = GET_MODE_INNER (mode);
    }
  else if (type && composite_p)
    {
      int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);

      if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
        {
          if (is_ha != NULL) *is_ha = true;
          *count = ag_count;
        }
      else
        return false;
    }
  else
    return false;

  *base_mode = new_mode;
  return true;
}
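/* For example, struct { double d[3]; } yields an aapcs_vfp_sub_candidate
   count of 3 with DFmode elements, so it is a homogeneous floating-point
   aggregate (*IS_HA set) passed in three consecutive FP registers when they
   are available, whereas a struct of five floats exceeds HA_MAX_NUM_FLDS
   and falls back to the normal rules.  */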
/* Implement TARGET_STRUCT_VALUE_RTX.  */

static rtx
aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
                          int incoming ATTRIBUTE_UNUSED)
{
  return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
}

/* Implements target hook vector_mode_supported_p.  */
static bool
aarch64_vector_mode_supported_p (machine_mode mode)
{
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
}
12532 /* Return appropriate SIMD container
12533 for MODE within a vector of WIDTH bits. */
12534 static machine_mode
12535 aarch64_simd_container_mode (scalar_mode mode
, poly_int64 width
)
12537 if (TARGET_SVE
&& known_eq (width
, BITS_PER_SVE_VECTOR
))
12553 return VNx16QImode
;
12558 gcc_assert (known_eq (width
, 64) || known_eq (width
, 128));
12561 if (known_eq (width
, 128))
/* Return 128-bit container as the preferred SIMD mode for MODE.  */
static machine_mode
aarch64_preferred_simd_mode (scalar_mode mode)
{
  poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
  return aarch64_simd_container_mode (mode, bits);
}

/* Return a list of possible vector sizes for the vectorizer
   to iterate over.  */
static void
aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
{
  if (TARGET_SVE)
    sizes->safe_push (BYTES_PER_SVE_VECTOR);
  sizes->safe_push (16);
  sizes->safe_push (8);
}
/* Implement TARGET_MANGLE_TYPE.  */

static const char *
aarch64_mangle_type (const_tree type)
{
  /* The AArch64 ABI documents say that "__va_list" has to be
     mangled as if it is in the "std" namespace.  */
  if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
    return "St9__va_list";

  /* Half-precision float.  */
  if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
    return "Dh";

  /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
     builtin types.  */
  if (TYPE_NAME (type) != NULL)
    return aarch64_mangle_builtin_type (type);

  /* Use the default mangling.  */
  return NULL;
}
12643 /* Find the first rtx_insn before insn that will generate an assembly
12647 aarch64_prev_real_insn (rtx_insn
*insn
)
12654 insn
= prev_real_insn (insn
);
12656 while (insn
&& recog_memoized (insn
) < 0);
12662 is_madd_op (enum attr_type t1
)
12665 /* A number of these may be AArch32 only. */
12666 enum attr_type mlatypes
[] = {
12667 TYPE_MLA
, TYPE_MLAS
, TYPE_SMLAD
, TYPE_SMLADX
, TYPE_SMLAL
, TYPE_SMLALD
,
12668 TYPE_SMLALS
, TYPE_SMLALXY
, TYPE_SMLAWX
, TYPE_SMLAWY
, TYPE_SMLAXY
,
12669 TYPE_SMMLA
, TYPE_UMLAL
, TYPE_UMLALS
,TYPE_SMLSD
, TYPE_SMLSDX
, TYPE_SMLSLD
12672 for (i
= 0; i
< sizeof (mlatypes
) / sizeof (enum attr_type
); i
++)
12674 if (t1
== mlatypes
[i
])
12681 /* Check if there is a register dependency between a load and the insn
12682 for which we hold recog_data. */
12685 dep_between_memop_and_curr (rtx memop
)
12690 gcc_assert (GET_CODE (memop
) == SET
);
12692 if (!REG_P (SET_DEST (memop
)))
12695 load_reg
= SET_DEST (memop
);
12696 for (opno
= 1; opno
< recog_data
.n_operands
; opno
++)
12698 rtx operand
= recog_data
.operand
[opno
];
12699 if (REG_P (operand
)
12700 && reg_overlap_mentioned_p (load_reg
, operand
))
12708 /* When working around the Cortex-A53 erratum 835769,
12709 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
12710 instruction and has a preceding memory instruction such that a NOP
12711 should be inserted between them. */
12714 aarch64_madd_needs_nop (rtx_insn
* insn
)
12716 enum attr_type attr_type
;
12720 if (!TARGET_FIX_ERR_A53_835769
)
12723 if (!INSN_P (insn
) || recog_memoized (insn
) < 0)
12726 attr_type
= get_attr_type (insn
);
12727 if (!is_madd_op (attr_type
))
12730 prev
= aarch64_prev_real_insn (insn
);
12731 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
12732 Restore recog state to INSN to avoid state corruption. */
12733 extract_constrain_insn_cached (insn
);
12735 if (!prev
|| !contains_mem_rtx_p (PATTERN (prev
)))
12738 body
= single_set (prev
);
12740 /* If the previous insn is a memory op and there is no dependency between
12741 it and the DImode madd, emit a NOP between them. If body is NULL then we
12742 have a complex memory operation, probably a load/store pair.
12743 Be conservative for now and emit a NOP. */
12744 if (GET_MODE (recog_data
.operand
[0]) == DImode
12745 && (!body
|| !dep_between_memop_and_curr (body
)))
12753 /* Implement FINAL_PRESCAN_INSN. */
12756 aarch64_final_prescan_insn (rtx_insn
*insn
)
12758 if (aarch64_madd_needs_nop (insn
))
12759 fprintf (asm_out_file
, "\tnop // between mem op and mult-accumulate\n");
/* Return the equivalent letter for size.  */
static char
sizetochar (int size)
{
  switch (size)
    {
    case 64: return 'd';
    case 32: return 's';
    case 16: return 'h';
    case 8 : return 'b';
    default: gcc_unreachable ();
    }
}

/* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
   instruction.  */

static bool
aarch64_sve_index_immediate_p (rtx base_or_step)
{
  return (CONST_INT_P (base_or_step)
          && IN_RANGE (INTVAL (base_or_step), -16, 15));
}
/* Return true if X is a valid immediate for the SVE ADD and SUB
   instructions.  Negate X first if NEGATE_P is true.  */

static bool
aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
{
  rtx elt;

  if (!const_vec_duplicate_p (x, &elt)
      || !CONST_INT_P (elt))
    return false;

  HOST_WIDE_INT val = INTVAL (elt);
  if (negate_p)
    val = -val;
  val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));

  if (val & 0xff)
    return IN_RANGE (val, 0, 0xff);
  return IN_RANGE (val, 0, 0xff00);
}

/* Return true if X is a valid immediate operand for an SVE logical
   instruction such as AND.  */

static bool
aarch64_sve_bitmask_immediate_p (rtx x)
{
  rtx elt;

  return (const_vec_duplicate_p (x, &elt)
          && CONST_INT_P (elt)
          && aarch64_bitmask_imm (INTVAL (elt),
                                  GET_MODE_INNER (GET_MODE (x))));
}
/* Return true if X is a valid immediate for the SVE DUP and CPY
   instructions.  */

static bool
aarch64_sve_dup_immediate_p (rtx x)
{
  rtx elt;

  if (!const_vec_duplicate_p (x, &elt)
      || !CONST_INT_P (elt))
    return false;

  HOST_WIDE_INT val = INTVAL (elt);
  if (val & 0xff)
    return IN_RANGE (val, -0x80, 0x7f);
  return IN_RANGE (val, -0x8000, 0x7f00);
}

/* Return true if X is a valid immediate operand for an SVE CMP instruction.
   SIGNED_P says whether the operand is signed rather than unsigned.  */

static bool
aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
{
  rtx elt;

  return (const_vec_duplicate_p (x, &elt)
          && CONST_INT_P (elt)
          && (signed_p
              ? IN_RANGE (INTVAL (elt), -16, 15)
              : IN_RANGE (INTVAL (elt), 0, 127)));
}
/* Return true if X is a valid immediate operand for an SVE FADD or FSUB
   instruction.  Negate X first if NEGATE_P is true.  */

static bool
aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
{
  rtx elt;
  REAL_VALUE_TYPE r;

  if (!const_vec_duplicate_p (x, &elt)
      || GET_CODE (elt) != CONST_DOUBLE)
    return false;

  r = *CONST_DOUBLE_REAL_VALUE (elt);

  if (negate_p)
    r = real_value_negate (&r);

  if (real_equal (&r, &dconst1))
    return true;
  if (real_equal (&r, &dconsthalf))
    return true;
  return false;
}

/* Return true if X is a valid immediate operand for an SVE FMUL
   instruction.  */

static bool
aarch64_sve_float_mul_immediate_p (rtx x)
{
  rtx elt;

  /* GCC will never generate a multiply with an immediate of 2, so there is no
     point testing for it (even though it is a valid constant).  */
  return (const_vec_duplicate_p (x, &elt)
          && GET_CODE (elt) == CONST_DOUBLE
          && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
}
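/* Taken together: an SVE FADD/FSUB immediate is accepted here only for the
   values 0.5 and 1.0 (after the optional negation), and FMUL only for 0.5;
   the comment above notes that 2.0, although a valid FMUL immediate, is
   never generated by GCC and so is not tested.  */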
12896 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
12897 for the Advanced SIMD operation described by WHICH and INSN. If INFO
12898 is nonnull, use it to describe valid immediates. */
12900 aarch64_advsimd_valid_immediate_hs (unsigned int val32
,
12901 simd_immediate_info
*info
,
12902 enum simd_immediate_check which
,
12903 simd_immediate_info::insn_type insn
)
12905 /* Try a 4-byte immediate with LSL. */
12906 for (unsigned int shift
= 0; shift
< 32; shift
+= 8)
12907 if ((val32
& (0xff << shift
)) == val32
)
12910 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
12911 simd_immediate_info::LSL
, shift
);
12915 /* Try a 2-byte immediate with LSL. */
12916 unsigned int imm16
= val32
& 0xffff;
12917 if (imm16
== (val32
>> 16))
12918 for (unsigned int shift
= 0; shift
< 16; shift
+= 8)
12919 if ((imm16
& (0xff << shift
)) == imm16
)
12922 *info
= simd_immediate_info (HImode
, imm16
>> shift
, insn
,
12923 simd_immediate_info::LSL
, shift
);
12927 /* Try a 4-byte immediate with MSL, except for cases that MVN
12929 if (which
== AARCH64_CHECK_MOV
)
12930 for (unsigned int shift
= 8; shift
< 24; shift
+= 8)
12932 unsigned int low
= (1 << shift
) - 1;
12933 if (((val32
& (0xff << shift
)) | low
) == val32
)
12936 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
12937 simd_immediate_info::MSL
, shift
);
12945 /* Return true if replicating VAL64 is a valid immediate for the
12946 Advanced SIMD operation described by WHICH. If INFO is nonnull,
12947 use it to describe valid immediates. */
12949 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64
,
12950 simd_immediate_info
*info
,
12951 enum simd_immediate_check which
)
12953 unsigned int val32
= val64
& 0xffffffff;
12954 unsigned int val16
= val64
& 0xffff;
12955 unsigned int val8
= val64
& 0xff;
12957 if (val32
== (val64
>> 32))
12959 if ((which
& AARCH64_CHECK_ORR
) != 0
12960 && aarch64_advsimd_valid_immediate_hs (val32
, info
, which
,
12961 simd_immediate_info::MOV
))
12964 if ((which
& AARCH64_CHECK_BIC
) != 0
12965 && aarch64_advsimd_valid_immediate_hs (~val32
, info
, which
,
12966 simd_immediate_info::MVN
))
12969 /* Try using a replicated byte. */
12970 if (which
== AARCH64_CHECK_MOV
12971 && val16
== (val32
>> 16)
12972 && val8
== (val16
>> 8))
12975 *info
= simd_immediate_info (QImode
, val8
);
12980 /* Try using a bit-to-bytemask. */
12981 if (which
== AARCH64_CHECK_MOV
)
12984 for (i
= 0; i
< 64; i
+= 8)
12986 unsigned char byte
= (val64
>> i
) & 0xff;
12987 if (byte
!= 0 && byte
!= 0xff)
12993 *info
= simd_immediate_info (DImode
, val64
);
13000 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13001 instruction. If INFO is nonnull, use it to describe valid immediates. */
13004 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64
,
13005 simd_immediate_info
*info
)
13007 scalar_int_mode mode
= DImode
;
13008 unsigned int val32
= val64
& 0xffffffff;
13009 if (val32
== (val64
>> 32))
13012 unsigned int val16
= val32
& 0xffff;
13013 if (val16
== (val32
>> 16))
13016 unsigned int val8
= val16
& 0xff;
13017 if (val8
== (val16
>> 8))
13021 HOST_WIDE_INT val
= trunc_int_for_mode (val64
, mode
);
13022 if (IN_RANGE (val
, -0x80, 0x7f))
13024 /* DUP with no shift. */
13026 *info
= simd_immediate_info (mode
, val
);
13029 if ((val
& 0xff) == 0 && IN_RANGE (val
, -0x8000, 0x7f00))
13031 /* DUP with LSL #8. */
13033 *info
= simd_immediate_info (mode
, val
);
13036 if (aarch64_bitmask_imm (val64
, mode
))
13040 *info
= simd_immediate_info (mode
, val
);
13046 /* Return true if OP is a valid SIMD immediate for the operation
13047 described by WHICH. If INFO is nonnull, use it to describe valid
13050 aarch64_simd_valid_immediate (rtx op
, simd_immediate_info
*info
,
13051 enum simd_immediate_check which
)
13053 machine_mode mode
= GET_MODE (op
);
13054 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
13055 if (vec_flags
== 0 || vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
13058 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
13059 rtx elt
= NULL
, base
, step
;
13060 unsigned int n_elts
;
13061 if (const_vec_duplicate_p (op
, &elt
))
13063 else if ((vec_flags
& VEC_SVE_DATA
)
13064 && const_vec_series_p (op
, &base
, &step
))
13066 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
);
13067 if (!aarch64_sve_index_immediate_p (base
)
13068 || !aarch64_sve_index_immediate_p (step
))
13072 *info
= simd_immediate_info (elt_mode
, base
, step
);
13075 else if (GET_CODE (op
) == CONST_VECTOR
13076 && CONST_VECTOR_NUNITS (op
).is_constant (&n_elts
))
13077 /* N_ELTS set above. */;
13081 /* Handle PFALSE and PTRUE. */
13082 if (vec_flags
& VEC_SVE_PRED
)
13083 return (op
== CONST0_RTX (mode
)
13084 || op
== CONSTM1_RTX (mode
));
13086 scalar_float_mode elt_float_mode
;
13088 && is_a
<scalar_float_mode
> (elt_mode
, &elt_float_mode
)
13089 && (aarch64_float_const_zero_rtx_p (elt
)
13090 || aarch64_float_const_representable_p (elt
)))
13093 *info
= simd_immediate_info (elt_float_mode
, elt
);
13097 unsigned int elt_size
= GET_MODE_SIZE (elt_mode
);
13101 scalar_int_mode elt_int_mode
= int_mode_for_mode (elt_mode
).require ();
13103 /* Expand the vector constant out into a byte vector, with the least
13104 significant byte of the register first. */
13105 auto_vec
<unsigned char, 16> bytes
;
13106 bytes
.reserve (n_elts
* elt_size
);
13107 for (unsigned int i
= 0; i
< n_elts
; i
++)
13109 if (!elt
|| n_elts
!= 1)
13110 /* The vector is provided in gcc endian-neutral fashion.
13111 For aarch64_be, it must be laid out in the vector register
13112 in reverse order. */
13113 elt
= CONST_VECTOR_ELT (op
, BYTES_BIG_ENDIAN
? (n_elts
- 1 - i
) : i
);
13115 if (elt_mode
!= elt_int_mode
)
13116 elt
= gen_lowpart (elt_int_mode
, elt
);
13118 if (!CONST_INT_P (elt
))
13121 unsigned HOST_WIDE_INT elt_val
= INTVAL (elt
);
13122 for (unsigned int byte
= 0; byte
< elt_size
; byte
++)
13124 bytes
.quick_push (elt_val
& 0xff);
13125 elt_val
>>= BITS_PER_UNIT
;
13129 /* The immediate must repeat every eight bytes. */
13130 unsigned int nbytes
= bytes
.length ();
13131 for (unsigned i
= 8; i
< nbytes
; ++i
)
13132 if (bytes
[i
] != bytes
[i
- 8])
13135 /* Get the repeating 8-byte value as an integer. No endian correction
13136 is needed here because bytes is already in lsb-first order. */
13137 unsigned HOST_WIDE_INT val64
= 0;
13138 for (unsigned int i
= 0; i
< 8; i
++)
13139 val64
|= ((unsigned HOST_WIDE_INT
) bytes
[i
% nbytes
]
13140 << (i
* BITS_PER_UNIT
));
13142 if (vec_flags
& VEC_SVE_DATA
)
13143 return aarch64_sve_valid_immediate (val64
, info
);
13145 return aarch64_advsimd_valid_immediate (val64
, info
, which
);
13148 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13149 has a step in the range of INDEX. Return the index expression if so,
13150 otherwise return null. */
13152 aarch64_check_zero_based_sve_index_immediate (rtx x
)
13155 if (const_vec_series_p (x
, &base
, &step
)
13156 && base
== const0_rtx
13157 && aarch64_sve_index_immediate_p (step
))
13162 /* Check of immediate shift constants are within range. */
13164 aarch64_simd_shift_imm_p (rtx x
, machine_mode mode
, bool left
)
13166 int bit_width
= GET_MODE_UNIT_SIZE (mode
) * BITS_PER_UNIT
;
13168 return aarch64_const_vec_all_same_in_range_p (x
, 0, bit_width
- 1);
13170 return aarch64_const_vec_all_same_in_range_p (x
, 1, bit_width
);
13173 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13174 operation of width WIDTH at bit position POS. */
13177 aarch64_mask_from_zextract_ops (rtx width
, rtx pos
)
13179 gcc_assert (CONST_INT_P (width
));
13180 gcc_assert (CONST_INT_P (pos
));
13182 unsigned HOST_WIDE_INT mask
13183 = ((unsigned HOST_WIDE_INT
) 1 << UINTVAL (width
)) - 1;
13184 return GEN_INT (mask
<< UINTVAL (pos
));
13188 aarch64_mov_operand_p (rtx x
, machine_mode mode
)
13190 if (GET_CODE (x
) == HIGH
13191 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
13194 if (CONST_INT_P (x
))
13197 if (VECTOR_MODE_P (GET_MODE (x
)))
13198 return aarch64_simd_valid_immediate (x
, NULL
);
13200 if (GET_CODE (x
) == SYMBOL_REF
&& mode
== DImode
&& CONSTANT_ADDRESS_P (x
))
13203 if (aarch64_sve_cnt_immediate_p (x
))
13206 return aarch64_classify_symbolic_expression (x
)
13207 == SYMBOL_TINY_ABSOLUTE
;
13210 /* Return a const_int vector of VAL. */
13212 aarch64_simd_gen_const_vector_dup (machine_mode mode
, HOST_WIDE_INT val
)
13214 rtx c
= gen_int_mode (val
, GET_MODE_INNER (mode
));
13215 return gen_const_vec_duplicate (mode
, c
);
13218 /* Check OP is a legal scalar immediate for the MOVI instruction. */
13221 aarch64_simd_scalar_immediate_valid_for_move (rtx op
, scalar_int_mode mode
)
13223 machine_mode vmode
;
13225 vmode
= aarch64_simd_container_mode (mode
, 64);
13226 rtx op_v
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (op
));
13227 return aarch64_simd_valid_immediate (op_v
, NULL
);
13230 /* Construct and return a PARALLEL RTX vector with elements numbering the
13231 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13232 the vector - from the perspective of the architecture. This does not
13233 line up with GCC's perspective on lane numbers, so we end up with
13234 different masks depending on our target endian-ness. The diagram
13235 below may help. We must draw the distinction when building masks
13236 which select one half of the vector. An instruction selecting
13237 architectural low-lanes for a big-endian target, must be described using
13238 a mask selecting GCC high-lanes.
13240 Big-Endian Little-Endian
13242 GCC 0 1 2 3 3 2 1 0
13243 | x | x | x | x | | x | x | x | x |
13244 Architecture 3 2 1 0 3 2 1 0
13246 Low Mask: { 2, 3 } { 0, 1 }
13247 High Mask: { 0, 1 } { 2, 3 }
13249 MODE Is the mode of the vector and NUNITS is the number of units in it. */
13252 aarch64_simd_vect_par_cnst_half (machine_mode mode
, int nunits
, bool high
)
13254 rtvec v
= rtvec_alloc (nunits
/ 2);
13255 int high_base
= nunits
/ 2;
13261 if (BYTES_BIG_ENDIAN
)
13262 base
= high
? low_base
: high_base
;
13264 base
= high
? high_base
: low_base
;
13266 for (i
= 0; i
< nunits
/ 2; i
++)
13267 RTVEC_ELT (v
, i
) = GEN_INT (base
+ i
);
13269 t1
= gen_rtx_PARALLEL (mode
, v
);
13273 /* Check OP for validity as a PARALLEL RTX vector with elements
13274 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13275 from the perspective of the architecture. See the diagram above
13276 aarch64_simd_vect_par_cnst_half for more details. */
13279 aarch64_simd_check_vect_par_cnst_half (rtx op
, machine_mode mode
,
13283 if (!VECTOR_MODE_P (mode
) || !GET_MODE_NUNITS (mode
).is_constant (&nelts
))
13286 rtx ideal
= aarch64_simd_vect_par_cnst_half (mode
, nelts
, high
);
13287 HOST_WIDE_INT count_op
= XVECLEN (op
, 0);
13288 HOST_WIDE_INT count_ideal
= XVECLEN (ideal
, 0);
13291 if (count_op
!= count_ideal
)
13294 for (i
= 0; i
< count_ideal
; i
++)
13296 rtx elt_op
= XVECEXP (op
, 0, i
);
13297 rtx elt_ideal
= XVECEXP (ideal
, 0, i
);
13299 if (!CONST_INT_P (elt_op
)
13300 || INTVAL (elt_ideal
) != INTVAL (elt_op
))
13306 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
13307 HIGH (exclusive). */
13309 aarch64_simd_lane_bounds (rtx operand
, HOST_WIDE_INT low
, HOST_WIDE_INT high
,
13312 HOST_WIDE_INT lane
;
13313 gcc_assert (CONST_INT_P (operand
));
13314 lane
= INTVAL (operand
);
13316 if (lane
< low
|| lane
>= high
)
13319 error ("%Klane %wd out of range %wd - %wd", exp
, lane
, low
, high
- 1);
13321 error ("lane %wd out of range %wd - %wd", lane
, low
, high
- 1);
13325 /* Peform endian correction on lane number N, which indexes a vector
13326 of mode MODE, and return the result as an SImode rtx. */
13329 aarch64_endian_lane_rtx (machine_mode mode
, unsigned int n
)
13331 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode
), n
), SImode
);
13334 /* Return TRUE if OP is a valid vector addressing mode. */
13337 aarch64_simd_mem_operand_p (rtx op
)
13339 return MEM_P (op
) && (GET_CODE (XEXP (op
, 0)) == POST_INC
13340 || REG_P (XEXP (op
, 0)));
13343 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
13346 aarch64_sve_ld1r_operand_p (rtx op
)
13348 struct aarch64_address_info addr
;
13352 && is_a
<scalar_mode
> (GET_MODE (op
), &mode
)
13353 && aarch64_classify_address (&addr
, XEXP (op
, 0), mode
, false)
13354 && addr
.type
== ADDRESS_REG_IMM
13355 && offset_6bit_unsigned_scaled_p (mode
, addr
.const_offset
));
13358 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13359 The conditions for STR are the same. */
13361 aarch64_sve_ldr_operand_p (rtx op
)
13363 struct aarch64_address_info addr
;
13366 && aarch64_classify_address (&addr
, XEXP (op
, 0), GET_MODE (op
),
13367 false, ADDR_QUERY_ANY
)
13368 && addr
.type
== ADDRESS_REG_IMM
);
13371 /* Emit a register copy from operand to operand, taking care not to
13372 early-clobber source registers in the process.
13374 COUNT is the number of components into which the copy needs to be
13377 aarch64_simd_emit_reg_reg_move (rtx
*operands
, machine_mode mode
,
13378 unsigned int count
)
13381 int rdest
= REGNO (operands
[0]);
13382 int rsrc
= REGNO (operands
[1]);
13384 if (!reg_overlap_mentioned_p (operands
[0], operands
[1])
13386 for (i
= 0; i
< count
; i
++)
13387 emit_move_insn (gen_rtx_REG (mode
, rdest
+ i
),
13388 gen_rtx_REG (mode
, rsrc
+ i
));
13390 for (i
= 0; i
< count
; i
++)
13391 emit_move_insn (gen_rtx_REG (mode
, rdest
+ count
- i
- 1),
13392 gen_rtx_REG (mode
, rsrc
+ count
- i
- 1));
13395 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13396 one of VSTRUCT modes: OI, CI, or XI. */
13398 aarch64_simd_attr_length_rglist (machine_mode mode
)
13400 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
13401 return (GET_MODE_SIZE (mode
).to_constant () / UNITS_PER_VREG
) * 4;
13404 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
13405 alignment of a vector to 128 bits. SVE predicates have an alignment of
13407 static HOST_WIDE_INT
13408 aarch64_simd_vector_alignment (const_tree type
)
13410 if (TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
13411 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13412 be set for non-predicate vectors of booleans. Modes are the most
13413 direct way we have of identifying real SVE predicate types. */
13414 return GET_MODE_CLASS (TYPE_MODE (type
)) == MODE_VECTOR_BOOL
? 16 : 128;
13415 HOST_WIDE_INT align
= tree_to_shwi (TYPE_SIZE (type
));
13416 return MIN (align
, 128);
13419 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
13420 static HOST_WIDE_INT
13421 aarch64_vectorize_preferred_vector_alignment (const_tree type
)
13423 if (aarch64_sve_data_mode_p (TYPE_MODE (type
)))
13425 /* If the length of the vector is fixed, try to align to that length,
13426 otherwise don't try to align at all. */
13427 HOST_WIDE_INT result
;
13428 if (!BITS_PER_SVE_VECTOR
.is_constant (&result
))
13429 result
= TYPE_ALIGN (TREE_TYPE (type
));
13432 return TYPE_ALIGN (type
);
13435 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13437 aarch64_simd_vector_alignment_reachable (const_tree type
, bool is_packed
)
13442 /* For fixed-length vectors, check that the vectorizer will aim for
13443 full-vector alignment. This isn't true for generic GCC vectors
13444 that are wider than the ABI maximum of 128 bits. */
13445 if (TREE_CODE (TYPE_SIZE (type
)) == INTEGER_CST
13446 && (wi::to_widest (TYPE_SIZE (type
))
13447 != aarch64_vectorize_preferred_vector_alignment (type
)))
13450 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13454 /* Return true if the vector misalignment factor is supported by the
13457 aarch64_builtin_support_vector_misalignment (machine_mode mode
,
13458 const_tree type
, int misalignment
,
13461 if (TARGET_SIMD
&& STRICT_ALIGNMENT
)
13463 /* Return if movmisalign pattern is not supported for this mode. */
13464 if (optab_handler (movmisalign_optab
, mode
) == CODE_FOR_nothing
)
13467 /* Misalignment factor is unknown at compile time. */
13468 if (misalignment
== -1)
13471 return default_builtin_support_vector_misalignment (mode
, type
, misalignment
,
13475 /* If VALS is a vector constant that can be loaded into a register
13476 using DUP, generate instructions to do so and return an RTX to
13477 assign to the register. Otherwise return NULL_RTX. */
13479 aarch64_simd_dup_constant (rtx vals
)
13481 machine_mode mode
= GET_MODE (vals
);
13482 machine_mode inner_mode
= GET_MODE_INNER (mode
);
13485 if (!const_vec_duplicate_p (vals
, &x
))
13488 /* We can load this constant by using DUP and a constant in a
13489 single ARM register. This will be cheaper than a vector
13491 x
= copy_to_mode_reg (inner_mode
, x
);
13492 return gen_vec_duplicate (mode
, x
);
13496 /* Generate code to load VALS, which is a PARALLEL containing only
13497 constants (for vec_init) or CONST_VECTOR, efficiently into a
13498 register. Returns an RTX to copy into the register, or NULL_RTX
13499 for a PARALLEL that can not be converted into a CONST_VECTOR. */
13501 aarch64_simd_make_constant (rtx vals
)
13503 machine_mode mode
= GET_MODE (vals
);
13505 rtx const_vec
= NULL_RTX
;
13509 if (GET_CODE (vals
) == CONST_VECTOR
)
13511 else if (GET_CODE (vals
) == PARALLEL
)
13513 /* A CONST_VECTOR must contain only CONST_INTs and
13514 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13515 Only store valid constants in a CONST_VECTOR. */
13516 int n_elts
= XVECLEN (vals
, 0);
13517 for (i
= 0; i
< n_elts
; ++i
)
13519 rtx x
= XVECEXP (vals
, 0, i
);
13520 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
13523 if (n_const
== n_elts
)
13524 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0));
13527 gcc_unreachable ();
13529 if (const_vec
!= NULL_RTX
13530 && aarch64_simd_valid_immediate (const_vec
, NULL
))
13531 /* Load using MOVI/MVNI. */
13533 else if ((const_dup
= aarch64_simd_dup_constant (vals
)) != NULL_RTX
)
13534 /* Loaded using DUP. */
13536 else if (const_vec
!= NULL_RTX
)
13537 /* Load from constant pool. We can not take advantage of single-cycle
13538 LD1 because we need a PC-relative addressing mode. */
13541 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13542 We can not construct an initializer. */
13546 /* Expand a vector initialisation sequence, such that TARGET is
13547 initialised to contain VALS. */
13550 aarch64_expand_vector_init (rtx target
, rtx vals
)
13552 machine_mode mode
= GET_MODE (target
);
13553 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
13554 /* The number of vector elements. */
13555 int n_elts
= XVECLEN (vals
, 0);
13556 /* The number of vector elements which are not constant. */
13558 rtx any_const
= NULL_RTX
;
13559 /* The first element of vals. */
13560 rtx v0
= XVECEXP (vals
, 0, 0);
13561 bool all_same
= true;
13563 /* Count the number of variable elements to initialise. */
13564 for (int i
= 0; i
< n_elts
; ++i
)
13566 rtx x
= XVECEXP (vals
, 0, i
);
13567 if (!(CONST_INT_P (x
) || CONST_DOUBLE_P (x
)))
13572 all_same
&= rtx_equal_p (x
, v0
);
13575 /* No variable elements, hand off to aarch64_simd_make_constant which knows
13576 how best to handle this. */
13579 rtx constant
= aarch64_simd_make_constant (vals
);
13580 if (constant
!= NULL_RTX
)
13582 emit_move_insn (target
, constant
);
13587 /* Splat a single non-constant element if we can. */
13590 rtx x
= copy_to_mode_reg (inner_mode
, v0
);
13591 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
13595 enum insn_code icode
= optab_handler (vec_set_optab
, mode
);
13596 gcc_assert (icode
!= CODE_FOR_nothing
);
13598 /* If there are only variable elements, try to optimize
13599 the insertion using dup for the most common element
13600 followed by insertions. */
13602 /* The algorithm will fill matches[*][0] with the earliest matching element,
13603 and matches[X][1] with the count of duplicate elements (if X is the
13604 earliest element which has duplicates). */
13606 if (n_var
== n_elts
&& n_elts
<= 16)
13608 int matches
[16][2] = {0};
13609 for (int i
= 0; i
< n_elts
; i
++)
13611 for (int j
= 0; j
<= i
; j
++)
13613 if (rtx_equal_p (XVECEXP (vals
, 0, i
), XVECEXP (vals
, 0, j
)))
13621 int maxelement
= 0;
13623 for (int i
= 0; i
< n_elts
; i
++)
13624 if (matches
[i
][1] > maxv
)
13627 maxv
= matches
[i
][1];
13630 /* Create a duplicate of the most common element. */
13631 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
13632 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
13634 /* Insert the rest. */
13635 for (int i
= 0; i
< n_elts
; i
++)
13637 rtx x
= XVECEXP (vals
, 0, i
);
13638 if (matches
[i
][0] == maxelement
)
13640 x
= copy_to_mode_reg (inner_mode
, x
);
13641 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
13646 /* Initialise a vector which is part-variable. We want to first try
13647 to build those lanes which are constant in the most efficient way we
13649 if (n_var
!= n_elts
)
13651 rtx copy
= copy_rtx (vals
);
13653 /* Load constant part of vector. We really don't care what goes into the
13654 parts we will overwrite, but we're more likely to be able to load the
13655 constant efficiently if it has fewer, larger, repeating parts
13656 (see aarch64_simd_valid_immediate). */
13657 for (int i
= 0; i
< n_elts
; i
++)
13659 rtx x
= XVECEXP (vals
, 0, i
);
13660 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
13662 rtx subst
= any_const
;
13663 for (int bit
= n_elts
/ 2; bit
> 0; bit
/= 2)
13665 /* Look in the copied vector, as more elements are const. */
13666 rtx test
= XVECEXP (copy
, 0, i
^ bit
);
13667 if (CONST_INT_P (test
) || CONST_DOUBLE_P (test
))
13673 XVECEXP (copy
, 0, i
) = subst
;
13675 aarch64_expand_vector_init (target
, copy
);
13678 /* Insert the variable lanes directly. */
13679 for (int i
= 0; i
< n_elts
; i
++)
13681 rtx x
= XVECEXP (vals
, 0, i
);
13682 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
13684 x
= copy_to_mode_reg (inner_mode
, x
);
13685 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
13689 static unsigned HOST_WIDE_INT
13690 aarch64_shift_truncation_mask (machine_mode mode
)
13692 if (!SHIFT_COUNT_TRUNCATED
|| aarch64_vector_data_mode_p (mode
))
13694 return GET_MODE_UNIT_BITSIZE (mode
) - 1;
13697 /* Select a format to encode pointers in exception handling data. */
13699 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED
, int global
)
13702 switch (aarch64_cmodel
)
13704 case AARCH64_CMODEL_TINY
:
13705 case AARCH64_CMODEL_TINY_PIC
:
13706 case AARCH64_CMODEL_SMALL
:
13707 case AARCH64_CMODEL_SMALL_PIC
:
13708 case AARCH64_CMODEL_SMALL_SPIC
:
13709 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
13711 type
= DW_EH_PE_sdata4
;
13714 /* No assumptions here. 8-byte relocs required. */
13715 type
= DW_EH_PE_sdata8
;
13718 return (global
? DW_EH_PE_indirect
: 0) | DW_EH_PE_pcrel
| type
;
13721 /* The last .arch and .tune assembly strings that we printed. */
13722 static std::string aarch64_last_printed_arch_string
;
13723 static std::string aarch64_last_printed_tune_string
;
13725 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
13726 by the function fndecl. */
13729 aarch64_declare_function_name (FILE *stream
, const char* name
,
13732 tree target_parts
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
13734 struct cl_target_option
*targ_options
;
13736 targ_options
= TREE_TARGET_OPTION (target_parts
);
13738 targ_options
= TREE_TARGET_OPTION (target_option_current_node
);
13739 gcc_assert (targ_options
);
13741 const struct processor
*this_arch
13742 = aarch64_get_arch (targ_options
->x_explicit_arch
);
13744 unsigned long isa_flags
= targ_options
->x_aarch64_isa_flags
;
13745 std::string extension
13746 = aarch64_get_extension_string_for_isa_flags (isa_flags
,
13748 /* Only update the assembler .arch string if it is distinct from the last
13749 such string we printed. */
13750 std::string to_print
= this_arch
->name
+ extension
;
13751 if (to_print
!= aarch64_last_printed_arch_string
)
13753 asm_fprintf (asm_out_file
, "\t.arch %s\n", to_print
.c_str ());
13754 aarch64_last_printed_arch_string
= to_print
;
13757 /* Print the cpu name we're tuning for in the comments, might be
13758 useful to readers of the generated asm. Do it only when it changes
13759 from function to function and verbose assembly is requested. */
13760 const struct processor
*this_tune
13761 = aarch64_get_tune_cpu (targ_options
->x_explicit_tune_core
);
13763 if (flag_debug_asm
&& aarch64_last_printed_tune_string
!= this_tune
->name
)
13765 asm_fprintf (asm_out_file
, "\t" ASM_COMMENT_START
".tune %s\n",
13767 aarch64_last_printed_tune_string
= this_tune
->name
;
13770 /* Don't forget the type directive for ELF. */
13771 ASM_OUTPUT_TYPE_DIRECTIVE (stream
, name
, "function");
13772 ASM_OUTPUT_LABEL (stream
, name
);
13775 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
13778 aarch64_start_file (void)
13780 struct cl_target_option
*default_options
13781 = TREE_TARGET_OPTION (target_option_default_node
);
13783 const struct processor
*default_arch
13784 = aarch64_get_arch (default_options
->x_explicit_arch
);
13785 unsigned long default_isa_flags
= default_options
->x_aarch64_isa_flags
;
13786 std::string extension
13787 = aarch64_get_extension_string_for_isa_flags (default_isa_flags
,
13788 default_arch
->flags
);
13790 aarch64_last_printed_arch_string
= default_arch
->name
+ extension
;
13791 aarch64_last_printed_tune_string
= "";
13792 asm_fprintf (asm_out_file
, "\t.arch %s\n",
13793 aarch64_last_printed_arch_string
.c_str ());
13795 default_file_start ();
13798 /* Emit load exclusive. */
13801 aarch64_emit_load_exclusive (machine_mode mode
, rtx rval
,
13802 rtx mem
, rtx model_rtx
)
13804 rtx (*gen
) (rtx
, rtx
, rtx
);
13808 case E_QImode
: gen
= gen_aarch64_load_exclusiveqi
; break;
13809 case E_HImode
: gen
= gen_aarch64_load_exclusivehi
; break;
13810 case E_SImode
: gen
= gen_aarch64_load_exclusivesi
; break;
13811 case E_DImode
: gen
= gen_aarch64_load_exclusivedi
; break;
13813 gcc_unreachable ();
13816 emit_insn (gen (rval
, mem
, model_rtx
));
13819 /* Emit store exclusive. */
13822 aarch64_emit_store_exclusive (machine_mode mode
, rtx bval
,
13823 rtx rval
, rtx mem
, rtx model_rtx
)
13825 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
13829 case E_QImode
: gen
= gen_aarch64_store_exclusiveqi
; break;
13830 case E_HImode
: gen
= gen_aarch64_store_exclusivehi
; break;
13831 case E_SImode
: gen
= gen_aarch64_store_exclusivesi
; break;
13832 case E_DImode
: gen
= gen_aarch64_store_exclusivedi
; break;
13834 gcc_unreachable ();
13837 emit_insn (gen (bval
, rval
, mem
, model_rtx
));
13840 /* Mark the previous jump instruction as unlikely. */
13843 aarch64_emit_unlikely_jump (rtx insn
)
13845 rtx_insn
*jump
= emit_jump_insn (insn
);
13846 add_reg_br_prob_note (jump
, profile_probability::very_unlikely ());
13849 /* Expand a compare and swap pattern. */
13852 aarch64_expand_compare_and_swap (rtx operands
[])
13854 rtx bval
, rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
, x
;
13855 machine_mode mode
, cmp_mode
;
13856 typedef rtx (*gen_cas_fn
) (rtx
, rtx
, rtx
, rtx
, rtx
, rtx
, rtx
);
13859 const gen_cas_fn split_cas
[] =
13861 gen_aarch64_compare_and_swapqi
,
13862 gen_aarch64_compare_and_swaphi
,
13863 gen_aarch64_compare_and_swapsi
,
13864 gen_aarch64_compare_and_swapdi
13866 const gen_cas_fn atomic_cas
[] =
13868 gen_aarch64_compare_and_swapqi_lse
,
13869 gen_aarch64_compare_and_swaphi_lse
,
13870 gen_aarch64_compare_and_swapsi_lse
,
13871 gen_aarch64_compare_and_swapdi_lse
13874 bval
= operands
[0];
13875 rval
= operands
[1];
13877 oldval
= operands
[3];
13878 newval
= operands
[4];
13879 is_weak
= operands
[5];
13880 mod_s
= operands
[6];
13881 mod_f
= operands
[7];
13882 mode
= GET_MODE (mem
);
13885 /* Normally the succ memory model must be stronger than fail, but in the
13886 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
13887 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
13889 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f
)))
13890 && is_mm_release (memmodel_from_int (INTVAL (mod_s
))))
13891 mod_s
= GEN_INT (MEMMODEL_ACQ_REL
);
13897 /* For short modes, we're going to perform the comparison in SImode,
13898 so do the zero-extension now. */
13900 rval
= gen_reg_rtx (SImode
);
13901 oldval
= convert_modes (SImode
, mode
, oldval
, true);
13902 /* Fall through. */
13906 /* Force the value into a register if needed. */
13907 if (!aarch64_plus_operand (oldval
, mode
))
13908 oldval
= force_reg (cmp_mode
, oldval
);
13912 gcc_unreachable ();
13917 case E_QImode
: idx
= 0; break;
13918 case E_HImode
: idx
= 1; break;
13919 case E_SImode
: idx
= 2; break;
13920 case E_DImode
: idx
= 3; break;
13922 gcc_unreachable ();
13925 gen
= atomic_cas
[idx
];
13927 gen
= split_cas
[idx
];
13929 emit_insn (gen (rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
));
13931 if (mode
== QImode
|| mode
== HImode
)
13932 emit_move_insn (operands
[1], gen_lowpart (mode
, rval
));
13934 x
= gen_rtx_REG (CCmode
, CC_REGNUM
);
13935 x
= gen_rtx_EQ (SImode
, x
, const0_rtx
);
13936 emit_insn (gen_rtx_SET (bval
, x
));
13939 /* Test whether the target supports using a atomic load-operate instruction.
13940 CODE is the operation and AFTER is TRUE if the data in memory after the
13941 operation should be returned and FALSE if the data before the operation
13942 should be returned. Returns FALSE if the operation isn't supported by the
13946 aarch64_atomic_ldop_supported_p (enum rtx_code code
)
13965 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
13966 sequence implementing an atomic operation. */
13969 aarch64_emit_post_barrier (enum memmodel model
)
13971 const enum memmodel base_model
= memmodel_base (model
);
13973 if (is_mm_sync (model
)
13974 && (base_model
== MEMMODEL_ACQUIRE
13975 || base_model
== MEMMODEL_ACQ_REL
13976 || base_model
== MEMMODEL_SEQ_CST
))
13978 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST
)));
13982 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
13983 for the data in memory. EXPECTED is the value expected to be in memory.
13984 DESIRED is the value to store to memory. MEM is the memory location. MODEL
13985 is the memory ordering to use. */
13988 aarch64_gen_atomic_cas (rtx rval
, rtx mem
,
13989 rtx expected
, rtx desired
,
13992 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
13995 mode
= GET_MODE (mem
);
13999 case E_QImode
: gen
= gen_aarch64_atomic_casqi
; break;
14000 case E_HImode
: gen
= gen_aarch64_atomic_cashi
; break;
14001 case E_SImode
: gen
= gen_aarch64_atomic_cassi
; break;
14002 case E_DImode
: gen
= gen_aarch64_atomic_casdi
; break;
14004 gcc_unreachable ();
14007 /* Move the expected value into the CAS destination register. */
14008 emit_insn (gen_rtx_SET (rval
, expected
));
14010 /* Emit the CAS. */
14011 emit_insn (gen (rval
, mem
, desired
, model
));
14013 /* Compare the expected value with the value loaded by the CAS, to establish
14014 whether the swap was made. */
14015 aarch64_gen_compare_reg (EQ
, rval
, expected
);
14018 /* Split a compare and swap pattern. */
14021 aarch64_split_compare_and_swap (rtx operands
[])
14023 rtx rval
, mem
, oldval
, newval
, scratch
;
14026 rtx_code_label
*label1
, *label2
;
14028 enum memmodel model
;
14031 rval
= operands
[0];
14033 oldval
= operands
[2];
14034 newval
= operands
[3];
14035 is_weak
= (operands
[4] != const0_rtx
);
14036 model_rtx
= operands
[5];
14037 scratch
= operands
[7];
14038 mode
= GET_MODE (mem
);
14039 model
= memmodel_from_int (INTVAL (model_rtx
));
14041 /* When OLDVAL is zero and we want the strong version we can emit a tighter
14044 LD[A]XR rval, [mem]
14046 ST[L]XR scratch, newval, [mem]
14047 CBNZ scratch, .label1
14050 bool strong_zero_p
= !is_weak
&& oldval
== const0_rtx
;
14055 label1
= gen_label_rtx ();
14056 emit_label (label1
);
14058 label2
= gen_label_rtx ();
14060 /* The initial load can be relaxed for a __sync operation since a final
14061 barrier will be emitted to stop code hoisting. */
14062 if (is_mm_sync (model
))
14063 aarch64_emit_load_exclusive (mode
, rval
, mem
,
14064 GEN_INT (MEMMODEL_RELAXED
));
14066 aarch64_emit_load_exclusive (mode
, rval
, mem
, model_rtx
);
14070 x
= gen_rtx_NE (VOIDmode
, rval
, const0_rtx
);
14071 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
14072 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
14073 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
14077 cond
= aarch64_gen_compare_reg (NE
, rval
, oldval
);
14078 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
14079 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
14080 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
14081 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
14084 aarch64_emit_store_exclusive (mode
, scratch
, mem
, newval
, model_rtx
);
14088 x
= gen_rtx_NE (VOIDmode
, scratch
, const0_rtx
);
14089 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
14090 gen_rtx_LABEL_REF (Pmode
, label1
), pc_rtx
);
14091 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
14095 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
14096 x
= gen_rtx_COMPARE (CCmode
, scratch
, const0_rtx
);
14097 emit_insn (gen_rtx_SET (cond
, x
));
14100 emit_label (label2
);
14101 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
14102 to set the condition flags. If this is not used it will be removed by
14106 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
14107 x
= gen_rtx_COMPARE (CCmode
, rval
, const0_rtx
);
14108 emit_insn (gen_rtx_SET (cond
, x
));
14110 /* Emit any final barrier needed for a __sync operation. */
14111 if (is_mm_sync (model
))
14112 aarch64_emit_post_barrier (model
);
14115 /* Emit a BIC instruction. */
14118 aarch64_emit_bic (machine_mode mode
, rtx dst
, rtx s1
, rtx s2
, int shift
)
14120 rtx shift_rtx
= GEN_INT (shift
);
14121 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
14125 case E_SImode
: gen
= gen_and_one_cmpl_lshrsi3
; break;
14126 case E_DImode
: gen
= gen_and_one_cmpl_lshrdi3
; break;
14128 gcc_unreachable ();
14131 emit_insn (gen (dst
, s2
, shift_rtx
, s1
));
14134 /* Emit an atomic swap. */
14137 aarch64_emit_atomic_swap (machine_mode mode
, rtx dst
, rtx value
,
14138 rtx mem
, rtx model
)
14140 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
14144 case E_QImode
: gen
= gen_aarch64_atomic_swpqi
; break;
14145 case E_HImode
: gen
= gen_aarch64_atomic_swphi
; break;
14146 case E_SImode
: gen
= gen_aarch64_atomic_swpsi
; break;
14147 case E_DImode
: gen
= gen_aarch64_atomic_swpdi
; break;
14149 gcc_unreachable ();
14152 emit_insn (gen (dst
, mem
, value
, model
));
14155 /* Operations supported by aarch64_emit_atomic_load_op. */
14157 enum aarch64_atomic_load_op_code
14159 AARCH64_LDOP_PLUS
, /* A + B */
14160 AARCH64_LDOP_XOR
, /* A ^ B */
14161 AARCH64_LDOP_OR
, /* A | B */
14162 AARCH64_LDOP_BIC
/* A & ~B */
14165 /* Emit an atomic load-operate. */
14168 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code
,
14169 machine_mode mode
, rtx dst
, rtx src
,
14170 rtx mem
, rtx model
)
14172 typedef rtx (*aarch64_atomic_load_op_fn
) (rtx
, rtx
, rtx
, rtx
);
14173 const aarch64_atomic_load_op_fn plus
[] =
14175 gen_aarch64_atomic_loadaddqi
,
14176 gen_aarch64_atomic_loadaddhi
,
14177 gen_aarch64_atomic_loadaddsi
,
14178 gen_aarch64_atomic_loadadddi
14180 const aarch64_atomic_load_op_fn eor
[] =
14182 gen_aarch64_atomic_loadeorqi
,
14183 gen_aarch64_atomic_loadeorhi
,
14184 gen_aarch64_atomic_loadeorsi
,
14185 gen_aarch64_atomic_loadeordi
14187 const aarch64_atomic_load_op_fn ior
[] =
14189 gen_aarch64_atomic_loadsetqi
,
14190 gen_aarch64_atomic_loadsethi
,
14191 gen_aarch64_atomic_loadsetsi
,
14192 gen_aarch64_atomic_loadsetdi
14194 const aarch64_atomic_load_op_fn bic
[] =
14196 gen_aarch64_atomic_loadclrqi
,
14197 gen_aarch64_atomic_loadclrhi
,
14198 gen_aarch64_atomic_loadclrsi
,
14199 gen_aarch64_atomic_loadclrdi
14201 aarch64_atomic_load_op_fn gen
;
14206 case E_QImode
: idx
= 0; break;
14207 case E_HImode
: idx
= 1; break;
14208 case E_SImode
: idx
= 2; break;
14209 case E_DImode
: idx
= 3; break;
14211 gcc_unreachable ();
14216 case AARCH64_LDOP_PLUS
: gen
= plus
[idx
]; break;
14217 case AARCH64_LDOP_XOR
: gen
= eor
[idx
]; break;
14218 case AARCH64_LDOP_OR
: gen
= ior
[idx
]; break;
14219 case AARCH64_LDOP_BIC
: gen
= bic
[idx
]; break;
14221 gcc_unreachable ();
14224 emit_insn (gen (dst
, mem
, src
, model
));
14227 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
14228 location to store the data read from memory. OUT_RESULT is the location to
14229 store the result of the operation. MEM is the memory location to read and
14230 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
14231 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
14235 aarch64_gen_atomic_ldop (enum rtx_code code
, rtx out_data
, rtx out_result
,
14236 rtx mem
, rtx value
, rtx model_rtx
)
14238 machine_mode mode
= GET_MODE (mem
);
14239 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
14240 const bool short_mode
= (mode
< SImode
);
14241 aarch64_atomic_load_op_code ldop_code
;
14246 out_data
= gen_lowpart (mode
, out_data
);
14249 out_result
= gen_lowpart (mode
, out_result
);
14251 /* Make sure the value is in a register, putting it into a destination
14252 register if it needs to be manipulated. */
14253 if (!register_operand (value
, mode
)
14254 || code
== AND
|| code
== MINUS
)
14256 src
= out_result
? out_result
: out_data
;
14257 emit_move_insn (src
, gen_lowpart (mode
, value
));
14261 gcc_assert (register_operand (src
, mode
));
14263 /* Preprocess the data for the operation as necessary. If the operation is
14264 a SET then emit a swap instruction and finish. */
14268 aarch64_emit_atomic_swap (mode
, out_data
, src
, mem
, model_rtx
);
14272 /* Negate the value and treat it as a PLUS. */
14276 /* Resize the value if necessary. */
14278 src
= gen_lowpart (wmode
, src
);
14280 neg_src
= gen_rtx_NEG (wmode
, src
);
14281 emit_insn (gen_rtx_SET (src
, neg_src
));
14284 src
= gen_lowpart (mode
, src
);
14286 /* Fall-through. */
14288 ldop_code
= AARCH64_LDOP_PLUS
;
14292 ldop_code
= AARCH64_LDOP_OR
;
14296 ldop_code
= AARCH64_LDOP_XOR
;
14303 /* Resize the value if necessary. */
14305 src
= gen_lowpart (wmode
, src
);
14307 not_src
= gen_rtx_NOT (wmode
, src
);
14308 emit_insn (gen_rtx_SET (src
, not_src
));
14311 src
= gen_lowpart (mode
, src
);
14313 ldop_code
= AARCH64_LDOP_BIC
;
14317 /* The operation can't be done with atomic instructions. */
14318 gcc_unreachable ();
14321 aarch64_emit_atomic_load_op (ldop_code
, mode
, out_data
, src
, mem
, model_rtx
);
14323 /* If necessary, calculate the data in memory after the update by redoing the
14324 operation from values in registers. */
14330 src
= gen_lowpart (wmode
, src
);
14331 out_data
= gen_lowpart (wmode
, out_data
);
14332 out_result
= gen_lowpart (wmode
, out_result
);
14341 x
= gen_rtx_PLUS (wmode
, out_data
, src
);
14344 x
= gen_rtx_IOR (wmode
, out_data
, src
);
14347 x
= gen_rtx_XOR (wmode
, out_data
, src
);
14350 aarch64_emit_bic (wmode
, out_result
, out_data
, src
, 0);
14353 gcc_unreachable ();
14356 emit_set_insn (out_result
, x
);
14361 /* Split an atomic operation. */
14364 aarch64_split_atomic_op (enum rtx_code code
, rtx old_out
, rtx new_out
, rtx mem
,
14365 rtx value
, rtx model_rtx
, rtx cond
)
14367 machine_mode mode
= GET_MODE (mem
);
14368 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
14369 const enum memmodel model
= memmodel_from_int (INTVAL (model_rtx
));
14370 const bool is_sync
= is_mm_sync (model
);
14371 rtx_code_label
*label
;
14374 /* Split the atomic operation into a sequence. */
14375 label
= gen_label_rtx ();
14376 emit_label (label
);
14379 new_out
= gen_lowpart (wmode
, new_out
);
14381 old_out
= gen_lowpart (wmode
, old_out
);
14384 value
= simplify_gen_subreg (wmode
, value
, mode
, 0);
14386 /* The initial load can be relaxed for a __sync operation since a final
14387 barrier will be emitted to stop code hoisting. */
14389 aarch64_emit_load_exclusive (mode
, old_out
, mem
,
14390 GEN_INT (MEMMODEL_RELAXED
));
14392 aarch64_emit_load_exclusive (mode
, old_out
, mem
, model_rtx
);
14401 x
= gen_rtx_AND (wmode
, old_out
, value
);
14402 emit_insn (gen_rtx_SET (new_out
, x
));
14403 x
= gen_rtx_NOT (wmode
, new_out
);
14404 emit_insn (gen_rtx_SET (new_out
, x
));
14408 if (CONST_INT_P (value
))
14410 value
= GEN_INT (-INTVAL (value
));
14413 /* Fall through. */
14416 x
= gen_rtx_fmt_ee (code
, wmode
, old_out
, value
);
14417 emit_insn (gen_rtx_SET (new_out
, x
));
14421 aarch64_emit_store_exclusive (mode
, cond
, mem
,
14422 gen_lowpart (mode
, new_out
), model_rtx
);
14424 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
14425 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
14426 gen_rtx_LABEL_REF (Pmode
, label
), pc_rtx
);
14427 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
14429 /* Emit any final barrier needed for a __sync operation. */
14431 aarch64_emit_post_barrier (model
);
14435 aarch64_init_libfuncs (void)
14437 /* Half-precision float operations. The compiler handles all operations
14438 with NULL libfuncs by converting to SFmode. */
14441 set_conv_libfunc (trunc_optab
, HFmode
, SFmode
, "__gnu_f2h_ieee");
14442 set_conv_libfunc (sext_optab
, SFmode
, HFmode
, "__gnu_h2f_ieee");
14445 set_optab_libfunc (add_optab
, HFmode
, NULL
);
14446 set_optab_libfunc (sdiv_optab
, HFmode
, NULL
);
14447 set_optab_libfunc (smul_optab
, HFmode
, NULL
);
14448 set_optab_libfunc (neg_optab
, HFmode
, NULL
);
14449 set_optab_libfunc (sub_optab
, HFmode
, NULL
);
14452 set_optab_libfunc (eq_optab
, HFmode
, NULL
);
14453 set_optab_libfunc (ne_optab
, HFmode
, NULL
);
14454 set_optab_libfunc (lt_optab
, HFmode
, NULL
);
14455 set_optab_libfunc (le_optab
, HFmode
, NULL
);
14456 set_optab_libfunc (ge_optab
, HFmode
, NULL
);
14457 set_optab_libfunc (gt_optab
, HFmode
, NULL
);
14458 set_optab_libfunc (unord_optab
, HFmode
, NULL
);
14461 /* Target hook for c_mode_for_suffix. */
14462 static machine_mode
14463 aarch64_c_mode_for_suffix (char suffix
)
14471 /* We can only represent floating point constants which will fit in
14472 "quarter-precision" values. These values are characterised by
14473 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
14476 (-1)^s * (n/16) * 2^r
14479 's' is the sign bit.
14480 'n' is an integer in the range 16 <= n <= 31.
14481 'r' is an integer in the range -3 <= r <= 4. */
14483 /* Return true iff X can be represented by a quarter-precision
14484 floating point immediate operand X. Note, we cannot represent 0.0. */
14486 aarch64_float_const_representable_p (rtx x
)
14488 /* This represents our current view of how many bits
14489 make up the mantissa. */
14490 int point_pos
= 2 * HOST_BITS_PER_WIDE_INT
- 1;
14492 unsigned HOST_WIDE_INT mantissa
, mask
;
14493 REAL_VALUE_TYPE r
, m
;
14496 if (!CONST_DOUBLE_P (x
))
14499 /* We don't support HFmode constants yet. */
14500 if (GET_MODE (x
) == VOIDmode
|| GET_MODE (x
) == HFmode
)
14503 r
= *CONST_DOUBLE_REAL_VALUE (x
);
14505 /* We cannot represent infinities, NaNs or +/-zero. We won't
14506 know if we have +zero until we analyse the mantissa, but we
14507 can reject the other invalid values. */
14508 if (REAL_VALUE_ISINF (r
) || REAL_VALUE_ISNAN (r
)
14509 || REAL_VALUE_MINUS_ZERO (r
))
14512 /* Extract exponent. */
14513 r
= real_value_abs (&r
);
14514 exponent
= REAL_EXP (&r
);
14516 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
14517 highest (sign) bit, with a fixed binary point at bit point_pos.
14518 m1 holds the low part of the mantissa, m2 the high part.
14519 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14520 bits for the mantissa, this can fail (low bits will be lost). */
14521 real_ldexp (&m
, &r
, point_pos
- exponent
);
14522 wide_int w
= real_to_integer (&m
, &fail
, HOST_BITS_PER_WIDE_INT
* 2);
14524 /* If the low part of the mantissa has bits set we cannot represent
14526 if (w
.ulow () != 0)
14528 /* We have rejected the lower HOST_WIDE_INT, so update our
14529 understanding of how many bits lie in the mantissa and
14530 look only at the high HOST_WIDE_INT. */
14531 mantissa
= w
.elt (1);
14532 point_pos
-= HOST_BITS_PER_WIDE_INT
;
14534 /* We can only represent values with a mantissa of the form 1.xxxx. */
14535 mask
= ((unsigned HOST_WIDE_INT
)1 << (point_pos
- 5)) - 1;
14536 if ((mantissa
& mask
) != 0)
14539 /* Having filtered unrepresentable values, we may now remove all
14540 but the highest 5 bits. */
14541 mantissa
>>= point_pos
- 5;
14543 /* We cannot represent the value 0.0, so reject it. This is handled
14548 /* Then, as bit 4 is always set, we can mask it off, leaving
14549 the mantissa in the range [0, 15]. */
14550 mantissa
&= ~(1 << 4);
14551 gcc_assert (mantissa
<= 15);
14553 /* GCC internally does not use IEEE754-like encoding (where normalized
14554 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
14555 Our mantissa values are shifted 4 places to the left relative to
14556 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14557 by 5 places to correct for GCC's representation. */
14558 exponent
= 5 - exponent
;
14560 return (exponent
>= 0 && exponent
<= 7);
14563 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14564 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
14565 output MOVI/MVNI, ORR or BIC immediate. */
14567 aarch64_output_simd_mov_immediate (rtx const_vector
, unsigned width
,
14568 enum simd_immediate_check which
)
14571 static char templ
[40];
14572 const char *mnemonic
;
14573 const char *shift_op
;
14574 unsigned int lane_count
= 0;
14577 struct simd_immediate_info info
;
14579 /* This will return true to show const_vector is legal for use as either
14580 a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14581 It will also update INFO to show how the immediate should be generated.
14582 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
14583 is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
, which
);
14584 gcc_assert (is_valid
);
14586 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
14587 lane_count
= width
/ GET_MODE_BITSIZE (info
.elt_mode
);
14589 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
14591 gcc_assert (info
.shift
== 0 && info
.insn
== simd_immediate_info::MOV
);
14592 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
14593 move immediate path. */
14594 if (aarch64_float_const_zero_rtx_p (info
.value
))
14595 info
.value
= GEN_INT (0);
14598 const unsigned int buf_size
= 20;
14599 char float_buf
[buf_size
] = {'\0'};
14600 real_to_decimal_for_mode (float_buf
,
14601 CONST_DOUBLE_REAL_VALUE (info
.value
),
14602 buf_size
, buf_size
, 1, info
.elt_mode
);
14604 if (lane_count
== 1)
14605 snprintf (templ
, sizeof (templ
), "fmov\t%%d0, %s", float_buf
);
14607 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%d%c, %s",
14608 lane_count
, element_char
, float_buf
);
14613 gcc_assert (CONST_INT_P (info
.value
));
14615 if (which
== AARCH64_CHECK_MOV
)
14617 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "mvni" : "movi";
14618 shift_op
= info
.modifier
== simd_immediate_info::MSL
? "msl" : "lsl";
14619 if (lane_count
== 1)
14620 snprintf (templ
, sizeof (templ
), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX
,
14621 mnemonic
, UINTVAL (info
.value
));
14622 else if (info
.shift
)
14623 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
14624 HOST_WIDE_INT_PRINT_HEX
", %s %d", mnemonic
, lane_count
,
14625 element_char
, UINTVAL (info
.value
), shift_op
, info
.shift
);
14627 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
14628 HOST_WIDE_INT_PRINT_HEX
, mnemonic
, lane_count
,
14629 element_char
, UINTVAL (info
.value
));
14633 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
14634 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "bic" : "orr";
14636 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
14637 HOST_WIDE_INT_PRINT_DEC
", %s #%d", mnemonic
, lane_count
,
14638 element_char
, UINTVAL (info
.value
), "lsl", info
.shift
);
14640 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
14641 HOST_WIDE_INT_PRINT_DEC
, mnemonic
, lane_count
,
14642 element_char
, UINTVAL (info
.value
));
14648 aarch64_output_scalar_simd_mov_immediate (rtx immediate
, scalar_int_mode mode
)
14651 /* If a floating point number was passed and we desire to use it in an
14652 integer mode do the conversion to integer. */
14653 if (CONST_DOUBLE_P (immediate
) && GET_MODE_CLASS (mode
) == MODE_INT
)
14655 unsigned HOST_WIDE_INT ival
;
14656 if (!aarch64_reinterpret_float_as_int (immediate
, &ival
))
14657 gcc_unreachable ();
14658 immediate
= gen_int_mode (ival
, mode
);
14661 machine_mode vmode
;
14662 /* use a 64 bit mode for everything except for DI/DF mode, where we use
14663 a 128 bit vector mode. */
14664 int width
= GET_MODE_BITSIZE (mode
) == 64 ? 128 : 64;
14666 vmode
= aarch64_simd_container_mode (mode
, width
);
14667 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (immediate
));
14668 return aarch64_output_simd_mov_immediate (v_op
, width
);
14671 /* Return the output string to use for moving immediate CONST_VECTOR
14672 into an SVE register. */
14675 aarch64_output_sve_mov_immediate (rtx const_vector
)
14677 static char templ
[40];
14678 struct simd_immediate_info info
;
14681 bool is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
);
14682 gcc_assert (is_valid
);
14684 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
14688 snprintf (templ
, sizeof (templ
), "index\t%%0.%c, #"
14689 HOST_WIDE_INT_PRINT_DEC
", #" HOST_WIDE_INT_PRINT_DEC
,
14690 element_char
, INTVAL (info
.value
), INTVAL (info
.step
));
14694 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
14696 if (aarch64_float_const_zero_rtx_p (info
.value
))
14697 info
.value
= GEN_INT (0);
14700 const int buf_size
= 20;
14701 char float_buf
[buf_size
] = {};
14702 real_to_decimal_for_mode (float_buf
,
14703 CONST_DOUBLE_REAL_VALUE (info
.value
),
14704 buf_size
, buf_size
, 1, info
.elt_mode
);
14706 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%c, #%s",
14707 element_char
, float_buf
);
14712 snprintf (templ
, sizeof (templ
), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC
,
14713 element_char
, INTVAL (info
.value
));
14717 /* Return the asm format for a PTRUE instruction whose destination has
14718 mode MODE. SUFFIX is the element size suffix. */
14721 aarch64_output_ptrue (machine_mode mode
, char suffix
)
14723 unsigned int nunits
;
14724 static char buf
[sizeof ("ptrue\t%0.N, vlNNNNN")];
14725 if (GET_MODE_NUNITS (mode
).is_constant (&nunits
))
14726 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, vl%d", suffix
, nunits
);
14728 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, all", suffix
);
14732 /* Split operands into moves from op[1] + op[2] into op[0]. */
14735 aarch64_split_combinev16qi (rtx operands
[3])
14737 unsigned int dest
= REGNO (operands
[0]);
14738 unsigned int src1
= REGNO (operands
[1]);
14739 unsigned int src2
= REGNO (operands
[2]);
14740 machine_mode halfmode
= GET_MODE (operands
[1]);
14741 unsigned int halfregs
= REG_NREGS (operands
[1]);
14742 rtx destlo
, desthi
;
14744 gcc_assert (halfmode
== V16QImode
);
14746 if (src1
== dest
&& src2
== dest
+ halfregs
)
14748 /* No-op move. Can't split to nothing; emit something. */
14749 emit_note (NOTE_INSN_DELETED
);
14753 /* Preserve register attributes for variable tracking. */
14754 destlo
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
, 0);
14755 desthi
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
+ halfregs
,
14756 GET_MODE_SIZE (halfmode
));
14758 /* Special case of reversed high/low parts. */
14759 if (reg_overlap_mentioned_p (operands
[2], destlo
)
14760 && reg_overlap_mentioned_p (operands
[1], desthi
))
14762 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
14763 emit_insn (gen_xorv16qi3 (operands
[2], operands
[1], operands
[2]));
14764 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
14766 else if (!reg_overlap_mentioned_p (operands
[2], destlo
))
14768 /* Try to avoid unnecessary moves if part of the result
14769 is in the right place already. */
14771 emit_move_insn (destlo
, operands
[1]);
14772 if (src2
!= dest
+ halfregs
)
14773 emit_move_insn (desthi
, operands
[2]);
14777 if (src2
!= dest
+ halfregs
)
14778 emit_move_insn (desthi
, operands
[2]);
14780 emit_move_insn (destlo
, operands
[1]);
14784 /* vec_perm support. */
14786 struct expand_vec_perm_d
14788 rtx target
, op0
, op1
;
14789 vec_perm_indices perm
;
14790 machine_mode vmode
;
14791 unsigned int vec_flags
;
14796 /* Generate a variable permutation. */
14799 aarch64_expand_vec_perm_1 (rtx target
, rtx op0
, rtx op1
, rtx sel
)
14801 machine_mode vmode
= GET_MODE (target
);
14802 bool one_vector_p
= rtx_equal_p (op0
, op1
);
14804 gcc_checking_assert (vmode
== V8QImode
|| vmode
== V16QImode
);
14805 gcc_checking_assert (GET_MODE (op0
) == vmode
);
14806 gcc_checking_assert (GET_MODE (op1
) == vmode
);
14807 gcc_checking_assert (GET_MODE (sel
) == vmode
);
14808 gcc_checking_assert (TARGET_SIMD
);
14812 if (vmode
== V8QImode
)
14814 /* Expand the argument to a V16QI mode by duplicating it. */
14815 rtx pair
= gen_reg_rtx (V16QImode
);
14816 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op0
));
14817 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
14821 emit_insn (gen_aarch64_tbl1v16qi (target
, op0
, sel
));
14828 if (vmode
== V8QImode
)
14830 pair
= gen_reg_rtx (V16QImode
);
14831 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op1
));
14832 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
14836 pair
= gen_reg_rtx (OImode
);
14837 emit_insn (gen_aarch64_combinev16qi (pair
, op0
, op1
));
14838 emit_insn (gen_aarch64_tbl2v16qi (target
, pair
, sel
));
14843 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
14844 NELT is the number of elements in the vector. */
14847 aarch64_expand_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
,
14850 machine_mode vmode
= GET_MODE (target
);
14851 bool one_vector_p
= rtx_equal_p (op0
, op1
);
14854 /* The TBL instruction does not use a modulo index, so we must take care
14855 of that ourselves. */
14856 mask
= aarch64_simd_gen_const_vector_dup (vmode
,
14857 one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
14858 sel
= expand_simple_binop (vmode
, AND
, sel
, mask
, NULL
, 0, OPTAB_LIB_WIDEN
);
14860 /* For big-endian, we also need to reverse the index within the vector
14861 (but not which vector). */
14862 if (BYTES_BIG_ENDIAN
)
14864 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
14866 mask
= aarch64_simd_gen_const_vector_dup (vmode
, nelt
- 1);
14867 sel
= expand_simple_binop (vmode
, XOR
, sel
, mask
,
14868 NULL
, 0, OPTAB_LIB_WIDEN
);
14870 aarch64_expand_vec_perm_1 (target
, op0
, op1
, sel
);
14873 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
14876 emit_unspec2 (rtx target
, int code
, rtx op0
, rtx op1
)
14878 emit_insn (gen_rtx_SET (target
,
14879 gen_rtx_UNSPEC (GET_MODE (target
),
14880 gen_rtvec (2, op0
, op1
), code
)));
14883 /* Expand an SVE vec_perm with the given operands. */
14886 aarch64_expand_sve_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
)
14888 machine_mode data_mode
= GET_MODE (target
);
14889 machine_mode sel_mode
= GET_MODE (sel
);
14890 /* Enforced by the pattern condition. */
14891 int nunits
= GET_MODE_NUNITS (sel_mode
).to_constant ();
14893 /* Note: vec_perm indices are supposed to wrap when they go beyond the
14894 size of the two value vectors, i.e. the upper bits of the indices
14895 are effectively ignored. SVE TBL instead produces 0 for any
14896 out-of-range indices, so we need to modulo all the vec_perm indices
14897 to ensure they are all in range. */
14898 rtx sel_reg
= force_reg (sel_mode
, sel
);
14900 /* Check if the sel only references the first values vector. */
14901 if (GET_CODE (sel
) == CONST_VECTOR
14902 && aarch64_const_vec_all_in_range_p (sel
, 0, nunits
- 1))
14904 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_reg
);
14908 /* Check if the two values vectors are the same. */
14909 if (rtx_equal_p (op0
, op1
))
14911 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
, nunits
- 1);
14912 rtx sel_mod
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
14913 NULL
, 0, OPTAB_DIRECT
);
14914 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_mod
);
14918 /* Run TBL on for each value vector and combine the results. */
14920 rtx res0
= gen_reg_rtx (data_mode
);
14921 rtx res1
= gen_reg_rtx (data_mode
);
14922 rtx neg_num_elems
= aarch64_simd_gen_const_vector_dup (sel_mode
, -nunits
);
14923 if (GET_CODE (sel
) != CONST_VECTOR
14924 || !aarch64_const_vec_all_in_range_p (sel
, 0, 2 * nunits
- 1))
14926 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
,
14928 sel_reg
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
14929 NULL
, 0, OPTAB_DIRECT
);
14931 emit_unspec2 (res0
, UNSPEC_TBL
, op0
, sel_reg
);
14932 rtx sel_sub
= expand_simple_binop (sel_mode
, PLUS
, sel_reg
, neg_num_elems
,
14933 NULL
, 0, OPTAB_DIRECT
);
14934 emit_unspec2 (res1
, UNSPEC_TBL
, op1
, sel_sub
);
14935 if (GET_MODE_CLASS (data_mode
) == MODE_VECTOR_INT
)
14936 emit_insn (gen_rtx_SET (target
, gen_rtx_IOR (data_mode
, res0
, res1
)));
14938 emit_unspec2 (target
, UNSPEC_IORF
, res0
, res1
);
14941 /* Recognize patterns suitable for the TRN instructions. */
14943 aarch64_evpc_trn (struct expand_vec_perm_d
*d
)
14946 poly_uint64 nelt
= d
->perm
.length ();
14947 rtx out
, in0
, in1
, x
;
14948 machine_mode vmode
= d
->vmode
;
14950 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
14953 /* Note that these are little-endian tests.
14954 We correct for big-endian later. */
14955 if (!d
->perm
[0].is_constant (&odd
)
14956 || (odd
!= 0 && odd
!= 1)
14957 || !d
->perm
.series_p (0, 2, odd
, 2)
14958 || !d
->perm
.series_p (1, 2, nelt
+ odd
, 2))
14967 /* We don't need a big-endian lane correction for SVE; see the comment
14968 at the head of aarch64-sve.md for details. */
14969 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
14971 x
= in0
, in0
= in1
, in1
= x
;
14976 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
14977 odd
? UNSPEC_TRN2
: UNSPEC_TRN1
));
/* Recognize patterns suitable for the UZP instructions.  */
static bool
aarch64_evpc_uzp (struct expand_vec_perm_d *d)
{
  HOST_WIDE_INT odd;
  rtx out, in0, in1, x;
  machine_mode vmode = d->vmode;

  if (GET_MODE_UNIT_SIZE (vmode) > 8)
    return false;

  /* Note that these are little-endian tests.
     We correct for big-endian later.  */
  if (!d->perm[0].is_constant (&odd)
      || (odd != 0 && odd != 1)
      || !d->perm.series_p (0, 1, odd, 2))
    return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  in0 = d->op0;
  in1 = d->op1;
  /* We don't need a big-endian lane correction for SVE; see the comment
     at the head of aarch64-sve.md for details.  */
  if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
    {
      x = in0, in0 = in1, in1 = x;
      odd = !odd;
    }
  out = d->target;

  emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
                                      odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
  return true;
}
/* Recognize patterns suitable for the ZIP instructions.  */
static bool
aarch64_evpc_zip (struct expand_vec_perm_d *d)
{
  bool high;
  poly_uint64 nelt = d->perm.length ();
  rtx out, in0, in1, x;
  machine_mode vmode = d->vmode;

  if (GET_MODE_UNIT_SIZE (vmode) > 8)
    return false;

  /* Note that these are little-endian tests.
     We correct for big-endian later.  */
  poly_uint64 first = d->perm[0];
  if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
      || !d->perm.series_p (0, 2, first, 1)
      || !d->perm.series_p (1, 2, first + nelt, 1))
    return false;
  high = maybe_ne (first, 0U);

  /* Success!  */
  if (d->testing_p)
    return true;

  in0 = d->op0;
  in1 = d->op1;
  /* We don't need a big-endian lane correction for SVE; see the comment
     at the head of aarch64-sve.md for details.  */
  if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
    {
      x = in0, in0 = in1, in1 = x;
      high = !high;
    }
  out = d->target;

  emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
                                      high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
  return true;
}
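/* For example, with V4SImode operands { a0, a1, a2, a3 } and
   { b0, b1, b2, b3 }, the mask { 0, 4, 1, 5 } interleaves the low halves
   and is matched as ZIP1, while { 2, 6, 3, 7 } interleaves the high halves
   and is matched as ZIP2.  Illustrative little-endian example.  */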
/* Recognize patterns for the EXT insn.  */
static bool
aarch64_evpc_ext (struct expand_vec_perm_d *d)
{
  HOST_WIDE_INT location;
  rtx offset;

  /* The first element always refers to the first vector.
     Check if the extracted indices are increasing by one.  */
  if (d->vec_flags == VEC_SVE_PRED
      || !d->perm[0].is_constant (&location)
      || !d->perm.series_p (0, 1, location, 1))
    return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  /* The case where (location == 0) is a no-op for both big- and little-endian,
     and is removed by the mid-end at optimization levels -O1 and higher.

     We don't need a big-endian lane correction for SVE; see the comment
     at the head of aarch64-sve.md for details.  */
  if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
    {
      /* After setup, we want the high elements of the first vector (stored
         at the LSB end of the register), and the low elements of the second
         vector (stored at the MSB end of the register).  So swap.  */
      std::swap (d->op0, d->op1);
      /* location != 0 (above), so safe to assume (nelt - location) < nelt.
         to_constant () is safe since this is restricted to Advanced SIMD
         vectors.  */
      location = d->perm.length ().to_constant () - location;
    }

  offset = GEN_INT (location);
  emit_set_insn (d->target,
                 gen_rtx_UNSPEC (d->vmode,
                                 gen_rtvec (3, d->op0, d->op1, offset),
                                 UNSPEC_EXT));
  return true;
}
/* Recognize patterns for the REV{64,32,16} insns, which reverse elements
   within each 64-bit, 32-bit or 16-bit granule.  */

static bool
aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
{
  HOST_WIDE_INT diff;
  unsigned int i, size, unspec;
  machine_mode pred_mode;

  if (d->vec_flags == VEC_SVE_PRED
      || !d->one_vector_p
      || !d->perm[0].is_constant (&diff))
    return false;

  size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
  if (size == 8)
    {
      unspec = UNSPEC_REV64;
      pred_mode = VNx2BImode;
    }
  else if (size == 4)
    {
      unspec = UNSPEC_REV32;
      pred_mode = VNx4BImode;
    }
  else if (size == 2)
    {
      unspec = UNSPEC_REV16;
      pred_mode = VNx8BImode;
    }
  else
    return false;

  unsigned int step = diff + 1;
  for (i = 0; i < step; ++i)
    if (!d->perm.series_p (i, step, diff - i, step))
      return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
  if (d->vec_flags == VEC_SVE_DATA)
    {
      rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
      src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
                            UNSPEC_MERGE_PTRUE);
    }
  emit_set_insn (d->target, src);
  return true;
}
/* Recognize patterns for the REV insn, which reverses elements within
   a full vector.  */

static bool
aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
{
  poly_uint64 nelt = d->perm.length ();

  if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
    return false;

  if (!d->perm.series_p (0, 1, nelt - 1, -1))
    return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
  emit_set_insn (d->target, src);
  return true;
}
static bool
aarch64_evpc_dup (struct expand_vec_perm_d *d)
{
  rtx out = d->target;
  rtx in0;
  HOST_WIDE_INT elt;
  machine_mode vmode = d->vmode;
  rtx lane;

  if (d->vec_flags == VEC_SVE_PRED
      || d->perm.encoding ().encoded_nelts () != 1
      || !d->perm[0].is_constant (&elt))
    return false;

  if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
    return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  /* The generic preparation in aarch64_expand_vec_perm_const_1
     swaps the operand order and the permute indices if it finds
     d->perm[0] to be in the second operand.  Thus, we can always
     use d->op0 and need not do any extra arithmetic to get the
     correct lane number.  */
  in0 = d->op0;
  lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */

  rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
  rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
  emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
  return true;
}
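/* For example, the duplicate-encoded constant selector { 2, 2, 2, 2 } on a
   V4SImode operand is matched here and becomes a single DUP of lane 2 of
   OP0.  (Illustrative.)  */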
static bool
aarch64_evpc_tbl (struct expand_vec_perm_d *d)
{
  rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
  machine_mode vmode = d->vmode;

  /* Make sure that the indices are constant.  */
  unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
  for (unsigned int i = 0; i < encoded_nelts; ++i)
    if (!d->perm[i].is_constant ())
      return false;

  if (d->testing_p)
    return true;

  /* Generic code will try constant permutation twice.  Once with the
     original mode and again with the elements lowered to QImode.
     So wait and don't do the selector expansion ourselves.  */
  if (vmode != V8QImode && vmode != V16QImode)
    return false;

  /* to_constant is safe since this routine is specific to Advanced SIMD
     vectors.  */
  unsigned int nelt = d->perm.length ().to_constant ();
  for (unsigned int i = 0; i < nelt; ++i)
    /* If big-endian and two vectors we end up with a weird mixed-endian
       mode on NEON.  Reverse the index within each word but not the word
       itself.  to_constant is safe because we checked is_constant above.  */
    rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
                        ? d->perm[i].to_constant () ^ (nelt - 1)
                        : d->perm[i].to_constant ());

  sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
  sel = force_reg (vmode, sel);

  aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
  return true;
}
/* Try to implement D using an SVE TBL instruction.  */

static bool
aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
{
  unsigned HOST_WIDE_INT nelt;

  /* Permuting two variable-length vectors could overflow the
     index range.  */
  if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
    return false;

  if (d->testing_p)
    return true;

  machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
  rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
  aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
  return true;
}
static bool
aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
  /* The pattern matching functions above are written to look for a small
     number to begin the sequence (0, 1, N/2).  If we begin with an index
     from the second operand, we can swap the operands.  */
  poly_int64 nelt = d->perm.length ();
  if (known_ge (d->perm[0], nelt))
    {
      d->perm.rotate_inputs (1);
      std::swap (d->op0, d->op1);
    }

  if ((d->vec_flags == VEC_ADVSIMD
       || d->vec_flags == VEC_SVE_DATA
       || d->vec_flags == VEC_SVE_PRED)
      && known_gt (nelt, 1))
    {
      if (aarch64_evpc_rev_local (d))
        return true;
      else if (aarch64_evpc_rev_global (d))
        return true;
      else if (aarch64_evpc_ext (d))
        return true;
      else if (aarch64_evpc_dup (d))
        return true;
      else if (aarch64_evpc_zip (d))
        return true;
      else if (aarch64_evpc_uzp (d))
        return true;
      else if (aarch64_evpc_trn (d))
        return true;
      if (d->vec_flags == VEC_SVE_DATA)
        return aarch64_evpc_sve_tbl (d);
      else if (d->vec_flags == VEC_ADVSIMD)
        return aarch64_evpc_tbl (d);
    }
  return false;
}
/* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */

static bool
aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
                                  rtx op1, const vec_perm_indices &sel)
{
  struct expand_vec_perm_d d;

  /* Check whether the mask can be applied to a single vector.  */
  if (op0 && rtx_equal_p (op0, op1))
    d.one_vector_p = true;
  else if (sel.all_from_input_p (0))
    {
      d.one_vector_p = true;
      op1 = op0;
    }
  else if (sel.all_from_input_p (1))
    {
      d.one_vector_p = true;
      op0 = op1;
    }
  else
    d.one_vector_p = false;

  d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
                     sel.nelts_per_input ());
  d.vmode = vmode;
  d.vec_flags = aarch64_classify_vector_mode (d.vmode);
  d.target = target;
  d.op0 = op0;
  d.op1 = op1;
  d.testing_p = !target;

  if (!d.testing_p)
    return aarch64_expand_vec_perm_const_1 (&d);

  rtx_insn *last = get_last_insn ();
  bool ret = aarch64_expand_vec_perm_const_1 (&d);
  gcc_assert (last == get_last_insn ());

  return ret;
}
/* Generate a byte permute mask for a register of mode MODE,
   which has NUNITS units.  */

rtx
aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
{
  /* We have to reverse each vector because we don't have
     a permuted load that can reverse-load according to ABI rules.  */
  rtx mask;
  rtvec v = rtvec_alloc (16);
  unsigned int i, j;
  unsigned int usize = GET_MODE_UNIT_SIZE (mode);

  gcc_assert (BYTES_BIG_ENDIAN);
  gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));

  for (i = 0; i < nunits; i++)
    for (j = 0; j < usize; j++)
      RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
  mask = gen_rtx_CONST_VECTOR (V16QImode, v);
  return force_reg (V16QImode, mask);
}
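/* For example, for V8HImode (usize == 2, nunits == 8) the generated byte
   mask is { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }, i.e.
   the two bytes of each 16-bit element are swapped.  (Illustrative.)  */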
/* Return true if X is a valid second operand for the SVE instruction
   that implements integer comparison OP_CODE.  */

static bool
aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
{
  if (register_operand (x, VOIDmode))
    return true;

  switch (op_code)
    {
    case LTU:
    case LEU:
    case GEU:
    case GTU:
      return aarch64_sve_cmp_immediate_p (x, false);
    case LT:
    case LE:
    case GE:
    case GT:
    case NE:
    case EQ:
      return aarch64_sve_cmp_immediate_p (x, true);
    default:
      gcc_unreachable ();
    }
}

/* Return the UNSPEC_COND_* code for comparison CODE.  */

static unsigned int
aarch64_unspec_cond_code (rtx_code code)
{
  switch (code)
    {
    case NE:
      return UNSPEC_COND_NE;
    case EQ:
      return UNSPEC_COND_EQ;
    case LT:
      return UNSPEC_COND_LT;
    case GT:
      return UNSPEC_COND_GT;
    case LE:
      return UNSPEC_COND_LE;
    case GE:
      return UNSPEC_COND_GE;
    case LTU:
      return UNSPEC_COND_LO;
    case GTU:
      return UNSPEC_COND_HI;
    case LEU:
      return UNSPEC_COND_LS;
    case GEU:
      return UNSPEC_COND_HS;
    case UNORDERED:
      return UNSPEC_COND_UO;
    default:
      gcc_unreachable ();
    }
}
/* Return an (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>) expression,
   where <X> is the operation associated with comparison CODE.  */

static rtx
aarch64_gen_unspec_cond (rtx_code code, machine_mode pred_mode,
                         rtx pred, rtx op0, rtx op1)
{
  rtvec vec = gen_rtvec (3, pred, op0, op1);
  return gen_rtx_UNSPEC (pred_mode, vec, aarch64_unspec_cond_code (code));
}

/* Expand an SVE integer comparison:

     TARGET = CODE (OP0, OP1).  */

void
aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
{
  machine_mode pred_mode = GET_MODE (target);
  machine_mode data_mode = GET_MODE (op0);

  if (!aarch64_sve_cmp_operand_p (code, op1))
    op1 = force_reg (data_mode, op1);

  rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
  rtx unspec = aarch64_gen_unspec_cond (code, pred_mode, ptrue, op0, op1);
  emit_insn (gen_set_clobber_cc (target, unspec));
}
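/* Sketch of the RTL emitted above (illustrative, not the exact output):
   for VNx4SImode operands and CODE == GT, the insn is roughly

     (parallel [(set (reg:VNx4BI target)
                     (unspec:VNx4BI [ptrue op0 op1] UNSPEC_COND_GT))
                (clobber (reg:CC CC_REGNUM))])

   which later matches the predicated SVE compare patterns in
   aarch64-sve.md.  */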
/* Emit an instruction:

      (set TARGET (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))

   where <X> is the operation associated with comparison CODE.  */

static void
aarch64_emit_unspec_cond (rtx target, rtx_code code, machine_mode pred_mode,
                          rtx pred, rtx op0, rtx op1)
{
  rtx unspec = aarch64_gen_unspec_cond (code, pred_mode, pred, op0, op1);
  emit_set_insn (target, unspec);
}

/* Emit:

      (set TMP1 (unspec:PRED_MODE [PTRUE OP0 OP1] UNSPEC_COND_<X1>))
      (set TMP2 (unspec:PRED_MODE [PTRUE OP0 OP1] UNSPEC_COND_<X2>))
      (set TARGET (and:PRED_MODE (ior:PRED_MODE TMP1 TMP2) PTRUE))

   where <Xi> is the operation associated with comparison CODEi.  */

static void
aarch64_emit_unspec_cond_or (rtx target, rtx_code code1, rtx_code code2,
                             machine_mode pred_mode, rtx ptrue,
                             rtx op0, rtx op1)
{
  rtx tmp1 = gen_reg_rtx (pred_mode);
  aarch64_emit_unspec_cond (tmp1, code1, pred_mode, ptrue, op0, op1);
  rtx tmp2 = gen_reg_rtx (pred_mode);
  aarch64_emit_unspec_cond (tmp2, code2, pred_mode, ptrue, op0, op1);
  emit_set_insn (target, gen_rtx_AND (pred_mode,
                                      gen_rtx_IOR (pred_mode, tmp1, tmp2),
                                      ptrue));
}

/* If CAN_INVERT_P, emit an instruction:

      (set TARGET (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))

   where <X> is the operation associated with comparison CODE.  Otherwise
   emit:

      (set TMP (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
      (set TARGET (and:PRED_MODE (not:PRED_MODE TMP) PTRUE))

   where the second instruction sets TARGET to the inverse of TMP.  */

static void
aarch64_emit_inverted_unspec_cond (rtx target, rtx_code code,
                                   machine_mode pred_mode, rtx ptrue, rtx pred,
                                   rtx op0, rtx op1, bool can_invert_p)
{
  if (can_invert_p)
    aarch64_emit_unspec_cond (target, code, pred_mode, pred, op0, op1);
  else
    {
      rtx tmp = gen_reg_rtx (pred_mode);
      aarch64_emit_unspec_cond (tmp, code, pred_mode, pred, op0, op1);
      emit_set_insn (target, gen_rtx_AND (pred_mode,
                                          gen_rtx_NOT (pred_mode, tmp),
                                          ptrue));
    }
}
15538 /* Expand an SVE floating-point comparison:
15540 TARGET = CODE (OP0, OP1)
15542 If CAN_INVERT_P is true, the caller can also handle inverted results;
15543 return true if the result is in fact inverted. */
15546 aarch64_expand_sve_vec_cmp_float (rtx target
, rtx_code code
,
15547 rtx op0
, rtx op1
, bool can_invert_p
)
15549 machine_mode pred_mode
= GET_MODE (target
);
15550 machine_mode data_mode
= GET_MODE (op0
);
15552 rtx ptrue
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
15556 /* UNORDERED has no immediate form. */
15557 op1
= force_reg (data_mode
, op1
);
15558 aarch64_emit_unspec_cond (target
, code
, pred_mode
, ptrue
, op0
, op1
);
15567 /* There is native support for the comparison. */
15568 aarch64_emit_unspec_cond (target
, code
, pred_mode
, ptrue
, op0
, op1
);
15572 /* There is native support for the inverse comparison. */
15573 op1
= force_reg (data_mode
, op1
);
15574 aarch64_emit_inverted_unspec_cond (target
, UNORDERED
,
15575 pred_mode
, ptrue
, ptrue
, op0
, op1
,
15577 return can_invert_p
;
15580 /* This is a trapping operation (LT or GT). */
15581 aarch64_emit_unspec_cond_or (target
, LT
, GT
, pred_mode
, ptrue
, op0
, op1
);
15585 if (!flag_trapping_math
)
15587 /* This would trap for signaling NaNs. */
15588 op1
= force_reg (data_mode
, op1
);
15589 aarch64_emit_unspec_cond_or (target
, UNORDERED
, EQ
,
15590 pred_mode
, ptrue
, op0
, op1
);
15600 rtx ordered
= ptrue
;
15601 if (flag_trapping_math
)
15603 /* Only compare the elements that are known to be ordered. */
15604 ordered
= gen_reg_rtx (pred_mode
);
15605 op1
= force_reg (data_mode
, op1
);
15606 aarch64_emit_inverted_unspec_cond (ordered
, UNORDERED
, pred_mode
,
15607 ptrue
, ptrue
, op0
, op1
, false);
15612 code
= reverse_condition_maybe_unordered (code
);
15613 aarch64_emit_inverted_unspec_cond (target
, code
, pred_mode
, ptrue
,
15614 ordered
, op0
, op1
, can_invert_p
);
15615 return can_invert_p
;
15619 gcc_unreachable ();
/* Expand an SVE vcond pattern with operands OPS.  DATA_MODE is the mode
   of the data being selected and CMP_MODE is the mode of the values being
   compared.  */

void
aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
                          rtx *ops)
{
  machine_mode pred_mode
    = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
                             GET_MODE_SIZE (cmp_mode)).require ();
  rtx pred = gen_reg_rtx (pred_mode);
  if (FLOAT_MODE_P (cmp_mode))
    {
      if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
                                            ops[4], ops[5], true))
        std::swap (ops[1], ops[2]);
    }
  else
    aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);

  rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
  emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
}
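/* For example, a vector conditional such as r = (a > b) ? x : y is
   expanded here by first materialising the comparison result in a
   predicate register and then emitting
   (set r (unspec [pred x y] UNSPEC_SEL)), which becomes an SVE SEL
   instruction.  (Illustrative sketch.)  */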
15648 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
15649 true. However due to issues with register allocation it is preferable
15650 to avoid tieing integer scalar and FP scalar modes. Executing integer
15651 operations in general registers is better than treating them as scalar
15652 vector operations. This reduces latency and avoids redundant int<->FP
15653 moves. So tie modes if they are either the same class, or vector modes
15654 with other vector modes, vector structs or any scalar mode. */
15657 aarch64_modes_tieable_p (machine_mode mode1
, machine_mode mode2
)
15659 if (GET_MODE_CLASS (mode1
) == GET_MODE_CLASS (mode2
))
15662 /* We specifically want to allow elements of "structure" modes to
15663 be tieable to the structure. This more general condition allows
15664 other rarer situations too. The reason we don't extend this to
15665 predicate modes is that there are no predicate structure modes
15666 nor any specific instructions for extracting part of a predicate
15668 if (aarch64_vector_data_mode_p (mode1
)
15669 && aarch64_vector_data_mode_p (mode2
))
15672 /* Also allow any scalar modes with vectors. */
15673 if (aarch64_vector_mode_supported_p (mode1
)
15674 || aarch64_vector_mode_supported_p (mode2
))
15680 /* Return a new RTX holding the result of moving POINTER forward by
15684 aarch64_move_pointer (rtx pointer
, poly_int64 amount
)
15686 rtx next
= plus_constant (Pmode
, XEXP (pointer
, 0), amount
);
15688 return adjust_automodify_address (pointer
, GET_MODE (pointer
),
15692 /* Return a new RTX holding the result of moving POINTER forward by the
15693 size of the mode it points to. */
15696 aarch64_progress_pointer (rtx pointer
)
15698 return aarch64_move_pointer (pointer
, GET_MODE_SIZE (GET_MODE (pointer
)));
/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
   MODE bytes.  */

static void
aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
                                              machine_mode mode)
{
  rtx reg = gen_reg_rtx (mode);

  /* "Cast" the pointers to the correct mode.  */
  *src = adjust_address (*src, mode, 0);
  *dst = adjust_address (*dst, mode, 0);
  /* Emit the memcpy.  */
  emit_move_insn (reg, *src);
  emit_move_insn (*dst, reg);
  /* Move the pointers forward.  */
  *src = aarch64_progress_pointer (*src);
  *dst = aarch64_progress_pointer (*dst);
}
15721 /* Expand movmem, as if from a __builtin_memcpy. Return true if
15722 we succeed, otherwise return false. */
15725 aarch64_expand_movmem (rtx
*operands
)
15728 rtx dst
= operands
[0];
15729 rtx src
= operands
[1];
15731 bool speed_p
= !optimize_function_for_size_p (cfun
);
15733 /* When optimizing for size, give a better estimate of the length of a
15734 memcpy call, but use the default otherwise. */
15735 unsigned int max_instructions
= (speed_p
? 15 : AARCH64_CALL_RATIO
) / 2;
15737 /* We can't do anything smart if the amount to copy is not constant. */
15738 if (!CONST_INT_P (operands
[2]))
15741 n
= UINTVAL (operands
[2]);
15743 /* Try to keep the number of instructions low. For cases below 16 bytes we
15744 need to make at most two moves. For cases above 16 bytes it will be one
15745 move for each 16 byte chunk, then at most two additional moves. */
15746 if (((n
/ 16) + (n
% 16 ? 2 : 0)) > max_instructions
)
15749 base
= copy_to_mode_reg (Pmode
, XEXP (dst
, 0));
15750 dst
= adjust_automodify_address (dst
, VOIDmode
, base
, 0);
15752 base
= copy_to_mode_reg (Pmode
, XEXP (src
, 0));
15753 src
= adjust_automodify_address (src
, VOIDmode
, base
, 0);
15755 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
15761 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, HImode
);
15766 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, QImode
);
15771 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
15772 4-byte chunk, partially overlapping with the previously copied chunk. */
15775 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
15781 src
= aarch64_move_pointer (src
, move
);
15782 dst
= aarch64_move_pointer (dst
, move
);
15783 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
15788 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
15789 them, then (if applicable) an 8-byte chunk. */
15794 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, TImode
);
15799 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, DImode
);
15804 /* Finish the final bytes of the copy. We can always do this in one
15805 instruction. We either copy the exact amount we need, or partially
15806 overlap with the previous chunk we copied and copy 8-bytes. */
15810 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, QImode
);
15812 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, HImode
);
15814 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
15819 src
= aarch64_move_pointer (src
, -1);
15820 dst
= aarch64_move_pointer (dst
, -1);
15821 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
15827 src
= aarch64_move_pointer (src
, move
);
15828 dst
= aarch64_move_pointer (dst
, move
);
15829 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, DImode
);
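/* As an illustrative walk-through of the copy expansion above: a 15-byte
   copy emits one 8-byte (DImode) block copy, and then, since 7 bytes
   remain, a second 8-byte copy with both pointers moved back by one byte,
   so the two accesses overlap by one byte instead of issuing separate
   4/2/1-byte tail copies.  */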
15836 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
15837 SImode stores. Handle the case when the constant has identical
15838 bottom and top halves. This is beneficial when the two stores can be
15839 merged into an STP and we avoid synthesising potentially expensive
15840 immediates twice. Return true if such a split is possible. */
15843 aarch64_split_dimode_const_store (rtx dst
, rtx src
)
15845 rtx lo
= gen_lowpart (SImode
, src
);
15846 rtx hi
= gen_highpart_mode (SImode
, DImode
, src
);
15848 bool size_p
= optimize_function_for_size_p (cfun
);
15850 if (!rtx_equal_p (lo
, hi
))
15853 unsigned int orig_cost
15854 = aarch64_internal_mov_immediate (NULL_RTX
, src
, false, DImode
);
15855 unsigned int lo_cost
15856 = aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, SImode
);
15858 /* We want to transform:
15860 MOVK x1, 0x140, lsl 16
15861 MOVK x1, 0xc0da, lsl 32
15862 MOVK x1, 0x140, lsl 48
15866 MOVK w1, 0x140, lsl 16
15868 So we want to perform this only when we save two instructions
15869 or more. When optimizing for size, however, accept any code size
15871 if (size_p
&& orig_cost
<= lo_cost
)
15875 && (orig_cost
<= lo_cost
+ 1))
15878 rtx mem_lo
= adjust_address (dst
, SImode
, 0);
15879 if (!aarch64_mem_pair_operand (mem_lo
, SImode
))
15882 rtx tmp_reg
= gen_reg_rtx (SImode
);
15883 aarch64_expand_mov_immediate (tmp_reg
, lo
);
15884 rtx mem_hi
= aarch64_move_pointer (mem_lo
, GET_MODE_SIZE (SImode
));
15885 /* Don't emit an explicit store pair as this may not be always profitable.
15886 Let the sched-fusion logic decide whether to merge them. */
15887 emit_move_insn (mem_lo
, tmp_reg
);
15888 emit_move_insn (mem_hi
, tmp_reg
);
15893 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
15895 static unsigned HOST_WIDE_INT
15896 aarch64_asan_shadow_offset (void)
15898 return (HOST_WIDE_INT_1
<< 36);
15902 aarch64_gen_ccmp_first (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
,
15903 int code
, tree treeop0
, tree treeop1
)
15905 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
15907 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
15909 struct expand_operand ops
[4];
15912 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
15914 op_mode
= GET_MODE (op0
);
15915 if (op_mode
== VOIDmode
)
15916 op_mode
= GET_MODE (op1
);
15924 icode
= CODE_FOR_cmpsi
;
15929 icode
= CODE_FOR_cmpdi
;
15934 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
15935 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpesf
: CODE_FOR_fcmpsf
;
15940 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
15941 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpedf
: CODE_FOR_fcmpdf
;
15949 op0
= prepare_operand (icode
, op0
, 0, op_mode
, cmp_mode
, unsignedp
);
15950 op1
= prepare_operand (icode
, op1
, 1, op_mode
, cmp_mode
, unsignedp
);
15956 *prep_seq
= get_insns ();
15959 create_fixed_operand (&ops
[0], op0
);
15960 create_fixed_operand (&ops
[1], op1
);
15963 if (!maybe_expand_insn (icode
, 2, ops
))
15968 *gen_seq
= get_insns ();
15971 return gen_rtx_fmt_ee ((rtx_code
) code
, cc_mode
,
15972 gen_rtx_REG (cc_mode
, CC_REGNUM
), const0_rtx
);
15976 aarch64_gen_ccmp_next (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
, rtx prev
,
15977 int cmp_code
, tree treeop0
, tree treeop1
, int bit_code
)
15979 rtx op0
, op1
, target
;
15980 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
15981 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
15983 struct expand_operand ops
[6];
15986 push_to_sequence (*prep_seq
);
15987 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
15989 op_mode
= GET_MODE (op0
);
15990 if (op_mode
== VOIDmode
)
15991 op_mode
= GET_MODE (op1
);
15999 icode
= CODE_FOR_ccmpsi
;
16004 icode
= CODE_FOR_ccmpdi
;
16009 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
16010 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpesf
: CODE_FOR_fccmpsf
;
16015 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
16016 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpedf
: CODE_FOR_fccmpdf
;
16024 op0
= prepare_operand (icode
, op0
, 2, op_mode
, cmp_mode
, unsignedp
);
16025 op1
= prepare_operand (icode
, op1
, 3, op_mode
, cmp_mode
, unsignedp
);
16031 *prep_seq
= get_insns ();
16034 target
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
16035 aarch64_cond
= aarch64_get_condition_code_1 (cc_mode
, (rtx_code
) cmp_code
);
16037 if (bit_code
!= AND
)
16039 prev
= gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev
),
16040 GET_MODE (XEXP (prev
, 0))),
16041 VOIDmode
, XEXP (prev
, 0), const0_rtx
);
16042 aarch64_cond
= AARCH64_INVERSE_CONDITION_CODE (aarch64_cond
);
16045 create_fixed_operand (&ops
[0], XEXP (prev
, 0));
16046 create_fixed_operand (&ops
[1], target
);
16047 create_fixed_operand (&ops
[2], op0
);
16048 create_fixed_operand (&ops
[3], op1
);
16049 create_fixed_operand (&ops
[4], prev
);
16050 create_fixed_operand (&ops
[5], GEN_INT (aarch64_cond
));
16052 push_to_sequence (*gen_seq
);
16053 if (!maybe_expand_insn (icode
, 6, ops
))
16059 *gen_seq
= get_insns ();
16062 return gen_rtx_fmt_ee ((rtx_code
) cmp_code
, VOIDmode
, target
, const0_rtx
);
16065 #undef TARGET_GEN_CCMP_FIRST
16066 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16068 #undef TARGET_GEN_CCMP_NEXT
16069 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16071 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
16072 instruction fusion of some sort. */
16075 aarch64_macro_fusion_p (void)
16077 return aarch64_tune_params
.fusible_ops
!= AARCH64_FUSE_NOTHING
;
16081 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
16082 should be kept together during scheduling. */
16085 aarch_macro_fusion_pair_p (rtx_insn
*prev
, rtx_insn
*curr
)
16088 rtx prev_set
= single_set (prev
);
16089 rtx curr_set
= single_set (curr
);
16090 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
16091 bool simple_sets_p
= prev_set
&& curr_set
&& !any_condjump_p (curr
);
16093 if (!aarch64_macro_fusion_p ())
16096 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK
))
16098 /* We are trying to match:
16099 prev (mov) == (set (reg r0) (const_int imm16))
16100 curr (movk) == (set (zero_extract (reg r0)
16103 (const_int imm16_1)) */
16105 set_dest
= SET_DEST (curr_set
);
16107 if (GET_CODE (set_dest
) == ZERO_EXTRACT
16108 && CONST_INT_P (SET_SRC (curr_set
))
16109 && CONST_INT_P (SET_SRC (prev_set
))
16110 && CONST_INT_P (XEXP (set_dest
, 2))
16111 && INTVAL (XEXP (set_dest
, 2)) == 16
16112 && REG_P (XEXP (set_dest
, 0))
16113 && REG_P (SET_DEST (prev_set
))
16114 && REGNO (XEXP (set_dest
, 0)) == REGNO (SET_DEST (prev_set
)))
16120 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD
))
16123 /* We're trying to match:
16124 prev (adrp) == (set (reg r1)
16125 (high (symbol_ref ("SYM"))))
16126 curr (add) == (set (reg r0)
16128 (symbol_ref ("SYM"))))
16129 Note that r0 need not necessarily be the same as r1, especially
16130 during pre-regalloc scheduling. */
16132 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
16133 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
16135 if (GET_CODE (SET_SRC (curr_set
)) == LO_SUM
16136 && REG_P (XEXP (SET_SRC (curr_set
), 0))
16137 && REGNO (XEXP (SET_SRC (curr_set
), 0))
16138 == REGNO (SET_DEST (prev_set
))
16139 && rtx_equal_p (XEXP (SET_SRC (prev_set
), 0),
16140 XEXP (SET_SRC (curr_set
), 1)))
16145 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK
))
16148 /* We're trying to match:
16149 prev (movk) == (set (zero_extract (reg r0)
16152 (const_int imm16_1))
16153 curr (movk) == (set (zero_extract (reg r0)
16156 (const_int imm16_2)) */
16158 if (GET_CODE (SET_DEST (prev_set
)) == ZERO_EXTRACT
16159 && GET_CODE (SET_DEST (curr_set
)) == ZERO_EXTRACT
16160 && REG_P (XEXP (SET_DEST (prev_set
), 0))
16161 && REG_P (XEXP (SET_DEST (curr_set
), 0))
16162 && REGNO (XEXP (SET_DEST (prev_set
), 0))
16163 == REGNO (XEXP (SET_DEST (curr_set
), 0))
16164 && CONST_INT_P (XEXP (SET_DEST (prev_set
), 2))
16165 && CONST_INT_P (XEXP (SET_DEST (curr_set
), 2))
16166 && INTVAL (XEXP (SET_DEST (prev_set
), 2)) == 32
16167 && INTVAL (XEXP (SET_DEST (curr_set
), 2)) == 48
16168 && CONST_INT_P (SET_SRC (prev_set
))
16169 && CONST_INT_P (SET_SRC (curr_set
)))
16173 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR
))
16175 /* We're trying to match:
16176 prev (adrp) == (set (reg r0)
16177 (high (symbol_ref ("SYM"))))
16178 curr (ldr) == (set (reg r1)
16179 (mem (lo_sum (reg r0)
16180 (symbol_ref ("SYM")))))
16182 curr (ldr) == (set (reg r1)
16185 (symbol_ref ("SYM")))))) */
16186 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
16187 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
16189 rtx curr_src
= SET_SRC (curr_set
);
16191 if (GET_CODE (curr_src
) == ZERO_EXTEND
)
16192 curr_src
= XEXP (curr_src
, 0);
16194 if (MEM_P (curr_src
) && GET_CODE (XEXP (curr_src
, 0)) == LO_SUM
16195 && REG_P (XEXP (XEXP (curr_src
, 0), 0))
16196 && REGNO (XEXP (XEXP (curr_src
, 0), 0))
16197 == REGNO (SET_DEST (prev_set
))
16198 && rtx_equal_p (XEXP (XEXP (curr_src
, 0), 1),
16199 XEXP (SET_SRC (prev_set
), 0)))
16204 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC
)
16205 && aarch_crypto_can_dual_issue (prev
, curr
))
16208 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH
)
16209 && any_condjump_p (curr
))
16211 enum attr_type prev_type
= get_attr_type (prev
);
16213 unsigned int condreg1
, condreg2
;
16215 aarch64_fixed_condition_code_regs (&condreg1
, &condreg2
);
16216 cc_reg_1
= gen_rtx_REG (CCmode
, condreg1
);
16218 if (reg_referenced_p (cc_reg_1
, PATTERN (curr
))
16220 && modified_in_p (cc_reg_1
, prev
))
16222 /* FIXME: this misses some which is considered simple arthematic
16223 instructions for ThunderX. Simple shifts are missed here. */
16224 if (prev_type
== TYPE_ALUS_SREG
16225 || prev_type
== TYPE_ALUS_IMM
16226 || prev_type
== TYPE_LOGICS_REG
16227 || prev_type
== TYPE_LOGICS_IMM
)
16234 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH
)
16235 && any_condjump_p (curr
))
16237 /* We're trying to match:
16238 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16239 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
16241 (label_ref ("SYM"))
16243 if (SET_DEST (curr_set
) == (pc_rtx
)
16244 && GET_CODE (SET_SRC (curr_set
)) == IF_THEN_ELSE
16245 && REG_P (XEXP (XEXP (SET_SRC (curr_set
), 0), 0))
16246 && REG_P (SET_DEST (prev_set
))
16247 && REGNO (SET_DEST (prev_set
))
16248 == REGNO (XEXP (XEXP (SET_SRC (curr_set
), 0), 0)))
16250 /* Fuse ALU operations followed by conditional branch instruction. */
16251 switch (get_attr_type (prev
))
16254 case TYPE_ALU_SREG
:
16257 case TYPE_ADCS_REG
:
16258 case TYPE_ADCS_IMM
:
16259 case TYPE_LOGIC_REG
:
16260 case TYPE_LOGIC_IMM
:
16264 case TYPE_SHIFT_REG
:
16265 case TYPE_SHIFT_IMM
:
/* Return true iff the instruction fusion described by OP is enabled.  */

bool
aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
{
  return (aarch64_tune_params.fusible_ops & op) != 0;
}

/* If MEM is in the form of [base+offset], extract the two parts
   of address and set to BASE and OFFSET, otherwise return false
   after clearing BASE and OFFSET.  */

static bool
extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
{
  rtx addr;

  gcc_assert (MEM_P (mem));

  addr = XEXP (mem, 0);

  if (REG_P (addr))
    {
      *base = addr;
      *offset = const0_rtx;
      return true;
    }

  if (GET_CODE (addr) == PLUS
      && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
    {
      *base = XEXP (addr, 0);
      *offset = XEXP (addr, 1);
      return true;
    }

  *base = NULL_RTX;
  *offset = NULL_RTX;

  return false;
}
16323 enum sched_fusion_type
16325 SCHED_FUSION_NONE
= 0,
16326 SCHED_FUSION_LD_SIGN_EXTEND
,
16327 SCHED_FUSION_LD_ZERO_EXTEND
,
16333 /* If INSN is a load or store of address in the form of [base+offset],
16334 extract the two parts and set to BASE and OFFSET. Return scheduling
16335 fusion type this INSN is. */
16337 static enum sched_fusion_type
16338 fusion_load_store (rtx_insn
*insn
, rtx
*base
, rtx
*offset
)
16341 enum sched_fusion_type fusion
= SCHED_FUSION_LD
;
16343 gcc_assert (INSN_P (insn
));
16344 x
= PATTERN (insn
);
16345 if (GET_CODE (x
) != SET
)
16346 return SCHED_FUSION_NONE
;
16349 dest
= SET_DEST (x
);
16351 machine_mode dest_mode
= GET_MODE (dest
);
16353 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode
))
16354 return SCHED_FUSION_NONE
;
16356 if (GET_CODE (src
) == SIGN_EXTEND
)
16358 fusion
= SCHED_FUSION_LD_SIGN_EXTEND
;
16359 src
= XEXP (src
, 0);
16360 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
16361 return SCHED_FUSION_NONE
;
16363 else if (GET_CODE (src
) == ZERO_EXTEND
)
16365 fusion
= SCHED_FUSION_LD_ZERO_EXTEND
;
16366 src
= XEXP (src
, 0);
16367 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
16368 return SCHED_FUSION_NONE
;
16371 if (GET_CODE (src
) == MEM
&& REG_P (dest
))
16372 extract_base_offset_in_addr (src
, base
, offset
);
16373 else if (GET_CODE (dest
) == MEM
&& (REG_P (src
) || src
== const0_rtx
))
16375 fusion
= SCHED_FUSION_ST
;
16376 extract_base_offset_in_addr (dest
, base
, offset
);
16379 return SCHED_FUSION_NONE
;
16381 if (*base
== NULL_RTX
|| *offset
== NULL_RTX
)
16382 fusion
= SCHED_FUSION_NONE
;
16387 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16389 Currently we only support to fuse ldr or str instructions, so FUSION_PRI
16390 and PRI are only calculated for these instructions. For other instruction,
16391 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
16392 type instruction fusion can be added by returning different priorities.
16394 It's important that irrelevant instructions get the largest FUSION_PRI. */
16397 aarch64_sched_fusion_priority (rtx_insn
*insn
, int max_pri
,
16398 int *fusion_pri
, int *pri
)
16402 enum sched_fusion_type fusion
;
16404 gcc_assert (INSN_P (insn
));
16407 fusion
= fusion_load_store (insn
, &base
, &offset
);
16408 if (fusion
== SCHED_FUSION_NONE
)
16415 /* Set FUSION_PRI according to fusion type and base register. */
16416 *fusion_pri
= tmp
- fusion
* FIRST_PSEUDO_REGISTER
- REGNO (base
);
16418 /* Calculate PRI. */
16421 /* INSN with smaller offset goes first. */
16422 off_val
= (int)(INTVAL (offset
));
16424 tmp
-= (off_val
& 0xfffff);
16426 tmp
+= ((- off_val
) & 0xfffff);
/* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
   Adjust priority of sha1h instructions so they are scheduled before
   other SHA1 instructions.  */

static int
aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
{
  rtx x = PATTERN (insn);

  if (GET_CODE (x) == SET)
    {
      x = SET_SRC (x);

      if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
        return priority + 10;
    }

  return priority;
}
16452 /* Given OPERANDS of consecutive load/store, check if we can merge
16453 them into ldp/stp. LOAD is true if they are load instructions.
16454 MODE is the mode of memory operands. */
16457 aarch64_operands_ok_for_ldpstp (rtx
*operands
, bool load
,
16460 HOST_WIDE_INT offval_1
, offval_2
, msize
;
16461 enum reg_class rclass_1
, rclass_2
;
16462 rtx mem_1
, mem_2
, reg_1
, reg_2
, base_1
, base_2
, offset_1
, offset_2
;
16466 mem_1
= operands
[1];
16467 mem_2
= operands
[3];
16468 reg_1
= operands
[0];
16469 reg_2
= operands
[2];
16470 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
));
16471 if (REGNO (reg_1
) == REGNO (reg_2
))
16476 mem_1
= operands
[0];
16477 mem_2
= operands
[2];
16478 reg_1
= operands
[1];
16479 reg_2
= operands
[3];
16482 /* The mems cannot be volatile. */
16483 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
))
16486 /* If we have SImode and slow unaligned ldp,
16487 check the alignment to be at least 8 byte. */
16489 && (aarch64_tune_params
.extra_tuning_flags
16490 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
16492 && MEM_ALIGN (mem_1
) < 8 * BITS_PER_UNIT
)
16495 /* Check if the addresses are in the form of [base+offset]. */
16496 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
16497 if (base_1
== NULL_RTX
|| offset_1
== NULL_RTX
)
16499 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
16500 if (base_2
== NULL_RTX
|| offset_2
== NULL_RTX
)
16503 /* Check if the bases are same. */
16504 if (!rtx_equal_p (base_1
, base_2
))
16507 offval_1
= INTVAL (offset_1
);
16508 offval_2
= INTVAL (offset_2
);
16509 /* We should only be trying this for fixed-sized modes. There is no
16510 SVE LDP/STP instruction. */
16511 msize
= GET_MODE_SIZE (mode
).to_constant ();
16512 /* Check if the offsets are consecutive. */
16513 if (offval_1
!= (offval_2
+ msize
) && offval_2
!= (offval_1
+ msize
))
16516 /* Check if the addresses are clobbered by load. */
16519 if (reg_mentioned_p (reg_1
, mem_1
))
16522 /* In increasing order, the last load can clobber the address. */
16523 if (offval_1
> offval_2
&& reg_mentioned_p (reg_2
, mem_2
))
16527 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
16528 rclass_1
= FP_REGS
;
16530 rclass_1
= GENERAL_REGS
;
16532 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
16533 rclass_2
= FP_REGS
;
16535 rclass_2
= GENERAL_REGS
;
16537 /* Check if the registers are of same class. */
16538 if (rclass_1
!= rclass_2
)
16544 /* Given OPERANDS of consecutive load/store, check if we can merge
16545 them into ldp/stp by adjusting the offset. LOAD is true if they
16546 are load instructions. MODE is the mode of memory operands.
16548 Given below consecutive stores:
16550 str w1, [xb, 0x100]
16551 str w1, [xb, 0x104]
16552 str w1, [xb, 0x108]
16553 str w1, [xb, 0x10c]
16555 Though the offsets are out of the range supported by stp, we can
16556 still pair them after adjusting the offset, like:
16558 add scratch, xb, 0x100
16559 stp w1, w1, [scratch]
16560 stp w1, w1, [scratch, 0x8]
16562 The peephole patterns detecting this opportunity should guarantee
16563 the scratch register is avaliable. */
16566 aarch64_operands_adjust_ok_for_ldpstp (rtx
*operands
, bool load
,
16569 enum reg_class rclass_1
, rclass_2
, rclass_3
, rclass_4
;
16570 HOST_WIDE_INT offval_1
, offval_2
, offval_3
, offval_4
, msize
;
16571 rtx mem_1
, mem_2
, mem_3
, mem_4
, reg_1
, reg_2
, reg_3
, reg_4
;
16572 rtx base_1
, base_2
, base_3
, base_4
, offset_1
, offset_2
, offset_3
, offset_4
;
16576 reg_1
= operands
[0];
16577 mem_1
= operands
[1];
16578 reg_2
= operands
[2];
16579 mem_2
= operands
[3];
16580 reg_3
= operands
[4];
16581 mem_3
= operands
[5];
16582 reg_4
= operands
[6];
16583 mem_4
= operands
[7];
16584 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
)
16585 && REG_P (reg_3
) && REG_P (reg_4
));
16586 if (REGNO (reg_1
) == REGNO (reg_2
) || REGNO (reg_3
) == REGNO (reg_4
))
16591 mem_1
= operands
[0];
16592 reg_1
= operands
[1];
16593 mem_2
= operands
[2];
16594 reg_2
= operands
[3];
16595 mem_3
= operands
[4];
16596 reg_3
= operands
[5];
16597 mem_4
= operands
[6];
16598 reg_4
= operands
[7];
16600 /* Skip if memory operand is by itslef valid for ldp/stp. */
16601 if (!MEM_P (mem_1
) || aarch64_mem_pair_operand (mem_1
, mode
))
16604 /* The mems cannot be volatile. */
16605 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
)
16606 || MEM_VOLATILE_P (mem_3
) ||MEM_VOLATILE_P (mem_4
))
16609 /* Check if the addresses are in the form of [base+offset]. */
16610 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
16611 if (base_1
== NULL_RTX
|| offset_1
== NULL_RTX
)
16613 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
16614 if (base_2
== NULL_RTX
|| offset_2
== NULL_RTX
)
16616 extract_base_offset_in_addr (mem_3
, &base_3
, &offset_3
);
16617 if (base_3
== NULL_RTX
|| offset_3
== NULL_RTX
)
16619 extract_base_offset_in_addr (mem_4
, &base_4
, &offset_4
);
16620 if (base_4
== NULL_RTX
|| offset_4
== NULL_RTX
)
16623 /* Check if the bases are same. */
16624 if (!rtx_equal_p (base_1
, base_2
)
16625 || !rtx_equal_p (base_2
, base_3
)
16626 || !rtx_equal_p (base_3
, base_4
))
16629 offval_1
= INTVAL (offset_1
);
16630 offval_2
= INTVAL (offset_2
);
16631 offval_3
= INTVAL (offset_3
);
16632 offval_4
= INTVAL (offset_4
);
16633 msize
= GET_MODE_SIZE (mode
);
16634 /* Check if the offsets are consecutive. */
16635 if ((offval_1
!= (offval_2
+ msize
)
16636 || offval_1
!= (offval_3
+ msize
* 2)
16637 || offval_1
!= (offval_4
+ msize
* 3))
16638 && (offval_4
!= (offval_3
+ msize
)
16639 || offval_4
!= (offval_2
+ msize
* 2)
16640 || offval_4
!= (offval_1
+ msize
* 3)))
16643 /* Check if the addresses are clobbered by load. */
16646 if (reg_mentioned_p (reg_1
, mem_1
)
16647 || reg_mentioned_p (reg_2
, mem_2
)
16648 || reg_mentioned_p (reg_3
, mem_3
))
16651 /* In increasing order, the last load can clobber the address. */
16652 if (offval_1
> offval_2
&& reg_mentioned_p (reg_4
, mem_4
))
16656 /* If we have SImode and slow unaligned ldp,
16657 check the alignment to be at least 8 byte. */
16659 && (aarch64_tune_params
.extra_tuning_flags
16660 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
16662 && MEM_ALIGN (mem_1
) < 8 * BITS_PER_UNIT
)
16665 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
16666 rclass_1
= FP_REGS
;
16668 rclass_1
= GENERAL_REGS
;
16670 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
16671 rclass_2
= FP_REGS
;
16673 rclass_2
= GENERAL_REGS
;
16675 if (REG_P (reg_3
) && FP_REGNUM_P (REGNO (reg_3
)))
16676 rclass_3
= FP_REGS
;
16678 rclass_3
= GENERAL_REGS
;
16680 if (REG_P (reg_4
) && FP_REGNUM_P (REGNO (reg_4
)))
16681 rclass_4
= FP_REGS
;
16683 rclass_4
= GENERAL_REGS
;
16685 /* Check if the registers are of same class. */
16686 if (rclass_1
!= rclass_2
|| rclass_2
!= rclass_3
|| rclass_3
!= rclass_4
)
16692 /* Given OPERANDS of consecutive load/store, this function pairs them
16693 into ldp/stp after adjusting the offset. It depends on the fact
16694 that addresses of load/store instructions are in increasing order.
16695 MODE is the mode of memory operands. CODE is the rtl operator
16696 which should be applied to all memory operands, it's SIGN_EXTEND,
16697 ZERO_EXTEND or UNKNOWN. */
16700 aarch64_gen_adjusted_ldpstp (rtx
*operands
, bool load
,
16701 scalar_mode mode
, RTX_CODE code
)
16703 rtx base
, offset
, t1
, t2
;
16704 rtx mem_1
, mem_2
, mem_3
, mem_4
;
16705 HOST_WIDE_INT off_val
, abs_off
, adj_off
, new_off
, stp_off_limit
, msize
;
16709 mem_1
= operands
[1];
16710 mem_2
= operands
[3];
16711 mem_3
= operands
[5];
16712 mem_4
= operands
[7];
16716 mem_1
= operands
[0];
16717 mem_2
= operands
[2];
16718 mem_3
= operands
[4];
16719 mem_4
= operands
[6];
16720 gcc_assert (code
== UNKNOWN
);
16723 extract_base_offset_in_addr (mem_1
, &base
, &offset
);
16724 gcc_assert (base
!= NULL_RTX
&& offset
!= NULL_RTX
);
16726 /* Adjust offset thus it can fit in ldp/stp instruction. */
16727 msize
= GET_MODE_SIZE (mode
);
16728 stp_off_limit
= msize
* 0x40;
16729 off_val
= INTVAL (offset
);
16730 abs_off
= (off_val
< 0) ? -off_val
: off_val
;
16731 new_off
= abs_off
% stp_off_limit
;
16732 adj_off
= abs_off
- new_off
;
16734 /* Further adjust to make sure all offsets are OK. */
16735 if ((new_off
+ msize
* 2) >= stp_off_limit
)
16737 adj_off
+= stp_off_limit
;
16738 new_off
-= stp_off_limit
;
16741 /* Make sure the adjustment can be done with ADD/SUB instructions. */
16742 if (adj_off
>= 0x1000)
16747 adj_off
= -adj_off
;
16748 new_off
= -new_off
;
16751 /* Create new memory references. */
16752 mem_1
= change_address (mem_1
, VOIDmode
,
16753 plus_constant (DImode
, operands
[8], new_off
));
16755 /* Check if the adjusted address is OK for ldp/stp. */
16756 if (!aarch64_mem_pair_operand (mem_1
, mode
))
16759 msize
= GET_MODE_SIZE (mode
);
16760 mem_2
= change_address (mem_2
, VOIDmode
,
16761 plus_constant (DImode
,
16764 mem_3
= change_address (mem_3
, VOIDmode
,
16765 plus_constant (DImode
,
16767 new_off
+ msize
* 2));
16768 mem_4
= change_address (mem_4
, VOIDmode
,
16769 plus_constant (DImode
,
16771 new_off
+ msize
* 3));
16773 if (code
== ZERO_EXTEND
)
16775 mem_1
= gen_rtx_ZERO_EXTEND (DImode
, mem_1
);
16776 mem_2
= gen_rtx_ZERO_EXTEND (DImode
, mem_2
);
16777 mem_3
= gen_rtx_ZERO_EXTEND (DImode
, mem_3
);
16778 mem_4
= gen_rtx_ZERO_EXTEND (DImode
, mem_4
);
16780 else if (code
== SIGN_EXTEND
)
16782 mem_1
= gen_rtx_SIGN_EXTEND (DImode
, mem_1
);
16783 mem_2
= gen_rtx_SIGN_EXTEND (DImode
, mem_2
);
16784 mem_3
= gen_rtx_SIGN_EXTEND (DImode
, mem_3
);
16785 mem_4
= gen_rtx_SIGN_EXTEND (DImode
, mem_4
);
16790 operands
[1] = mem_1
;
16791 operands
[3] = mem_2
;
16792 operands
[5] = mem_3
;
16793 operands
[7] = mem_4
;
16797 operands
[0] = mem_1
;
16798 operands
[2] = mem_2
;
16799 operands
[4] = mem_3
;
16800 operands
[6] = mem_4
;
16803 /* Emit adjusting instruction. */
16804 emit_insn (gen_rtx_SET (operands
[8], plus_constant (DImode
, base
, adj_off
)));
16805 /* Emit ldp/stp instructions. */
16806 t1
= gen_rtx_SET (operands
[0], operands
[1]);
16807 t2
= gen_rtx_SET (operands
[2], operands
[3]);
16808 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
16809 t1
= gen_rtx_SET (operands
[4], operands
[5]);
16810 t2
= gen_rtx_SET (operands
[6], operands
[7]);
16811 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
/* Return 1 if pseudo register should be created and used to hold
   GOT address for PIC code.  */

static bool
aarch64_use_pseudo_pic_reg (void)
{
  return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
}

/* Implement TARGET_UNSPEC_MAY_TRAP_P.  */

static int
aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
{
  switch (XINT (x, 1))
    {
    case UNSPEC_GOTSMALLPIC:
    case UNSPEC_GOTSMALLPIC28K:
    case UNSPEC_GOTTINYPIC:
      return 0;
    default:
      break;
    }

  return default_unspec_may_trap_p (x, flags);
}
/* If X is a positive CONST_DOUBLE with a value that is a power of 2
   return the log2 of that value.  Otherwise return -1.  */

int
aarch64_fpconst_pow_of_2 (rtx x)
{
  const REAL_VALUE_TYPE *r;

  if (!CONST_DOUBLE_P (x))
    return -1;

  r = CONST_DOUBLE_REAL_VALUE (x);

  if (REAL_VALUE_NEGATIVE (*r)
      || REAL_VALUE_ISNAN (*r)
      || REAL_VALUE_ISINF (*r)
      || !real_isinteger (r, DFmode))
    return -1;

  return exact_log2 (real_to_integer (r));
}

/* If X is a vector of equal CONST_DOUBLE values and that value is
   Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */

int
aarch64_vec_fpconst_pow_of_2 (rtx x)
{
  int nelts;
  if (GET_CODE (x) != CONST_VECTOR
      || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
    return -1;

  if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
    return -1;

  int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
  if (firstval <= 0)
    return -1;

  for (int i = 1; i < nelts; i++)
    if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
      return -1;

  return firstval;
}
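/* For example, a CONST_DOUBLE of 8.0 yields 3 and 1.0 yields 0, while 3.0
   or -4.0 yield -1; a V2DF constant { 16.0, 16.0 } yields 4 from the
   vector variant.  (Illustrative.)  */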
16890 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
16893 __fp16 always promotes through this hook.
16894 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
16895 through the generic excess precision logic rather than here. */
16898 aarch64_promoted_type (const_tree t
)
16900 if (SCALAR_FLOAT_TYPE_P (t
)
16901 && TYPE_MAIN_VARIANT (t
) == aarch64_fp16_type_node
)
16902 return float_type_node
;
16907 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
16910 aarch64_optab_supported_p (int op
, machine_mode mode1
, machine_mode
,
16911 optimization_type opt_type
)
16916 return opt_type
== OPTIMIZE_FOR_SPEED
&& use_rsqrt_p (mode1
);
16923 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
16925 static unsigned int
16926 aarch64_dwarf_poly_indeterminate_value (unsigned int i
, unsigned int *factor
,
16929 /* Polynomial invariant 1 == (VG / 2) - 1. */
16930 gcc_assert (i
== 1);
16933 return AARCH64_DWARF_VG
;
16936 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
16937 if MODE is HFmode, and punt to the generic implementation otherwise. */
16940 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode
)
16942 return (mode
== HFmode
16944 : default_libgcc_floating_mode_supported_p (mode
));
16947 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
16948 if MODE is HFmode, and punt to the generic implementation otherwise. */
16951 aarch64_scalar_mode_supported_p (scalar_mode mode
)
16953 return (mode
== HFmode
16955 : default_scalar_mode_supported_p (mode
));
16958 /* Set the value of FLT_EVAL_METHOD.
16959 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
16961 0: evaluate all operations and constants, whose semantic type has at
16962 most the range and precision of type float, to the range and
16963 precision of float; evaluate all other operations and constants to
16964 the range and precision of the semantic type;
16966 N, where _FloatN is a supported interchange floating type
16967 evaluate all operations and constants, whose semantic type has at
16968 most the range and precision of _FloatN type, to the range and
16969 precision of the _FloatN type; evaluate all other operations and
16970 constants to the range and precision of the semantic type;
16972 If we have the ARMv8.2-A extensions then we support _Float16 in native
16973 precision, so we should set this to 16. Otherwise, we support the type,
16974 but want to evaluate expressions in float precision, so set this to
16977 static enum flt_eval_method
16978 aarch64_excess_precision (enum excess_precision_type type
)
16982 case EXCESS_PRECISION_TYPE_FAST
:
16983 case EXCESS_PRECISION_TYPE_STANDARD
:
16984 /* We can calculate either in 16-bit range and precision or
16985 32-bit range and precision. Make that decision based on whether
16986 we have native support for the ARMv8.2-A 16-bit floating-point
16987 instructions or not. */
16988 return (TARGET_FP_F16INST
16989 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
16990 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
);
16991 case EXCESS_PRECISION_TYPE_IMPLICIT
:
16992 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
;
16994 gcc_unreachable ();
16996 return FLT_EVAL_METHOD_UNPREDICTABLE
;
16999 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
17000 scheduled for speculative execution. Reject the long-running division
17001 and square-root instructions. */
17004 aarch64_sched_can_speculate_insn (rtx_insn
*insn
)
17006 switch (get_attr_type (insn
))
17014 case TYPE_NEON_FP_SQRT_S
:
17015 case TYPE_NEON_FP_SQRT_D
:
17016 case TYPE_NEON_FP_SQRT_S_Q
:
17017 case TYPE_NEON_FP_SQRT_D_Q
:
17018 case TYPE_NEON_FP_DIV_S
:
17019 case TYPE_NEON_FP_DIV_D
:
17020 case TYPE_NEON_FP_DIV_S_Q
:
17021 case TYPE_NEON_FP_DIV_D_Q
:
17028 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
17031 aarch64_compute_pressure_classes (reg_class
*classes
)
17034 classes
[i
++] = GENERAL_REGS
;
17035 classes
[i
++] = FP_REGS
;
17036 /* PR_REGS isn't a useful pressure class because many predicate pseudo
17037 registers need to go in PR_LO_REGS at some point during their
17038 lifetime. Splitting it into two halves has the effect of making
17039 all predicates count against PR_LO_REGS, so that we try whenever
17040 possible to restrict the number of live predicates to 8. This
17041 greatly reduces the amount of spilling in certain loops. */
17042 classes
[i
++] = PR_LO_REGS
;
17043 classes
[i
++] = PR_HI_REGS
;
17047 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
17050 aarch64_can_change_mode_class (machine_mode from
,
17051 machine_mode to
, reg_class_t
)
17053 /* See the comment at the head of aarch64-sve.md for details. */
17054 if (BYTES_BIG_ENDIAN
17055 && (aarch64_sve_data_mode_p (from
) != aarch64_sve_data_mode_p (to
)))
17060 /* Target-specific selftests. */
17064 namespace selftest
{
17066 /* Selftest for the RTL loader.
17067 Verify that the RTL loader copes with a dump from
17068 print_rtx_function. This is essentially just a test that class
17069 function_reader can handle a real dump, but it also verifies
17070 that lookup_reg_by_dump_name correctly handles hard regs.
17071 The presence of hard reg names in the dump means that the test is
17072 target-specific, hence it is in this file. */
17075 aarch64_test_loading_full_dump ()
17077 rtl_dump_test
t (SELFTEST_LOCATION
, locate_file ("aarch64/times-two.rtl"));
17079 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun
->decl
)));
17081 rtx_insn
*insn_1
= get_insn_by_uid (1);
17082 ASSERT_EQ (NOTE
, GET_CODE (insn_1
));
17084 rtx_insn
*insn_15
= get_insn_by_uid (15);
17085 ASSERT_EQ (INSN
, GET_CODE (insn_15
));
17086 ASSERT_EQ (USE
, GET_CODE (PATTERN (insn_15
)));
17088 /* Verify crtl->return_rtx. */
17089 ASSERT_EQ (REG
, GET_CODE (crtl
->return_rtx
));
17090 ASSERT_EQ (0, REGNO (crtl
->return_rtx
));
17091 ASSERT_EQ (SImode
, GET_MODE (crtl
->return_rtx
));
17094 /* Run all target-specific selftests. */
17097 aarch64_run_selftests (void)
17099 aarch64_test_loading_full_dump ();
17102 } // namespace selftest
17104 #endif /* #if CHECKING_P */
17106 #undef TARGET_ADDRESS_COST
17107 #define TARGET_ADDRESS_COST aarch64_address_cost
17109 /* This hook will determines whether unnamed bitfields affect the alignment
17110 of the containing structure. The hook returns true if the structure
17111 should inherit the alignment requirements of an unnamed bitfield's
17113 #undef TARGET_ALIGN_ANON_BITFIELD
17114 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
17116 #undef TARGET_ASM_ALIGNED_DI_OP
17117 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
17119 #undef TARGET_ASM_ALIGNED_HI_OP
17120 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
17122 #undef TARGET_ASM_ALIGNED_SI_OP
17123 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
17125 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
17126 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
17127 hook_bool_const_tree_hwi_hwi_const_tree_true
17129 #undef TARGET_ASM_FILE_START
17130 #define TARGET_ASM_FILE_START aarch64_start_file
17132 #undef TARGET_ASM_OUTPUT_MI_THUNK
17133 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
17135 #undef TARGET_ASM_SELECT_RTX_SECTION
17136 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
17138 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
17139 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
17141 #undef TARGET_BUILD_BUILTIN_VA_LIST
17142 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
17144 #undef TARGET_CALLEE_COPIES
17145 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
17147 #undef TARGET_CAN_ELIMINATE
17148 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
17150 #undef TARGET_CAN_INLINE_P
17151 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
17153 #undef TARGET_CANNOT_FORCE_CONST_MEM
17154 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
17156 #undef TARGET_CASE_VALUES_THRESHOLD
17157 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
17159 #undef TARGET_CONDITIONAL_REGISTER_USAGE
17160 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
17162 /* Only the least significant bit is used for initialization guard
17164 #undef TARGET_CXX_GUARD_MASK_BIT
17165 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
17167 #undef TARGET_C_MODE_FOR_SUFFIX
17168 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
17170 #ifdef TARGET_BIG_ENDIAN_DEFAULT
17171 #undef TARGET_DEFAULT_TARGET_FLAGS
17172 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs

#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL aarch64_builtin_decl

#undef TARGET_BUILTIN_RECIPROCAL
#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal

#undef TARGET_C_EXCESS_PRECISION
#define TARGET_C_EXCESS_PRECISION aarch64_excess_precision

#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN aarch64_fold_builtin

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG aarch64_function_arg

#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance

#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary

#undef TARGET_FUNCTION_ARG_PADDING
#define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding

#undef TARGET_GET_RAW_RESULT_MODE
#define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
#undef TARGET_GET_RAW_ARG_MODE
#define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE aarch64_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS aarch64_init_builtins

#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
  aarch64_ira_change_pseudo_allocno_class

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p

#undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
#define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
  aarch64_legitimize_address_displacement

#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode

#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
  aarch64_libgcc_floating_mode_supported_p

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE aarch64_mangle_type

#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost

#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul

#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size

/* This target hook should return true if accesses to volatile bitfields
   should use the narrowest mode possible.  It should return false if these
   accesses should use the bitfield container type. */
#undef TARGET_NARROW_VOLATILE_BITFIELD
#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE aarch64_override_options

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
  aarch64_override_options_after_change

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE aarch64_option_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE aarch64_option_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT aarch64_option_print

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE aarch64_promoted_type

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
  aarch64_get_separate_components

#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
  aarch64_components_for_bb

#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
  aarch64_disqualify_components

#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
  aarch64_emit_prologue_components

#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
  aarch64_emit_epilogue_components

#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
  aarch64_set_handled_components

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
  aarch64_builtin_support_vector_misalignment

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  aarch64_autovectorize_vector_sizes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv

/* Section anchor support. */

#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned. */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095
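/* These bounds broadly match the AArch64 addressing modes: unscaled signed
   9-bit offsets cover [-256, 255], while the scaled unsigned 12-bit form
   covers [0, 4095] for single-byte accesses.  */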

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
  aarch64_vectorize_preferred_vector_alignment
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support. */

#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST \
  aarch64_vectorize_vec_perm_const

#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM
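/* Identifying CC_REGNUM as the single flags register lets the post-reload
   compare-elimination pass track it.  */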

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true
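/* Setting this to true means the address of an in-memory return value need
   not be copied back into a value register on return; the indirect-result
   register x8 is not preserved across calls.  */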

#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
  aarch64_dwarf_poly_indeterminate_value

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
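/* I.e. the mask 0x4, with only bit 2 set.  */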

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class

#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-aarch64.h"