1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
24 #define INCLUDE_STRING
26 #include "coretypes.h"
37 #include "stringpool.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
55 #include "langhooks.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
75 /* This file should be included last. */
76 #include "target-def.h"
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
81 /* Classifies an address.
84 A simple base register plus immediate offset.
87 A base register indexed by immediate offset with writeback.
90 A base register indexed by (optionally scaled) register.
93 A base register indexed by (optionally scaled) zero-extended register.
96 A base register indexed by (optionally scaled) sign-extended register.
99 A LO_SUM rtx with a base register and "LO12" symbol relocation.
102 A constant symbolic address, in pc-relative literal pool. */
104 enum aarch64_address_type
{
114 struct aarch64_address_info
{
115 enum aarch64_address_type type
;
118 poly_int64 const_offset
;
120 enum aarch64_symbol_type symbol_type
;
123 /* Information about a legitimate vector immediate operand. */
124 struct simd_immediate_info
126 enum insn_type
{ MOV
, MVN
};
127 enum modifier_type
{ LSL
, MSL
};
129 simd_immediate_info () {}
130 simd_immediate_info (scalar_float_mode
, rtx
);
131 simd_immediate_info (scalar_int_mode
, unsigned HOST_WIDE_INT
,
132 insn_type
= MOV
, modifier_type
= LSL
,
134 simd_immediate_info (scalar_mode
, rtx
, rtx
);
136 /* The mode of the elements. */
137 scalar_mode elt_mode
;
139 /* The value of each element if all elements are the same, or the
140 first value if the constant is a series. */
143 /* The value of the step if the constant is a series, null otherwise. */
146 /* The instruction to use to move the immediate into a vector. */
149 /* The kind of shift modifier to use, and the number of bits to shift.
150 This is (LSL, 0) if no shift is needed. */
151 modifier_type modifier
;
155 /* Construct a floating-point immediate in which each element has mode
156 ELT_MODE_IN and value VALUE_IN. */
157 inline simd_immediate_info
158 ::simd_immediate_info (scalar_float_mode elt_mode_in
, rtx value_in
)
159 : elt_mode (elt_mode_in
), value (value_in
), step (NULL_RTX
), insn (MOV
),
160 modifier (LSL
), shift (0)
163 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
164 and value VALUE_IN. The other parameters are as for the structure
166 inline simd_immediate_info
167 ::simd_immediate_info (scalar_int_mode elt_mode_in
,
168 unsigned HOST_WIDE_INT value_in
,
169 insn_type insn_in
, modifier_type modifier_in
,
170 unsigned int shift_in
)
171 : elt_mode (elt_mode_in
), value (gen_int_mode (value_in
, elt_mode_in
)),
172 step (NULL_RTX
), insn (insn_in
), modifier (modifier_in
), shift (shift_in
)
175 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
176 and where element I is equal to VALUE_IN + I * STEP_IN. */
177 inline simd_immediate_info
178 ::simd_immediate_info (scalar_mode elt_mode_in
, rtx value_in
, rtx step_in
)
179 : elt_mode (elt_mode_in
), value (value_in
), step (step_in
), insn (MOV
),
180 modifier (LSL
), shift (0)
183 /* The current code model. */
184 enum aarch64_code_model aarch64_cmodel
;
186 /* The number of 64-bit elements in an SVE vector. */
187 poly_uint16 aarch64_sve_vg
;
190 #undef TARGET_HAVE_TLS
191 #define TARGET_HAVE_TLS 1
194 static bool aarch64_composite_type_p (const_tree
, machine_mode
);
195 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode
,
197 machine_mode
*, int *,
199 static void aarch64_elf_asm_constructor (rtx
, int) ATTRIBUTE_UNUSED
;
200 static void aarch64_elf_asm_destructor (rtx
, int) ATTRIBUTE_UNUSED
;
201 static void aarch64_override_options_after_change (void);
202 static bool aarch64_vector_mode_supported_p (machine_mode
);
203 static int aarch64_address_cost (rtx
, machine_mode
, addr_space_t
, bool);
204 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode
,
208 static machine_mode
aarch64_simd_container_mode (scalar_mode
, poly_int64
);
209 static bool aarch64_print_ldpstp_address (FILE *, machine_mode
, rtx
);
211 /* Major revision number of the ARM Architecture implemented by the target. */
212 unsigned aarch64_architecture_version
;
214 /* The processor for which instructions should be scheduled. */
215 enum aarch64_processor aarch64_tune
= cortexa53
;
217 /* Mask to specify which instruction scheduling options should be used. */
218 unsigned long aarch64_tune_flags
= 0;
220 /* Global flag for PC relative loads. */
221 bool aarch64_pcrelative_literal_loads
;
223 /* Global flag for whether frame pointer is enabled. */
224 bool aarch64_use_frame_pointer
;
226 /* Support for command line parsing of boolean flags in the tuning
228 struct aarch64_flag_desc
234 #define AARCH64_FUSION_PAIR(name, internal_name) \
235 { name, AARCH64_FUSE_##internal_name },
236 static const struct aarch64_flag_desc aarch64_fusible_pairs
[] =
238 { "none", AARCH64_FUSE_NOTHING
},
239 #include "aarch64-fusion-pairs.def"
240 { "all", AARCH64_FUSE_ALL
},
241 { NULL
, AARCH64_FUSE_NOTHING
}
244 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
245 { name, AARCH64_EXTRA_TUNE_##internal_name },
246 static const struct aarch64_flag_desc aarch64_tuning_flags
[] =
248 { "none", AARCH64_EXTRA_TUNE_NONE
},
249 #include "aarch64-tuning-flags.def"
250 { "all", AARCH64_EXTRA_TUNE_ALL
},
251 { NULL
, AARCH64_EXTRA_TUNE_NONE
}
254 /* Tuning parameters. */
256 static const struct cpu_addrcost_table generic_addrcost_table
=
266 0, /* register_offset */
267 0, /* register_sextend */
268 0, /* register_zextend */
272 static const struct cpu_addrcost_table exynosm1_addrcost_table
=
282 1, /* register_offset */
283 1, /* register_sextend */
284 2, /* register_zextend */
288 static const struct cpu_addrcost_table xgene1_addrcost_table
=
298 0, /* register_offset */
299 1, /* register_sextend */
300 1, /* register_zextend */
304 static const struct cpu_addrcost_table thunderx2t99_addrcost_table
=
314 2, /* register_offset */
315 3, /* register_sextend */
316 3, /* register_zextend */
320 static const struct cpu_addrcost_table qdf24xx_addrcost_table
=
330 3, /* register_offset */
331 4, /* register_sextend */
332 3, /* register_zextend */
336 static const struct cpu_regmove_cost generic_regmove_cost
=
339 /* Avoid the use of slow int<->fp moves for spilling by setting
340 their cost higher than memmov_cost. */
346 static const struct cpu_regmove_cost cortexa57_regmove_cost
=
349 /* Avoid the use of slow int<->fp moves for spilling by setting
350 their cost higher than memmov_cost. */
356 static const struct cpu_regmove_cost cortexa53_regmove_cost
=
359 /* Avoid the use of slow int<->fp moves for spilling by setting
360 their cost higher than memmov_cost. */
366 static const struct cpu_regmove_cost exynosm1_regmove_cost
=
369 /* Avoid the use of slow int<->fp moves for spilling by setting
370 their cost higher than memmov_cost (actual, 4 and 9). */
376 static const struct cpu_regmove_cost thunderx_regmove_cost
=
384 static const struct cpu_regmove_cost xgene1_regmove_cost
=
387 /* Avoid the use of slow int<->fp moves for spilling by setting
388 their cost higher than memmov_cost. */
394 static const struct cpu_regmove_cost qdf24xx_regmove_cost
=
397 /* Avoid the use of int<->fp moves for spilling. */
403 static const struct cpu_regmove_cost thunderx2t99_regmove_cost
=
406 /* Avoid the use of int<->fp moves for spilling. */
412 /* Generic costs for vector insn classes. */
413 static const struct cpu_vector_cost generic_vector_cost
=
415 1, /* scalar_int_stmt_cost */
416 1, /* scalar_fp_stmt_cost */
417 1, /* scalar_load_cost */
418 1, /* scalar_store_cost */
419 1, /* vec_int_stmt_cost */
420 1, /* vec_fp_stmt_cost */
421 2, /* vec_permute_cost */
422 1, /* vec_to_scalar_cost */
423 1, /* scalar_to_vec_cost */
424 1, /* vec_align_load_cost */
425 1, /* vec_unalign_load_cost */
426 1, /* vec_unalign_store_cost */
427 1, /* vec_store_cost */
428 3, /* cond_taken_branch_cost */
429 1 /* cond_not_taken_branch_cost */
432 /* ThunderX costs for vector insn classes. */
433 static const struct cpu_vector_cost thunderx_vector_cost
=
435 1, /* scalar_int_stmt_cost */
436 1, /* scalar_fp_stmt_cost */
437 3, /* scalar_load_cost */
438 1, /* scalar_store_cost */
439 4, /* vec_int_stmt_cost */
440 1, /* vec_fp_stmt_cost */
441 4, /* vec_permute_cost */
442 2, /* vec_to_scalar_cost */
443 2, /* scalar_to_vec_cost */
444 3, /* vec_align_load_cost */
445 5, /* vec_unalign_load_cost */
446 5, /* vec_unalign_store_cost */
447 1, /* vec_store_cost */
448 3, /* cond_taken_branch_cost */
449 3 /* cond_not_taken_branch_cost */
452 /* Generic costs for vector insn classes. */
453 static const struct cpu_vector_cost cortexa57_vector_cost
=
455 1, /* scalar_int_stmt_cost */
456 1, /* scalar_fp_stmt_cost */
457 4, /* scalar_load_cost */
458 1, /* scalar_store_cost */
459 2, /* vec_int_stmt_cost */
460 2, /* vec_fp_stmt_cost */
461 3, /* vec_permute_cost */
462 8, /* vec_to_scalar_cost */
463 8, /* scalar_to_vec_cost */
464 4, /* vec_align_load_cost */
465 4, /* vec_unalign_load_cost */
466 1, /* vec_unalign_store_cost */
467 1, /* vec_store_cost */
468 1, /* cond_taken_branch_cost */
469 1 /* cond_not_taken_branch_cost */
472 static const struct cpu_vector_cost exynosm1_vector_cost
=
474 1, /* scalar_int_stmt_cost */
475 1, /* scalar_fp_stmt_cost */
476 5, /* scalar_load_cost */
477 1, /* scalar_store_cost */
478 3, /* vec_int_stmt_cost */
479 3, /* vec_fp_stmt_cost */
480 3, /* vec_permute_cost */
481 3, /* vec_to_scalar_cost */
482 3, /* scalar_to_vec_cost */
483 5, /* vec_align_load_cost */
484 5, /* vec_unalign_load_cost */
485 1, /* vec_unalign_store_cost */
486 1, /* vec_store_cost */
487 1, /* cond_taken_branch_cost */
488 1 /* cond_not_taken_branch_cost */
491 /* Generic costs for vector insn classes. */
492 static const struct cpu_vector_cost xgene1_vector_cost
=
494 1, /* scalar_int_stmt_cost */
495 1, /* scalar_fp_stmt_cost */
496 5, /* scalar_load_cost */
497 1, /* scalar_store_cost */
498 2, /* vec_int_stmt_cost */
499 2, /* vec_fp_stmt_cost */
500 2, /* vec_permute_cost */
501 4, /* vec_to_scalar_cost */
502 4, /* scalar_to_vec_cost */
503 10, /* vec_align_load_cost */
504 10, /* vec_unalign_load_cost */
505 2, /* vec_unalign_store_cost */
506 2, /* vec_store_cost */
507 2, /* cond_taken_branch_cost */
508 1 /* cond_not_taken_branch_cost */
511 /* Costs for vector insn classes for Vulcan. */
512 static const struct cpu_vector_cost thunderx2t99_vector_cost
=
514 1, /* scalar_int_stmt_cost */
515 6, /* scalar_fp_stmt_cost */
516 4, /* scalar_load_cost */
517 1, /* scalar_store_cost */
518 5, /* vec_int_stmt_cost */
519 6, /* vec_fp_stmt_cost */
520 3, /* vec_permute_cost */
521 6, /* vec_to_scalar_cost */
522 5, /* scalar_to_vec_cost */
523 8, /* vec_align_load_cost */
524 8, /* vec_unalign_load_cost */
525 4, /* vec_unalign_store_cost */
526 4, /* vec_store_cost */
527 2, /* cond_taken_branch_cost */
528 1 /* cond_not_taken_branch_cost */
531 /* Generic costs for branch instructions. */
532 static const struct cpu_branch_cost generic_branch_cost
=
534 1, /* Predictable. */
535 3 /* Unpredictable. */
538 /* Generic approximation modes. */
539 static const cpu_approx_modes generic_approx_modes
=
541 AARCH64_APPROX_NONE
, /* division */
542 AARCH64_APPROX_NONE
, /* sqrt */
543 AARCH64_APPROX_NONE
/* recip_sqrt */
546 /* Approximation modes for Exynos M1. */
547 static const cpu_approx_modes exynosm1_approx_modes
=
549 AARCH64_APPROX_NONE
, /* division */
550 AARCH64_APPROX_ALL
, /* sqrt */
551 AARCH64_APPROX_ALL
/* recip_sqrt */
554 /* Approximation modes for X-Gene 1. */
555 static const cpu_approx_modes xgene1_approx_modes
=
557 AARCH64_APPROX_NONE
, /* division */
558 AARCH64_APPROX_NONE
, /* sqrt */
559 AARCH64_APPROX_ALL
/* recip_sqrt */
562 /* Generic prefetch settings (which disable prefetch). */
563 static const cpu_prefetch_tune generic_prefetch_tune
=
566 -1, /* l1_cache_size */
567 -1, /* l1_cache_line_size */
568 -1, /* l2_cache_size */
569 true, /* prefetch_dynamic_strides */
570 -1, /* minimum_stride */
571 -1 /* default_opt_level */
574 static const cpu_prefetch_tune exynosm1_prefetch_tune
=
577 -1, /* l1_cache_size */
578 64, /* l1_cache_line_size */
579 -1, /* l2_cache_size */
580 true, /* prefetch_dynamic_strides */
581 -1, /* minimum_stride */
582 -1 /* default_opt_level */
585 static const cpu_prefetch_tune qdf24xx_prefetch_tune
=
588 32, /* l1_cache_size */
589 64, /* l1_cache_line_size */
590 512, /* l2_cache_size */
591 false, /* prefetch_dynamic_strides */
592 2048, /* minimum_stride */
593 3 /* default_opt_level */
596 static const cpu_prefetch_tune thunderxt88_prefetch_tune
=
599 32, /* l1_cache_size */
600 128, /* l1_cache_line_size */
601 16*1024, /* l2_cache_size */
602 true, /* prefetch_dynamic_strides */
603 -1, /* minimum_stride */
604 3 /* default_opt_level */
607 static const cpu_prefetch_tune thunderx_prefetch_tune
=
610 32, /* l1_cache_size */
611 128, /* l1_cache_line_size */
612 -1, /* l2_cache_size */
613 true, /* prefetch_dynamic_strides */
614 -1, /* minimum_stride */
615 -1 /* default_opt_level */
618 static const cpu_prefetch_tune thunderx2t99_prefetch_tune
=
621 32, /* l1_cache_size */
622 64, /* l1_cache_line_size */
623 256, /* l2_cache_size */
624 true, /* prefetch_dynamic_strides */
625 -1, /* minimum_stride */
626 -1 /* default_opt_level */
629 static const struct tune_params generic_tunings
=
631 &cortexa57_extra_costs
,
632 &generic_addrcost_table
,
633 &generic_regmove_cost
,
634 &generic_vector_cost
,
635 &generic_branch_cost
,
636 &generic_approx_modes
,
639 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
640 "8", /* function_align. */
641 "4", /* jump_align. */
642 "8", /* loop_align. */
643 2, /* int_reassoc_width. */
644 4, /* fp_reassoc_width. */
645 1, /* vec_reassoc_width. */
646 2, /* min_div_recip_mul_sf. */
647 2, /* min_div_recip_mul_df. */
648 0, /* max_case_values. */
649 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
650 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
651 &generic_prefetch_tune
654 static const struct tune_params cortexa35_tunings
=
656 &cortexa53_extra_costs
,
657 &generic_addrcost_table
,
658 &cortexa53_regmove_cost
,
659 &generic_vector_cost
,
660 &generic_branch_cost
,
661 &generic_approx_modes
,
664 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
665 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
666 "16", /* function_align. */
667 "4", /* jump_align. */
668 "8", /* loop_align. */
669 2, /* int_reassoc_width. */
670 4, /* fp_reassoc_width. */
671 1, /* vec_reassoc_width. */
672 2, /* min_div_recip_mul_sf. */
673 2, /* min_div_recip_mul_df. */
674 0, /* max_case_values. */
675 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
676 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
677 &generic_prefetch_tune
680 static const struct tune_params cortexa53_tunings
=
682 &cortexa53_extra_costs
,
683 &generic_addrcost_table
,
684 &cortexa53_regmove_cost
,
685 &generic_vector_cost
,
686 &generic_branch_cost
,
687 &generic_approx_modes
,
690 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
691 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
692 "16", /* function_align. */
693 "4", /* jump_align. */
694 "8", /* loop_align. */
695 2, /* int_reassoc_width. */
696 4, /* fp_reassoc_width. */
697 1, /* vec_reassoc_width. */
698 2, /* min_div_recip_mul_sf. */
699 2, /* min_div_recip_mul_df. */
700 0, /* max_case_values. */
701 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
702 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
703 &generic_prefetch_tune
706 static const struct tune_params cortexa57_tunings
=
708 &cortexa57_extra_costs
,
709 &generic_addrcost_table
,
710 &cortexa57_regmove_cost
,
711 &cortexa57_vector_cost
,
712 &generic_branch_cost
,
713 &generic_approx_modes
,
716 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
717 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
718 "16", /* function_align. */
719 "4", /* jump_align. */
720 "8", /* loop_align. */
721 2, /* int_reassoc_width. */
722 4, /* fp_reassoc_width. */
723 1, /* vec_reassoc_width. */
724 2, /* min_div_recip_mul_sf. */
725 2, /* min_div_recip_mul_df. */
726 0, /* max_case_values. */
727 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
728 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
), /* tune_flags. */
729 &generic_prefetch_tune
732 static const struct tune_params cortexa72_tunings
=
734 &cortexa57_extra_costs
,
735 &generic_addrcost_table
,
736 &cortexa57_regmove_cost
,
737 &cortexa57_vector_cost
,
738 &generic_branch_cost
,
739 &generic_approx_modes
,
742 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
743 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
744 "16", /* function_align. */
745 "4", /* jump_align. */
746 "8", /* loop_align. */
747 2, /* int_reassoc_width. */
748 4, /* fp_reassoc_width. */
749 1, /* vec_reassoc_width. */
750 2, /* min_div_recip_mul_sf. */
751 2, /* min_div_recip_mul_df. */
752 0, /* max_case_values. */
753 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
754 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
755 &generic_prefetch_tune
758 static const struct tune_params cortexa73_tunings
=
760 &cortexa57_extra_costs
,
761 &generic_addrcost_table
,
762 &cortexa57_regmove_cost
,
763 &cortexa57_vector_cost
,
764 &generic_branch_cost
,
765 &generic_approx_modes
,
766 4, /* memmov_cost. */
768 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
769 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
770 "16", /* function_align. */
771 "4", /* jump_align. */
772 "8", /* loop_align. */
773 2, /* int_reassoc_width. */
774 4, /* fp_reassoc_width. */
775 1, /* vec_reassoc_width. */
776 2, /* min_div_recip_mul_sf. */
777 2, /* min_div_recip_mul_df. */
778 0, /* max_case_values. */
779 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
780 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
781 &generic_prefetch_tune
786 static const struct tune_params exynosm1_tunings
=
788 &exynosm1_extra_costs
,
789 &exynosm1_addrcost_table
,
790 &exynosm1_regmove_cost
,
791 &exynosm1_vector_cost
,
792 &generic_branch_cost
,
793 &exynosm1_approx_modes
,
796 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
797 "4", /* function_align. */
798 "4", /* jump_align. */
799 "4", /* loop_align. */
800 2, /* int_reassoc_width. */
801 4, /* fp_reassoc_width. */
802 1, /* vec_reassoc_width. */
803 2, /* min_div_recip_mul_sf. */
804 2, /* min_div_recip_mul_df. */
805 48, /* max_case_values. */
806 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
807 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
808 &exynosm1_prefetch_tune
811 static const struct tune_params thunderxt88_tunings
=
813 &thunderx_extra_costs
,
814 &generic_addrcost_table
,
815 &thunderx_regmove_cost
,
816 &thunderx_vector_cost
,
817 &generic_branch_cost
,
818 &generic_approx_modes
,
821 AARCH64_FUSE_CMP_BRANCH
, /* fusible_ops */
822 "8", /* function_align. */
823 "8", /* jump_align. */
824 "8", /* loop_align. */
825 2, /* int_reassoc_width. */
826 4, /* fp_reassoc_width. */
827 1, /* vec_reassoc_width. */
828 2, /* min_div_recip_mul_sf. */
829 2, /* min_div_recip_mul_df. */
830 0, /* max_case_values. */
831 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
832 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
), /* tune_flags. */
833 &thunderxt88_prefetch_tune
836 static const struct tune_params thunderx_tunings
=
838 &thunderx_extra_costs
,
839 &generic_addrcost_table
,
840 &thunderx_regmove_cost
,
841 &thunderx_vector_cost
,
842 &generic_branch_cost
,
843 &generic_approx_modes
,
846 AARCH64_FUSE_CMP_BRANCH
, /* fusible_ops */
847 "8", /* function_align. */
848 "8", /* jump_align. */
849 "8", /* loop_align. */
850 2, /* int_reassoc_width. */
851 4, /* fp_reassoc_width. */
852 1, /* vec_reassoc_width. */
853 2, /* min_div_recip_mul_sf. */
854 2, /* min_div_recip_mul_df. */
855 0, /* max_case_values. */
856 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
857 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
858 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
), /* tune_flags. */
859 &thunderx_prefetch_tune
862 static const struct tune_params xgene1_tunings
=
865 &xgene1_addrcost_table
,
866 &xgene1_regmove_cost
,
868 &generic_branch_cost
,
869 &xgene1_approx_modes
,
872 AARCH64_FUSE_NOTHING
, /* fusible_ops */
873 "16", /* function_align. */
874 "8", /* jump_align. */
875 "16", /* loop_align. */
876 2, /* int_reassoc_width. */
877 4, /* fp_reassoc_width. */
878 1, /* vec_reassoc_width. */
879 2, /* min_div_recip_mul_sf. */
880 2, /* min_div_recip_mul_df. */
881 0, /* max_case_values. */
882 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
883 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
), /* tune_flags. */
884 &generic_prefetch_tune
887 static const struct tune_params qdf24xx_tunings
=
889 &qdf24xx_extra_costs
,
890 &qdf24xx_addrcost_table
,
891 &qdf24xx_regmove_cost
,
892 &generic_vector_cost
,
893 &generic_branch_cost
,
894 &generic_approx_modes
,
897 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
898 | AARCH64_FUSE_MOVK_MOVK
), /* fuseable_ops */
899 "16", /* function_align. */
900 "8", /* jump_align. */
901 "16", /* loop_align. */
902 2, /* int_reassoc_width. */
903 4, /* fp_reassoc_width. */
904 1, /* vec_reassoc_width. */
905 2, /* min_div_recip_mul_sf. */
906 2, /* min_div_recip_mul_df. */
907 0, /* max_case_values. */
908 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
909 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
910 &qdf24xx_prefetch_tune
913 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
915 static const struct tune_params saphira_tunings
=
917 &generic_extra_costs
,
918 &generic_addrcost_table
,
919 &generic_regmove_cost
,
920 &generic_vector_cost
,
921 &generic_branch_cost
,
922 &generic_approx_modes
,
925 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
926 | AARCH64_FUSE_MOVK_MOVK
), /* fuseable_ops */
927 "16", /* function_align. */
928 "8", /* jump_align. */
929 "16", /* loop_align. */
930 2, /* int_reassoc_width. */
931 4, /* fp_reassoc_width. */
932 1, /* vec_reassoc_width. */
933 2, /* min_div_recip_mul_sf. */
934 2, /* min_div_recip_mul_df. */
935 0, /* max_case_values. */
936 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
937 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
938 &generic_prefetch_tune
941 static const struct tune_params thunderx2t99_tunings
=
943 &thunderx2t99_extra_costs
,
944 &thunderx2t99_addrcost_table
,
945 &thunderx2t99_regmove_cost
,
946 &thunderx2t99_vector_cost
,
947 &generic_branch_cost
,
948 &generic_approx_modes
,
949 4, /* memmov_cost. */
951 (AARCH64_FUSE_CMP_BRANCH
| AARCH64_FUSE_AES_AESMC
952 | AARCH64_FUSE_ALU_BRANCH
), /* fusible_ops */
953 "16", /* function_align. */
954 "8", /* jump_align. */
955 "16", /* loop_align. */
956 3, /* int_reassoc_width. */
957 2, /* fp_reassoc_width. */
958 2, /* vec_reassoc_width. */
959 2, /* min_div_recip_mul_sf. */
960 2, /* min_div_recip_mul_df. */
961 0, /* max_case_values. */
962 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
963 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
964 &thunderx2t99_prefetch_tune
967 /* Support for fine-grained override of the tuning structures. */
968 struct aarch64_tuning_override_function
971 void (*parse_override
)(const char*, struct tune_params
*);
974 static void aarch64_parse_fuse_string (const char*, struct tune_params
*);
975 static void aarch64_parse_tune_string (const char*, struct tune_params
*);
977 static const struct aarch64_tuning_override_function
978 aarch64_tuning_override_functions
[] =
980 { "fuse", aarch64_parse_fuse_string
},
981 { "tune", aarch64_parse_tune_string
},
985 /* A processor implementing AArch64. */
988 const char *const name
;
989 enum aarch64_processor ident
;
990 enum aarch64_processor sched_core
;
991 enum aarch64_arch arch
;
992 unsigned architecture_version
;
993 const unsigned long flags
;
994 const struct tune_params
*const tune
;
997 /* Architectures implementing AArch64. */
998 static const struct processor all_architectures
[] =
1000 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1001 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1002 #include "aarch64-arches.def"
1003 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
1006 /* Processor cores implementing AArch64. */
1007 static const struct processor all_cores
[] =
1009 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1010 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1011 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1012 FLAGS, &COSTS##_tunings},
1013 #include "aarch64-cores.def"
1014 {"generic", generic
, cortexa53
, AARCH64_ARCH_8A
, 8,
1015 AARCH64_FL_FOR_ARCH8
, &generic_tunings
},
1016 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
1020 /* Target specification. These are populated by the -march, -mtune, -mcpu
1021 handling code or by target attributes. */
1022 static const struct processor
*selected_arch
;
1023 static const struct processor
*selected_cpu
;
1024 static const struct processor
*selected_tune
;
1026 /* The current tuning set. */
1027 struct tune_params aarch64_tune_params
= generic_tunings
;
1029 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1031 /* An ISA extension in the co-processor and main instruction set space. */
1032 struct aarch64_option_extension
1034 const char *const name
;
1035 const unsigned long flags_on
;
1036 const unsigned long flags_off
;
1039 typedef enum aarch64_cond_code
1041 AARCH64_EQ
= 0, AARCH64_NE
, AARCH64_CS
, AARCH64_CC
, AARCH64_MI
, AARCH64_PL
,
1042 AARCH64_VS
, AARCH64_VC
, AARCH64_HI
, AARCH64_LS
, AARCH64_GE
, AARCH64_LT
,
1043 AARCH64_GT
, AARCH64_LE
, AARCH64_AL
, AARCH64_NV
1047 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1049 /* The condition codes of the processor, and the inverse function. */
1050 static const char * const aarch64_condition_codes
[] =
1052 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1053 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1056 /* Generate code to enable conditional branches in functions over 1 MiB. */
1058 aarch64_gen_far_branch (rtx
* operands
, int pos_label
, const char * dest
,
1059 const char * branch_format
)
1061 rtx_code_label
* tmp_label
= gen_label_rtx ();
1062 char label_buf
[256];
1064 ASM_GENERATE_INTERNAL_LABEL (label_buf
, dest
,
1065 CODE_LABEL_NUMBER (tmp_label
));
1066 const char *label_ptr
= targetm
.strip_name_encoding (label_buf
);
1067 rtx dest_label
= operands
[pos_label
];
1068 operands
[pos_label
] = tmp_label
;
1070 snprintf (buffer
, sizeof (buffer
), "%s%s", branch_format
, label_ptr
);
1071 output_asm_insn (buffer
, operands
);
1073 snprintf (buffer
, sizeof (buffer
), "b\t%%l%d\n%s:", pos_label
, label_ptr
);
1074 operands
[pos_label
] = dest_label
;
1075 output_asm_insn (buffer
, operands
);
1080 aarch64_err_no_fpadvsimd (machine_mode mode
)
1082 if (TARGET_GENERAL_REGS_ONLY
)
1083 if (FLOAT_MODE_P (mode
))
1084 error ("%qs is incompatible with the use of floating-point types",
1085 "-mgeneral-regs-only");
1087 error ("%qs is incompatible with the use of vector types",
1088 "-mgeneral-regs-only");
1090 if (FLOAT_MODE_P (mode
))
1091 error ("%qs feature modifier is incompatible with the use of"
1092 " floating-point types", "+nofp");
1094 error ("%qs feature modifier is incompatible with the use of"
1095 " vector types", "+nofp");
1098 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1099 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1100 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1101 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1102 and GENERAL_REGS is lower than the memory cost (in this case the best class
1103 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its
1104 cost results in bad allocations with many redundant int<->FP moves which
1105 are expensive on various cores.
1106 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1107 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1108 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1109 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1110 The result of this is that it is no longer inefficient to have a higher
1111 memory move cost than the register move cost.
1115 aarch64_ira_change_pseudo_allocno_class (int regno
, reg_class_t allocno_class
,
1116 reg_class_t best_class
)
1120 if (!reg_class_subset_p (GENERAL_REGS
, allocno_class
)
1121 || !reg_class_subset_p (FP_REGS
, allocno_class
))
1122 return allocno_class
;
1124 if (!reg_class_subset_p (GENERAL_REGS
, best_class
)
1125 || !reg_class_subset_p (FP_REGS
, best_class
))
1128 mode
= PSEUDO_REGNO_MODE (regno
);
1129 return FLOAT_MODE_P (mode
) || VECTOR_MODE_P (mode
) ? FP_REGS
: GENERAL_REGS
;
1133 aarch64_min_divisions_for_recip_mul (machine_mode mode
)
1135 if (GET_MODE_UNIT_SIZE (mode
) == 4)
1136 return aarch64_tune_params
.min_div_recip_mul_sf
;
1137 return aarch64_tune_params
.min_div_recip_mul_df
;
1140 /* Return the reassociation width of treeop OPC with mode MODE. */
1142 aarch64_reassociation_width (unsigned opc
, machine_mode mode
)
1144 if (VECTOR_MODE_P (mode
))
1145 return aarch64_tune_params
.vec_reassoc_width
;
1146 if (INTEGRAL_MODE_P (mode
))
1147 return aarch64_tune_params
.int_reassoc_width
;
1148 /* Avoid reassociating floating point addition so we emit more FMAs. */
1149 if (FLOAT_MODE_P (mode
) && opc
!= PLUS_EXPR
)
1150 return aarch64_tune_params
.fp_reassoc_width
;
1154 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1156 aarch64_dbx_register_number (unsigned regno
)
1158 if (GP_REGNUM_P (regno
))
1159 return AARCH64_DWARF_R0
+ regno
- R0_REGNUM
;
1160 else if (regno
== SP_REGNUM
)
1161 return AARCH64_DWARF_SP
;
1162 else if (FP_REGNUM_P (regno
))
1163 return AARCH64_DWARF_V0
+ regno
- V0_REGNUM
;
1164 else if (PR_REGNUM_P (regno
))
1165 return AARCH64_DWARF_P0
+ regno
- P0_REGNUM
;
1166 else if (regno
== VG_REGNUM
)
1167 return AARCH64_DWARF_VG
;
1169 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1170 equivalent DWARF register. */
1171 return DWARF_FRAME_REGISTERS
;
1174 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1176 aarch64_advsimd_struct_mode_p (machine_mode mode
)
1179 && (mode
== OImode
|| mode
== CImode
|| mode
== XImode
));
1182 /* Return true if MODE is an SVE predicate mode. */
1184 aarch64_sve_pred_mode_p (machine_mode mode
)
1187 && (mode
== VNx16BImode
1188 || mode
== VNx8BImode
1189 || mode
== VNx4BImode
1190 || mode
== VNx2BImode
));
1193 /* Three mutually-exclusive flags describing a vector or predicate type. */
1194 const unsigned int VEC_ADVSIMD
= 1;
1195 const unsigned int VEC_SVE_DATA
= 2;
1196 const unsigned int VEC_SVE_PRED
= 4;
1197 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1198 a structure of 2, 3 or 4 vectors. */
1199 const unsigned int VEC_STRUCT
= 8;
1200 /* Useful combinations of the above. */
1201 const unsigned int VEC_ANY_SVE
= VEC_SVE_DATA
| VEC_SVE_PRED
;
1202 const unsigned int VEC_ANY_DATA
= VEC_ADVSIMD
| VEC_SVE_DATA
;
1204 /* Return a set of flags describing the vector properties of mode MODE.
1205 Ignore modes that are not supported by the current target. */
1207 aarch64_classify_vector_mode (machine_mode mode
)
1209 if (aarch64_advsimd_struct_mode_p (mode
))
1210 return VEC_ADVSIMD
| VEC_STRUCT
;
1212 if (aarch64_sve_pred_mode_p (mode
))
1213 return VEC_SVE_PRED
;
1215 scalar_mode inner
= GET_MODE_INNER (mode
);
1216 if (VECTOR_MODE_P (mode
)
1223 || inner
== DFmode
))
1227 if (known_eq (GET_MODE_BITSIZE (mode
), BITS_PER_SVE_VECTOR
))
1228 return VEC_SVE_DATA
;
1229 if (known_eq (GET_MODE_BITSIZE (mode
), BITS_PER_SVE_VECTOR
* 2)
1230 || known_eq (GET_MODE_BITSIZE (mode
), BITS_PER_SVE_VECTOR
* 3)
1231 || known_eq (GET_MODE_BITSIZE (mode
), BITS_PER_SVE_VECTOR
* 4))
1232 return VEC_SVE_DATA
| VEC_STRUCT
;
1235 /* This includes V1DF but not V1DI (which doesn't exist). */
1237 && (known_eq (GET_MODE_BITSIZE (mode
), 64)
1238 || known_eq (GET_MODE_BITSIZE (mode
), 128)))
1245 /* Return true if MODE is any of the data vector modes, including
1248 aarch64_vector_data_mode_p (machine_mode mode
)
1250 return aarch64_classify_vector_mode (mode
) & VEC_ANY_DATA
;
1253 /* Return true if MODE is an SVE data vector mode; either a single vector
1254 or a structure of vectors. */
1256 aarch64_sve_data_mode_p (machine_mode mode
)
1258 return aarch64_classify_vector_mode (mode
) & VEC_SVE_DATA
;
1261 /* Implement target hook TARGET_ARRAY_MODE. */
1262 static opt_machine_mode
1263 aarch64_array_mode (machine_mode mode
, unsigned HOST_WIDE_INT nelems
)
1265 if (aarch64_classify_vector_mode (mode
) == VEC_SVE_DATA
1266 && IN_RANGE (nelems
, 2, 4))
1267 return mode_for_vector (GET_MODE_INNER (mode
),
1268 GET_MODE_NUNITS (mode
) * nelems
);
1270 return opt_machine_mode ();
1273 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1275 aarch64_array_mode_supported_p (machine_mode mode
,
1276 unsigned HOST_WIDE_INT nelems
)
1279 && (AARCH64_VALID_SIMD_QREG_MODE (mode
)
1280 || AARCH64_VALID_SIMD_DREG_MODE (mode
))
1281 && (nelems
>= 2 && nelems
<= 4))
1287 /* Return the SVE predicate mode to use for elements that have
1288 ELEM_NBYTES bytes, if such a mode exists. */
1291 aarch64_sve_pred_mode (unsigned int elem_nbytes
)
1295 if (elem_nbytes
== 1)
1297 if (elem_nbytes
== 2)
1299 if (elem_nbytes
== 4)
1301 if (elem_nbytes
== 8)
1304 return opt_machine_mode ();
1307 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1309 static opt_machine_mode
1310 aarch64_get_mask_mode (poly_uint64 nunits
, poly_uint64 nbytes
)
1312 if (TARGET_SVE
&& known_eq (nbytes
, BYTES_PER_SVE_VECTOR
))
1314 unsigned int elem_nbytes
= vector_element_size (nbytes
, nunits
);
1315 machine_mode pred_mode
;
1316 if (aarch64_sve_pred_mode (elem_nbytes
).exists (&pred_mode
))
1320 return default_get_mask_mode (nunits
, nbytes
);
1323 /* Implement TARGET_HARD_REGNO_NREGS. */
1326 aarch64_hard_regno_nregs (unsigned regno
, machine_mode mode
)
1328 /* ??? Logically we should only need to provide a value when
1329 HARD_REGNO_MODE_OK says that the combination is valid,
1330 but at the moment we need to handle all modes. Just ignore
1331 any runtime parts for registers that can't store them. */
1332 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
1333 switch (aarch64_regno_regclass (regno
))
1337 if (aarch64_sve_data_mode_p (mode
))
1338 return exact_div (GET_MODE_SIZE (mode
),
1339 BYTES_PER_SVE_VECTOR
).to_constant ();
1340 return CEIL (lowest_size
, UNITS_PER_VREG
);
1346 return CEIL (lowest_size
, UNITS_PER_WORD
);
1351 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1354 aarch64_hard_regno_mode_ok (unsigned regno
, machine_mode mode
)
1356 if (GET_MODE_CLASS (mode
) == MODE_CC
)
1357 return regno
== CC_REGNUM
;
1359 if (regno
== VG_REGNUM
)
1360 /* This must have the same size as _Unwind_Word. */
1361 return mode
== DImode
;
1363 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1364 if (vec_flags
& VEC_SVE_PRED
)
1365 return PR_REGNUM_P (regno
);
1367 if (PR_REGNUM_P (regno
))
1370 if (regno
== SP_REGNUM
)
1371 /* The purpose of comparing with ptr_mode is to support the
1372 global register variable associated with the stack pointer
1373 register via the syntax of asm ("wsp") in ILP32. */
1374 return mode
== Pmode
|| mode
== ptr_mode
;
1376 if (regno
== FRAME_POINTER_REGNUM
|| regno
== ARG_POINTER_REGNUM
)
1377 return mode
== Pmode
;
1379 if (GP_REGNUM_P (regno
) && known_le (GET_MODE_SIZE (mode
), 16))
1382 if (FP_REGNUM_P (regno
))
1384 if (vec_flags
& VEC_STRUCT
)
1385 return end_hard_regno (mode
, regno
) - 1 <= V31_REGNUM
;
1387 return !VECTOR_MODE_P (mode
) || vec_flags
!= 0;
1393 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1394 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1395 clobbers the top 64 bits when restoring the bottom 64 bits. */
1398 aarch64_hard_regno_call_part_clobbered (unsigned int regno
, machine_mode mode
)
1400 return FP_REGNUM_P (regno
) && maybe_gt (GET_MODE_SIZE (mode
), 8);
1403 /* Implement REGMODE_NATURAL_SIZE. */
1405 aarch64_regmode_natural_size (machine_mode mode
)
1407 /* The natural size for SVE data modes is one SVE data vector,
1408 and similarly for predicates. We can't independently modify
1409 anything smaller than that. */
1410 /* ??? For now, only do this for variable-width SVE registers.
1411 Doing it for constant-sized registers breaks lower-subreg.c. */
1412 /* ??? And once that's fixed, we should probably have similar
1413 code for Advanced SIMD. */
1414 if (!aarch64_sve_vg
.is_constant ())
1416 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1417 if (vec_flags
& VEC_SVE_PRED
)
1418 return BYTES_PER_SVE_PRED
;
1419 if (vec_flags
& VEC_SVE_DATA
)
1420 return BYTES_PER_SVE_VECTOR
;
1422 return UNITS_PER_WORD
;
1425 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1427 aarch64_hard_regno_caller_save_mode (unsigned regno
, unsigned,
1430 /* The predicate mode determines which bits are significant and
1431 which are "don't care". Decreasing the number of lanes would
1432 lose data while increasing the number of lanes would make bits
1433 unnecessarily significant. */
1434 if (PR_REGNUM_P (regno
))
1436 if (known_ge (GET_MODE_SIZE (mode
), 4))
1442 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1443 that strcpy from constants will be faster. */
1445 static HOST_WIDE_INT
1446 aarch64_constant_alignment (const_tree exp
, HOST_WIDE_INT align
)
1448 if (TREE_CODE (exp
) == STRING_CST
&& !optimize_size
)
1449 return MAX (align
, BITS_PER_WORD
);
1453 /* Return true if calls to DECL should be treated as
1454 long-calls (ie called via a register). */
1456 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED
)
1461 /* Return true if calls to symbol-ref SYM should be treated as
1462 long-calls (ie called via a register). */
1464 aarch64_is_long_call_p (rtx sym
)
1466 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym
));
1469 /* Return true if calls to symbol-ref SYM should not go through
1473 aarch64_is_noplt_call_p (rtx sym
)
1475 const_tree decl
= SYMBOL_REF_DECL (sym
);
1480 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl
)))
1481 && !targetm
.binds_local_p (decl
))
1487 /* Return true if the offsets to a zero/sign-extract operation
1488 represent an expression that matches an extend operation. The
1489 operands represent the paramters from
1491 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1493 aarch64_is_extend_from_extract (scalar_int_mode mode
, rtx mult_imm
,
1496 HOST_WIDE_INT mult_val
, extract_val
;
1498 if (! CONST_INT_P (mult_imm
) || ! CONST_INT_P (extract_imm
))
1501 mult_val
= INTVAL (mult_imm
);
1502 extract_val
= INTVAL (extract_imm
);
1505 && extract_val
< GET_MODE_BITSIZE (mode
)
1506 && exact_log2 (extract_val
& ~7) > 0
1507 && (extract_val
& 7) <= 4
1508 && mult_val
== (1 << (extract_val
& 7)))
1514 /* Emit an insn that's a simple single-set. Both the operands must be
1515 known to be valid. */
1516 inline static rtx_insn
*
1517 emit_set_insn (rtx x
, rtx y
)
1519 return emit_insn (gen_rtx_SET (x
, y
));
1522 /* X and Y are two things to compare using CODE. Emit the compare insn and
1523 return the rtx for register 0 in the proper mode. */
1525 aarch64_gen_compare_reg (RTX_CODE code
, rtx x
, rtx y
)
1527 machine_mode mode
= SELECT_CC_MODE (code
, x
, y
);
1528 rtx cc_reg
= gen_rtx_REG (mode
, CC_REGNUM
);
1530 emit_set_insn (cc_reg
, gen_rtx_COMPARE (mode
, x
, y
));
1534 /* Build the SYMBOL_REF for __tls_get_addr. */
1536 static GTY(()) rtx tls_get_addr_libfunc
;
1539 aarch64_tls_get_addr (void)
1541 if (!tls_get_addr_libfunc
)
1542 tls_get_addr_libfunc
= init_one_libfunc ("__tls_get_addr");
1543 return tls_get_addr_libfunc
;
1546 /* Return the TLS model to use for ADDR. */
1548 static enum tls_model
1549 tls_symbolic_operand_type (rtx addr
)
1551 enum tls_model tls_kind
= TLS_MODEL_NONE
;
1552 if (GET_CODE (addr
) == CONST
)
1555 rtx sym
= strip_offset (addr
, &addend
);
1556 if (GET_CODE (sym
) == SYMBOL_REF
)
1557 tls_kind
= SYMBOL_REF_TLS_MODEL (sym
);
1559 else if (GET_CODE (addr
) == SYMBOL_REF
)
1560 tls_kind
= SYMBOL_REF_TLS_MODEL (addr
);
1565 /* We'll allow lo_sum's in addresses in our legitimate addresses
1566 so that combine would take care of combining addresses where
1567 necessary, but for generation purposes, we'll generate the address
1570 tmp = hi (symbol_ref); adrp x1, foo
1571 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1575 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1576 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1580 Load TLS symbol, depending on TLS mechanism and TLS access model.
1582 Global Dynamic - Traditional TLS:
1583 adrp tmp, :tlsgd:imm
1584 add dest, tmp, #:tlsgd_lo12:imm
1587 Global Dynamic - TLS Descriptors:
1588 adrp dest, :tlsdesc:imm
1589 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1590 add dest, dest, #:tlsdesc_lo12:imm
1597 adrp tmp, :gottprel:imm
1598 ldr dest, [tmp, #:gottprel_lo12:imm]
1603 add t0, tp, #:tprel_hi12:imm, lsl #12
1604 add t0, t0, #:tprel_lo12_nc:imm
1608 aarch64_load_symref_appropriately (rtx dest
, rtx imm
,
1609 enum aarch64_symbol_type type
)
1613 case SYMBOL_SMALL_ABSOLUTE
:
1615 /* In ILP32, the mode of dest can be either SImode or DImode. */
1617 machine_mode mode
= GET_MODE (dest
);
1619 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
1621 if (can_create_pseudo_p ())
1622 tmp_reg
= gen_reg_rtx (mode
);
1624 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
1625 emit_insn (gen_add_losym (dest
, tmp_reg
, imm
));
1629 case SYMBOL_TINY_ABSOLUTE
:
1630 emit_insn (gen_rtx_SET (dest
, imm
));
1633 case SYMBOL_SMALL_GOT_28K
:
1635 machine_mode mode
= GET_MODE (dest
);
1636 rtx gp_rtx
= pic_offset_table_rtx
;
1640 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1641 here before rtl expand. Tree IVOPT will generate rtl pattern to
1642 decide rtx costs, in which case pic_offset_table_rtx is not
1643 initialized. For that case no need to generate the first adrp
1644 instruction as the final cost for global variable access is
1648 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1649 using the page base as GOT base, the first page may be wasted,
1650 in the worst scenario, there is only 28K space for GOT).
1652 The generate instruction sequence for accessing global variable
1655 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1657 Only one instruction needed. But we must initialize
1658 pic_offset_table_rtx properly. We generate initialize insn for
1659 every global access, and allow CSE to remove all redundant.
1661 The final instruction sequences will look like the following
1662 for multiply global variables access.
1664 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1666 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1667 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1668 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1671 rtx s
= gen_rtx_SYMBOL_REF (Pmode
, "_GLOBAL_OFFSET_TABLE_");
1672 crtl
->uses_pic_offset_table
= 1;
1673 emit_move_insn (gp_rtx
, gen_rtx_HIGH (Pmode
, s
));
1675 if (mode
!= GET_MODE (gp_rtx
))
1676 gp_rtx
= gen_lowpart (mode
, gp_rtx
);
1680 if (mode
== ptr_mode
)
1683 insn
= gen_ldr_got_small_28k_di (dest
, gp_rtx
, imm
);
1685 insn
= gen_ldr_got_small_28k_si (dest
, gp_rtx
, imm
);
1687 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
1691 gcc_assert (mode
== Pmode
);
1693 insn
= gen_ldr_got_small_28k_sidi (dest
, gp_rtx
, imm
);
1694 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
1697 /* The operand is expected to be MEM. Whenever the related insn
1698 pattern changed, above code which calculate mem should be
1700 gcc_assert (GET_CODE (mem
) == MEM
);
1701 MEM_READONLY_P (mem
) = 1;
1702 MEM_NOTRAP_P (mem
) = 1;
1707 case SYMBOL_SMALL_GOT_4G
:
1709 /* In ILP32, the mode of dest can be either SImode or DImode,
1710 while the got entry is always of SImode size. The mode of
1711 dest depends on how dest is used: if dest is assigned to a
1712 pointer (e.g. in the memory), it has SImode; it may have
1713 DImode if dest is dereferenced to access the memeory.
1714 This is why we have to handle three different ldr_got_small
1715 patterns here (two patterns for ILP32). */
1720 machine_mode mode
= GET_MODE (dest
);
1722 if (can_create_pseudo_p ())
1723 tmp_reg
= gen_reg_rtx (mode
);
1725 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
1726 if (mode
== ptr_mode
)
1729 insn
= gen_ldr_got_small_di (dest
, tmp_reg
, imm
);
1731 insn
= gen_ldr_got_small_si (dest
, tmp_reg
, imm
);
1733 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
1737 gcc_assert (mode
== Pmode
);
1739 insn
= gen_ldr_got_small_sidi (dest
, tmp_reg
, imm
);
1740 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
1743 gcc_assert (GET_CODE (mem
) == MEM
);
1744 MEM_READONLY_P (mem
) = 1;
1745 MEM_NOTRAP_P (mem
) = 1;
1750 case SYMBOL_SMALL_TLSGD
:
1753 machine_mode mode
= GET_MODE (dest
);
1754 rtx result
= gen_rtx_REG (mode
, R0_REGNUM
);
1758 aarch64_emit_call_insn (gen_tlsgd_small_si (result
, imm
));
1760 aarch64_emit_call_insn (gen_tlsgd_small_di (result
, imm
));
1761 insns
= get_insns ();
1764 RTL_CONST_CALL_P (insns
) = 1;
1765 emit_libcall_block (insns
, dest
, result
, imm
);
1769 case SYMBOL_SMALL_TLSDESC
:
1771 machine_mode mode
= GET_MODE (dest
);
1772 rtx x0
= gen_rtx_REG (mode
, R0_REGNUM
);
1775 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
1777 /* In ILP32, the got entry is always of SImode size. Unlike
1778 small GOT, the dest is fixed at reg 0. */
1780 emit_insn (gen_tlsdesc_small_si (imm
));
1782 emit_insn (gen_tlsdesc_small_di (imm
));
1783 tp
= aarch64_load_tp (NULL
);
1786 tp
= gen_lowpart (mode
, tp
);
1788 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, x0
)));
1790 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1794 case SYMBOL_SMALL_TLSIE
:
1796 /* In ILP32, the mode of dest can be either SImode or DImode,
1797 while the got entry is always of SImode size. The mode of
1798 dest depends on how dest is used: if dest is assigned to a
1799 pointer (e.g. in the memory), it has SImode; it may have
1800 DImode if dest is dereferenced to access the memeory.
1801 This is why we have to handle three different tlsie_small
1802 patterns here (two patterns for ILP32). */
1803 machine_mode mode
= GET_MODE (dest
);
1804 rtx tmp_reg
= gen_reg_rtx (mode
);
1805 rtx tp
= aarch64_load_tp (NULL
);
1807 if (mode
== ptr_mode
)
1810 emit_insn (gen_tlsie_small_di (tmp_reg
, imm
));
1813 emit_insn (gen_tlsie_small_si (tmp_reg
, imm
));
1814 tp
= gen_lowpart (mode
, tp
);
1819 gcc_assert (mode
== Pmode
);
1820 emit_insn (gen_tlsie_small_sidi (tmp_reg
, imm
));
1823 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, tmp_reg
)));
1825 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1829 case SYMBOL_TLSLE12
:
1830 case SYMBOL_TLSLE24
:
1831 case SYMBOL_TLSLE32
:
1832 case SYMBOL_TLSLE48
:
1834 machine_mode mode
= GET_MODE (dest
);
1835 rtx tp
= aarch64_load_tp (NULL
);
1838 tp
= gen_lowpart (mode
, tp
);
1842 case SYMBOL_TLSLE12
:
1843 emit_insn ((mode
== DImode
? gen_tlsle12_di
: gen_tlsle12_si
)
1846 case SYMBOL_TLSLE24
:
1847 emit_insn ((mode
== DImode
? gen_tlsle24_di
: gen_tlsle24_si
)
1850 case SYMBOL_TLSLE32
:
1851 emit_insn ((mode
== DImode
? gen_tlsle32_di
: gen_tlsle32_si
)
1853 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
1856 case SYMBOL_TLSLE48
:
1857 emit_insn ((mode
== DImode
? gen_tlsle48_di
: gen_tlsle48_si
)
1859 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
1867 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1871 case SYMBOL_TINY_GOT
:
1872 emit_insn (gen_ldr_got_tiny (dest
, imm
));
1875 case SYMBOL_TINY_TLSIE
:
1877 machine_mode mode
= GET_MODE (dest
);
1878 rtx tp
= aarch64_load_tp (NULL
);
1880 if (mode
== ptr_mode
)
1883 emit_insn (gen_tlsie_tiny_di (dest
, imm
, tp
));
1886 tp
= gen_lowpart (mode
, tp
);
1887 emit_insn (gen_tlsie_tiny_si (dest
, imm
, tp
));
1892 gcc_assert (mode
== Pmode
);
1893 emit_insn (gen_tlsie_tiny_sidi (dest
, imm
, tp
));
1897 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1906 /* Emit a move from SRC to DEST. Assume that the move expanders can
1907 handle all moves if !can_create_pseudo_p (). The distinction is
1908 important because, unlike emit_move_insn, the move expanders know
1909 how to force Pmode objects into the constant pool even when the
1910 constant pool address is not itself legitimate. */
1912 aarch64_emit_move (rtx dest
, rtx src
)
1914 return (can_create_pseudo_p ()
1915 ? emit_move_insn (dest
, src
)
1916 : emit_move_insn_1 (dest
, src
));
1919 /* Apply UNOPTAB to OP and store the result in DEST. */
1922 aarch64_emit_unop (rtx dest
, optab unoptab
, rtx op
)
1924 rtx tmp
= expand_unop (GET_MODE (dest
), unoptab
, op
, dest
, 0);
1926 emit_move_insn (dest
, tmp
);
1929 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
1932 aarch64_emit_binop (rtx dest
, optab binoptab
, rtx op0
, rtx op1
)
1934 rtx tmp
= expand_binop (GET_MODE (dest
), binoptab
, op0
, op1
, dest
, 0,
1937 emit_move_insn (dest
, tmp
);
1940 /* Split a 128-bit move operation into two 64-bit move operations,
1941 taking care to handle partial overlap of register to register
1942 copies. Special cases are needed when moving between GP regs and
1943 FP regs. SRC can be a register, constant or memory; DST a register
1944 or memory. If either operand is memory it must not have any side
1947 aarch64_split_128bit_move (rtx dst
, rtx src
)
1952 machine_mode mode
= GET_MODE (dst
);
1954 gcc_assert (mode
== TImode
|| mode
== TFmode
);
1955 gcc_assert (!(side_effects_p (src
) || side_effects_p (dst
)));
1956 gcc_assert (mode
== GET_MODE (src
) || GET_MODE (src
) == VOIDmode
);
1958 if (REG_P (dst
) && REG_P (src
))
1960 int src_regno
= REGNO (src
);
1961 int dst_regno
= REGNO (dst
);
1963 /* Handle FP <-> GP regs. */
1964 if (FP_REGNUM_P (dst_regno
) && GP_REGNUM_P (src_regno
))
1966 src_lo
= gen_lowpart (word_mode
, src
);
1967 src_hi
= gen_highpart (word_mode
, src
);
1971 emit_insn (gen_aarch64_movtilow_di (dst
, src_lo
));
1972 emit_insn (gen_aarch64_movtihigh_di (dst
, src_hi
));
1976 emit_insn (gen_aarch64_movtflow_di (dst
, src_lo
));
1977 emit_insn (gen_aarch64_movtfhigh_di (dst
, src_hi
));
1981 else if (GP_REGNUM_P (dst_regno
) && FP_REGNUM_P (src_regno
))
1983 dst_lo
= gen_lowpart (word_mode
, dst
);
1984 dst_hi
= gen_highpart (word_mode
, dst
);
1988 emit_insn (gen_aarch64_movdi_tilow (dst_lo
, src
));
1989 emit_insn (gen_aarch64_movdi_tihigh (dst_hi
, src
));
1993 emit_insn (gen_aarch64_movdi_tflow (dst_lo
, src
));
1994 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi
, src
));
2000 dst_lo
= gen_lowpart (word_mode
, dst
);
2001 dst_hi
= gen_highpart (word_mode
, dst
);
2002 src_lo
= gen_lowpart (word_mode
, src
);
2003 src_hi
= gen_highpart_mode (word_mode
, mode
, src
);
2005 /* At most one pairing may overlap. */
2006 if (reg_overlap_mentioned_p (dst_lo
, src_hi
))
2008 aarch64_emit_move (dst_hi
, src_hi
);
2009 aarch64_emit_move (dst_lo
, src_lo
);
2013 aarch64_emit_move (dst_lo
, src_lo
);
2014 aarch64_emit_move (dst_hi
, src_hi
);
2019 aarch64_split_128bit_move_p (rtx dst
, rtx src
)
2021 return (! REG_P (src
)
2022 || ! (FP_REGNUM_P (REGNO (dst
)) && FP_REGNUM_P (REGNO (src
))));
2025 /* Split a complex SIMD combine. */
2028 aarch64_split_simd_combine (rtx dst
, rtx src1
, rtx src2
)
2030 machine_mode src_mode
= GET_MODE (src1
);
2031 machine_mode dst_mode
= GET_MODE (dst
);
2033 gcc_assert (VECTOR_MODE_P (dst_mode
));
2034 gcc_assert (register_operand (dst
, dst_mode
)
2035 && register_operand (src1
, src_mode
)
2036 && register_operand (src2
, src_mode
));
2038 rtx (*gen
) (rtx
, rtx
, rtx
);
2043 gen
= gen_aarch64_simd_combinev8qi
;
2046 gen
= gen_aarch64_simd_combinev4hi
;
2049 gen
= gen_aarch64_simd_combinev2si
;
2052 gen
= gen_aarch64_simd_combinev4hf
;
2055 gen
= gen_aarch64_simd_combinev2sf
;
2058 gen
= gen_aarch64_simd_combinedi
;
2061 gen
= gen_aarch64_simd_combinedf
;
2067 emit_insn (gen (dst
, src1
, src2
));
2071 /* Split a complex SIMD move. */
2074 aarch64_split_simd_move (rtx dst
, rtx src
)
2076 machine_mode src_mode
= GET_MODE (src
);
2077 machine_mode dst_mode
= GET_MODE (dst
);
2079 gcc_assert (VECTOR_MODE_P (dst_mode
));
2081 if (REG_P (dst
) && REG_P (src
))
2083 rtx (*gen
) (rtx
, rtx
);
2085 gcc_assert (VECTOR_MODE_P (src_mode
));
2090 gen
= gen_aarch64_split_simd_movv16qi
;
2093 gen
= gen_aarch64_split_simd_movv8hi
;
2096 gen
= gen_aarch64_split_simd_movv4si
;
2099 gen
= gen_aarch64_split_simd_movv2di
;
2102 gen
= gen_aarch64_split_simd_movv8hf
;
2105 gen
= gen_aarch64_split_simd_movv4sf
;
2108 gen
= gen_aarch64_split_simd_movv2df
;
2114 emit_insn (gen (dst
, src
));
2120 aarch64_zero_extend_const_eq (machine_mode xmode
, rtx x
,
2121 machine_mode ymode
, rtx y
)
2123 rtx r
= simplify_const_unary_operation (ZERO_EXTEND
, xmode
, y
, ymode
);
2124 gcc_assert (r
!= NULL
);
2125 return rtx_equal_p (x
, r
);
2130 aarch64_force_temporary (machine_mode mode
, rtx x
, rtx value
)
2132 if (can_create_pseudo_p ())
2133 return force_reg (mode
, value
);
2137 aarch64_emit_move (x
, value
);
2142 /* Return true if we can move VALUE into a register using a single
2143 CNT[BHWD] instruction. */
2146 aarch64_sve_cnt_immediate_p (poly_int64 value
)
2148 HOST_WIDE_INT factor
= value
.coeffs
[0];
2149 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2150 return (value
.coeffs
[1] == factor
2151 && IN_RANGE (factor
, 2, 16 * 16)
2152 && (factor
& 1) == 0
2153 && factor
<= 16 * (factor
& -factor
));
2156 /* Likewise for rtx X. */
2159 aarch64_sve_cnt_immediate_p (rtx x
)
2162 return poly_int_rtx_p (x
, &value
) && aarch64_sve_cnt_immediate_p (value
);
2165 /* Return the asm string for an instruction with a CNT-like vector size
2166 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2167 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2168 first part of the operands template (the part that comes before the
2169 vector size itself). FACTOR is the number of quadwords.
2170 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2171 If it is zero, we can use any element size. */
2174 aarch64_output_sve_cnt_immediate (const char *prefix
, const char *operands
,
2175 unsigned int factor
,
2176 unsigned int nelts_per_vq
)
2178 static char buffer
[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2180 if (nelts_per_vq
== 0)
2181 /* There is some overlap in the ranges of the four CNT instructions.
2182 Here we always use the smallest possible element size, so that the
2183 multiplier is 1 whereever possible. */
2184 nelts_per_vq
= factor
& -factor
;
2185 int shift
= std::min (exact_log2 (nelts_per_vq
), 4);
2186 gcc_assert (IN_RANGE (shift
, 1, 4));
2187 char suffix
= "dwhb"[shift
- 1];
2190 unsigned int written
;
2192 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s",
2193 prefix
, suffix
, operands
);
2195 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s, all, mul #%d",
2196 prefix
, suffix
, operands
, factor
);
2197 gcc_assert (written
< sizeof (buffer
));
/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  X is the value of the vector size operand,
   as a polynomial integer rtx.  */
2209 aarch64_output_sve_cnt_immediate (const char *prefix
, const char *operands
,
2212 poly_int64 value
= rtx_to_poly_int64 (x
);
2213 gcc_assert (aarch64_sve_cnt_immediate_p (value
));
2214 return aarch64_output_sve_cnt_immediate (prefix
, operands
,
2215 value
.coeffs
[1], 0);
/* Return true if we can add VALUE to a register using a single ADDVL
   or ADDPL instruction.  */

static bool
aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
{
  HOST_WIDE_INT factor = value.coeffs[0];
  if (factor == 0 || value.coeffs[1] != factor)
    return false;
  /* FACTOR counts VG / 2, so a value of 2 is one predicate width
     and a value of 16 is one vector width.  */
  return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
          || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
}
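
/* For example, the poly_int64 (16, 16) is one vector length and can be
   added with ADDVL #1, (2, 2) is one predicate length (ADDPL #1), and
   (48, 48) corresponds to ADDVL #3.  A value such as (6, 6) is not a
   multiple of 16 but is even, so it is handled as ADDPL #3.
   (Illustrative values only.)  */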
/* Likewise for rtx X.  */

bool
aarch64_sve_addvl_addpl_immediate_p (rtx x)
{
  poly_int64 value;
  return (poly_int_rtx_p (x, &value)
          && aarch64_sve_addvl_addpl_immediate_p (value));
}
/* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
   to operand 1 and storing the result in operand 0.  */
2247 aarch64_output_sve_addvl_addpl (rtx dest
, rtx base
, rtx offset
)
2249 static char buffer
[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2250 poly_int64 offset_value
= rtx_to_poly_int64 (offset
);
2251 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value
));
2253 /* Use INC or DEC if possible. */
2254 if (rtx_equal_p (dest
, base
) && GP_REGNUM_P (REGNO (dest
)))
2256 if (aarch64_sve_cnt_immediate_p (offset_value
))
2257 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2258 offset_value
.coeffs
[1], 0);
2259 if (aarch64_sve_cnt_immediate_p (-offset_value
))
2260 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2261 -offset_value
.coeffs
[1], 0);
2264 int factor
= offset_value
.coeffs
[1];
2265 if ((factor
& 15) == 0)
2266 snprintf (buffer
, sizeof (buffer
), "addvl\t%%x0, %%x1, #%d", factor
/ 16);
2268 snprintf (buffer
, sizeof (buffer
), "addpl\t%%x0, %%x1, #%d", factor
/ 2);
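
/* Illustrative outputs: an OFFSET of (48, 48) prints
   "addvl\t%x0, %x1, #3" and (-6, -6) prints "addpl\t%x0, %x1, #-3";
   when DEST equals BASE and the offset is also a CNT immediate,
   e.g. (16, 16), the INC form "incb\t%x0" is used instead.
   (Example values only.)  */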
/* Return true if X is a valid immediate for an SVE vector INC or DEC
   instruction.  If it is, store the number of elements in each vector
   quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
   factor in *FACTOR_OUT (if nonnull).  */
2278 aarch64_sve_inc_dec_immediate_p (rtx x
, int *factor_out
,
2279 unsigned int *nelts_per_vq_out
)
2284 if (!const_vec_duplicate_p (x
, &elt
)
2285 || !poly_int_rtx_p (elt
, &value
))
2288 unsigned int nelts_per_vq
= 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x
));
2289 if (nelts_per_vq
!= 8 && nelts_per_vq
!= 4 && nelts_per_vq
!= 2)
2290 /* There's no vector INCB. */
2293 HOST_WIDE_INT factor
= value
.coeffs
[0];
2294 if (value
.coeffs
[1] != factor
)
2297 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2298 if ((factor
% nelts_per_vq
) != 0
2299 || !IN_RANGE (abs (factor
), nelts_per_vq
, 16 * nelts_per_vq
))
2303 *factor_out
= factor
;
2304 if (nelts_per_vq_out
)
2305 *nelts_per_vq_out
= nelts_per_vq
;
/* Return true if X is a valid immediate for an SVE vector INC or DEC
   instruction.  */

bool
aarch64_sve_inc_dec_immediate_p (rtx x)
{
  return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
}
/* Return the asm template for an SVE vector INC or DEC instruction.
   OPERANDS gives the operands before the vector count and X is the
   value of the vector count operand itself.  */
2323 aarch64_output_sve_inc_dec_immediate (const char *operands
, rtx x
)
2326 unsigned int nelts_per_vq
;
2327 if (!aarch64_sve_inc_dec_immediate_p (x
, &factor
, &nelts_per_vq
))
2330 return aarch64_output_sve_cnt_immediate ("dec", operands
, -factor
,
2333 return aarch64_output_sve_cnt_immediate ("inc", operands
, factor
,
2338 aarch64_internal_mov_immediate (rtx dest
, rtx imm
, bool generate
,
2339 scalar_int_mode mode
)
2342 unsigned HOST_WIDE_INT val
, val2
, mask
;
2343 int one_match
, zero_match
;
2348 if (aarch64_move_imm (val
, mode
))
2351 emit_insn (gen_rtx_SET (dest
, imm
));
  /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
     (with XXXX non-zero).  In that case check to see if the move can be
     done in a smaller mode.  */
2358 val2
= val
& 0xffffffff;
2360 && aarch64_move_imm (val2
, SImode
)
2361 && (((val
>> 32) & 0xffff) == 0 || (val
>> 48) == 0))
2364 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
2366 /* Check if we have to emit a second instruction by checking to see
2367 if any of the upper 32 bits of the original DI mode value is set. */
2371 i
= (val
>> 48) ? 48 : 32;
2374 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
2375 GEN_INT ((val
>> i
) & 0xffff)));
2380 if ((val
>> 32) == 0 || mode
== SImode
)
2384 emit_insn (gen_rtx_SET (dest
, GEN_INT (val
& 0xffff)));
2386 emit_insn (gen_insv_immsi (dest
, GEN_INT (16),
2387 GEN_INT ((val
>> 16) & 0xffff)));
2389 emit_insn (gen_insv_immdi (dest
, GEN_INT (16),
2390 GEN_INT ((val
>> 16) & 0xffff)));
2395 /* Remaining cases are all for DImode. */
2398 zero_match
= ((val
& mask
) == 0) + ((val
& (mask
<< 16)) == 0) +
2399 ((val
& (mask
<< 32)) == 0) + ((val
& (mask
<< 48)) == 0);
2400 one_match
= ((~val
& mask
) == 0) + ((~val
& (mask
<< 16)) == 0) +
2401 ((~val
& (mask
<< 32)) == 0) + ((~val
& (mask
<< 48)) == 0);
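
  /* With MASK being 0xffff, these counts say how many 16-bit chunks of VAL
     are all-zeros or all-ones.  For example, VAL 0x1234000000005678 gives
     zero_match 2 and one_match 0, so the code below materializes one chunk
     with a MOVZ and patches the remaining nonzero chunks with MOVK.
     (Example only.)  */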
2403 if (zero_match
!= 2 && one_match
!= 2)
      /* Try emitting a bitmask immediate with a movk replacing 16 bits.
         For a 64-bit bitmask try whether changing 16 bits to all ones or
         zeroes creates a valid bitmask.  To check any repeated bitmask,
         try using 16 bits from the other 32-bit half of val.  */
2410 for (i
= 0; i
< 64; i
+= 16, mask
<<= 16)
2413 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
2416 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
2418 val2
= val2
& ~mask
;
2419 val2
= val2
| (((val2
>> 32) | (val2
<< 32)) & mask
);
2420 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
2427 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
2428 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
2429 GEN_INT ((val
>> i
) & 0xffff)));
  /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
     are emitted by the initial mov.  If one_match > zero_match, skip set
     bits, otherwise skip zero bits.  */
2441 val2
= one_match
> zero_match
? ~val
: val
;
2442 i
= (val2
& mask
) != 0 ? 0 : (val2
& (mask
<< 16)) != 0 ? 16 : 32;
2445 emit_insn (gen_rtx_SET (dest
, GEN_INT (one_match
> zero_match
2446 ? (val
| ~(mask
<< i
))
2447 : (val
& (mask
<< i
)))));
2448 for (i
+= 16; i
< 64; i
+= 16)
2450 if ((val2
& (mask
<< i
)) == 0)
2453 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
2454 GEN_INT ((val
>> i
) & 0xffff)));
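
  /* For example, moving 0x1234567800000000 takes this path with
     zero_match 2: the first instruction materializes the chunk at bit 32
     (movz ..., #0x5678, lsl #32) and the loop adds one movk for the chunk
     at bit 48 (#0x1234), two instructions in total.  (Example only.)  */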
/* Return whether IMM is a 128-bit immediate which is simple enough to
   expand inline.  */
2464 aarch64_mov128_immediate (rtx imm
)
2466 if (GET_CODE (imm
) == CONST_INT
)
2469 gcc_assert (CONST_WIDE_INT_NUNITS (imm
) == 2);
2471 rtx lo
= GEN_INT (CONST_WIDE_INT_ELT (imm
, 0));
2472 rtx hi
= GEN_INT (CONST_WIDE_INT_ELT (imm
, 1));
2474 return aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, DImode
)
2475 + aarch64_internal_mov_immediate (NULL_RTX
, hi
, false, DImode
) <= 4;
/* Return the number of temporary registers that aarch64_add_offset_1
   would need to add OFFSET to a register.  */

static unsigned int
aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
{
  return abs_hwi (offset) < 0x1000000 ? 0 : 1;
}
/* A subroutine of aarch64_add_offset.  Set DEST to SRC + OFFSET for
   a non-polynomial OFFSET.  MODE is the mode of the addition.
   FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
   be set and CFA adjustments added to the generated instructions.

   TEMP1, if nonnull, is a register of mode MODE that can be used as a
   temporary if register allocation is already complete.  This temporary
   register may overlap DEST but must not overlap SRC.  If TEMP1 is known
   to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid
   emitting the immediate again.

   Since this function may be used to adjust the stack pointer, we must
   ensure that it cannot cause transient stack deallocation (for example
   by first incrementing SP and then decrementing when adjusting by a
   large immediate).  */
2505 aarch64_add_offset_1 (scalar_int_mode mode
, rtx dest
,
2506 rtx src
, HOST_WIDE_INT offset
, rtx temp1
,
2507 bool frame_related_p
, bool emit_move_imm
)
2509 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
2510 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
2512 HOST_WIDE_INT moffset
= abs_hwi (offset
);
2517 if (!rtx_equal_p (dest
, src
))
2519 insn
= emit_insn (gen_rtx_SET (dest
, src
));
2520 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2525 /* Single instruction adjustment. */
2526 if (aarch64_uimm12_shift (moffset
))
2528 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (offset
)));
2529 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
  /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
     and either:

     a) the offset cannot be loaded by a 16-bit move or
     b) there is no spare register into which we can move it.  */
2538 if (moffset
< 0x1000000
2539 && ((!temp1
&& !can_create_pseudo_p ())
2540 || !aarch64_move_imm (moffset
, mode
)))
2542 HOST_WIDE_INT low_off
= moffset
& 0xfff;
2544 low_off
= offset
< 0 ? -low_off
: low_off
;
2545 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (low_off
)));
2546 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2547 insn
= emit_insn (gen_add2_insn (dest
, GEN_INT (offset
- low_off
)));
2548 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
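
  /* For example, an OFFSET of 0x123456 cannot be encoded in one ADD, so
     this path emits "add dest, src, #0x456" followed by
     "add dest, dest, #0x123, lsl #12".  (Illustrative encoding.)  */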
2552 /* Emit a move immediate if required and an addition/subtraction. */
2555 gcc_assert (temp1
!= NULL_RTX
|| can_create_pseudo_p ());
2556 temp1
= aarch64_force_temporary (mode
, temp1
, GEN_INT (moffset
));
2558 insn
= emit_insn (offset
< 0
2559 ? gen_sub3_insn (dest
, src
, temp1
)
2560 : gen_add3_insn (dest
, src
, temp1
));
2561 if (frame_related_p
)
2563 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2564 rtx adj
= plus_constant (mode
, src
, offset
);
2565 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, gen_rtx_SET (dest
, adj
));
/* Return the number of temporary registers that aarch64_add_offset
   would need to move OFFSET into a register or add OFFSET to a register;
   ADD_P is true if we want the latter rather than the former.  */
2574 aarch64_offset_temporaries (bool add_p
, poly_int64 offset
)
2576 /* This follows the same structure as aarch64_add_offset. */
2577 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
2580 unsigned int count
= 0;
2581 HOST_WIDE_INT factor
= offset
.coeffs
[1];
2582 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
2583 poly_int64
poly_offset (factor
, factor
);
2584 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
2585 /* Need one register for the ADDVL/ADDPL result. */
2587 else if (factor
!= 0)
2589 factor
= abs (factor
);
2590 if (factor
> 16 * (factor
& -factor
))
	/* Need one register for the CNT result and one for the multiplication
	   factor.  If necessary, the second temporary can be reused for the
	   constant part of the offset.  */
	count += 2;
      else
	/* Need one register for the CNT result (which might then
	   be shifted).  */
	count += 1;
    }
  return count + aarch64_add_offset_1_temporaries (constant);
}
/* If X can be represented as a poly_int64, return the number
   of temporaries that are required to add it to a register.
   Return -1 otherwise.  */

int
aarch64_add_offset_temporaries (rtx x)
{
  poly_int64 offset;
  if (!poly_int_rtx_p (x, &offset))
    return -1;
  return aarch64_offset_temporaries (true, offset);
}
/* Set DEST to SRC + OFFSET.  MODE is the mode of the addition.
   FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
   be set and CFA adjustments added to the generated instructions.

   TEMP1, if nonnull, is a register of mode MODE that can be used as a
   temporary if register allocation is already complete.  This temporary
   register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
   If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
   false to avoid emitting the immediate again.

   TEMP2, if nonnull, is a second temporary register that doesn't
   overlap either DEST or REG.

   Since this function may be used to adjust the stack pointer, we must
   ensure that it cannot cause transient stack deallocation (for example
   by first incrementing SP and then decrementing when adjusting by a
   large immediate).  */
2634 aarch64_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
2635 poly_int64 offset
, rtx temp1
, rtx temp2
,
2636 bool frame_related_p
, bool emit_move_imm
= true)
2638 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
2639 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
2640 gcc_assert (temp1
== NULL_RTX
2642 || !reg_overlap_mentioned_p (temp1
, dest
));
2643 gcc_assert (temp2
== NULL_RTX
|| !reg_overlap_mentioned_p (dest
, temp2
));
2645 /* Try using ADDVL or ADDPL to add the whole value. */
2646 if (src
!= const0_rtx
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
2648 rtx offset_rtx
= gen_int_mode (offset
, mode
);
2649 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
2650 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
  /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
     SVE vector register, over and above the minimum size of 128 bits.
     This is equivalent to half the value returned by CNTD with a
     vector shape of ALL.  */
2658 HOST_WIDE_INT factor
= offset
.coeffs
[1];
2659 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
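
  /* For example, an OFFSET of (34, 32) splits into FACTOR 32 (two vector
     lengths, addable with ADDVL #2 below when SRC is a register) and
     CONSTANT 2, which is added by the trailing aarch64_add_offset_1 call.
     (Illustrative values.)  */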
2661 /* Try using ADDVL or ADDPL to add the VG-based part. */
2662 poly_int64
poly_offset (factor
, factor
);
2663 if (src
!= const0_rtx
2664 && aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
2666 rtx offset_rtx
= gen_int_mode (poly_offset
, mode
);
2667 if (frame_related_p
)
2669 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
2670 RTX_FRAME_RELATED_P (insn
) = true;
2675 rtx addr
= gen_rtx_PLUS (mode
, src
, offset_rtx
);
2676 src
= aarch64_force_temporary (mode
, temp1
, addr
);
2681 /* Otherwise use a CNT-based sequence. */
2682 else if (factor
!= 0)
2684 /* Use a subtraction if we have a negative factor. */
2685 rtx_code code
= PLUS
;
2692 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2693 into the multiplication. */
2697 /* Use a right shift by 1. */
2701 HOST_WIDE_INT low_bit
= factor
& -factor
;
2702 if (factor
<= 16 * low_bit
)
2704 if (factor
> 16 * 8)
2706 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2707 the value with the minimum multiplier and shift it into
2709 int extra_shift
= exact_log2 (low_bit
);
2710 shift
+= extra_shift
;
2711 factor
>>= extra_shift
;
2713 val
= gen_int_mode (poly_int64 (factor
* 2, factor
* 2), mode
);
2717 /* Use CNTD, then multiply it by FACTOR. */
2718 val
= gen_int_mode (poly_int64 (2, 2), mode
);
2719 val
= aarch64_force_temporary (mode
, temp1
, val
);
2721 /* Go back to using a negative multiplication factor if we have
2722 no register from which to subtract. */
2723 if (code
== MINUS
&& src
== const0_rtx
)
2728 rtx coeff1
= gen_int_mode (factor
, mode
);
2729 coeff1
= aarch64_force_temporary (mode
, temp2
, coeff1
);
2730 val
= gen_rtx_MULT (mode
, val
, coeff1
);
2735 /* Multiply by 1 << SHIFT. */
2736 val
= aarch64_force_temporary (mode
, temp1
, val
);
2737 val
= gen_rtx_ASHIFT (mode
, val
, GEN_INT (shift
));
2739 else if (shift
== -1)
2742 val
= aarch64_force_temporary (mode
, temp1
, val
);
2743 val
= gen_rtx_ASHIFTRT (mode
, val
, const1_rtx
);
2746 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2747 if (src
!= const0_rtx
)
2749 val
= aarch64_force_temporary (mode
, temp1
, val
);
2750 val
= gen_rtx_fmt_ee (code
, mode
, src
, val
);
2752 else if (code
== MINUS
)
2754 val
= aarch64_force_temporary (mode
, temp1
, val
);
2755 val
= gen_rtx_NEG (mode
, val
);
2758 if (constant
== 0 || frame_related_p
)
2760 rtx_insn
*insn
= emit_insn (gen_rtx_SET (dest
, val
));
2761 if (frame_related_p
)
2763 RTX_FRAME_RELATED_P (insn
) = true;
2764 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
2765 gen_rtx_SET (dest
, plus_constant (Pmode
, src
,
2774 src
= aarch64_force_temporary (mode
, temp1
, val
);
2779 emit_move_imm
= true;
2782 aarch64_add_offset_1 (mode
, dest
, src
, constant
, temp1
,
2783 frame_related_p
, emit_move_imm
);
/* Like aarch64_add_offset, but the offset is given as an rtx rather
   than a poly_int64.  */

void
aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
                          rtx offset_rtx, rtx temp1, rtx temp2)
{
  aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
                      temp1, temp2, false);
}
/* Add DELTA to the stack pointer, marking the instructions frame-related.
   TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
   if TEMP1 already contains abs (DELTA).  */

static inline void
aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
{
  aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
                      temp1, temp2, true, emit_move_imm);
}

/* Subtract DELTA from the stack pointer, marking the instructions
   frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
   if nonnull.  */

static inline void
aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
{
  aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
                      temp1, temp2, frame_related_p);
}
/* Set DEST to (vec_series BASE STEP).  */

static void
aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
{
  machine_mode mode = GET_MODE (dest);
  scalar_mode inner = GET_MODE_INNER (mode);

  /* Each operand can be a register or an immediate in the range [-16, 15].  */
  if (!aarch64_sve_index_immediate_p (base))
    base = force_reg (inner, base);
  if (!aarch64_sve_index_immediate_p (step))
    step = force_reg (inner, step);

  emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
}
2836 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2837 integer of mode INT_MODE. Return true on success. */
2840 aarch64_expand_sve_widened_duplicate (rtx dest
, scalar_int_mode src_mode
,
2843 /* If the constant is smaller than 128 bits, we can do the move
2844 using a vector of SRC_MODEs. */
2845 if (src_mode
!= TImode
)
2847 poly_uint64 count
= exact_div (GET_MODE_SIZE (GET_MODE (dest
)),
2848 GET_MODE_SIZE (src_mode
));
2849 machine_mode dup_mode
= mode_for_vector (src_mode
, count
).require ();
2850 emit_move_insn (gen_lowpart (dup_mode
, dest
),
2851 gen_const_vec_duplicate (dup_mode
, src
));
2855 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
2856 src
= force_const_mem (src_mode
, src
);
2860 /* Make sure that the address is legitimate. */
2861 if (!aarch64_sve_ld1r_operand_p (src
))
2863 rtx addr
= force_reg (Pmode
, XEXP (src
, 0));
2864 src
= replace_equiv_address (src
, addr
);
2867 machine_mode mode
= GET_MODE (dest
);
2868 unsigned int elem_bytes
= GET_MODE_UNIT_SIZE (mode
);
2869 machine_mode pred_mode
= aarch64_sve_pred_mode (elem_bytes
).require ();
2870 rtx ptrue
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
2871 src
= gen_rtx_UNSPEC (mode
, gen_rtvec (2, ptrue
, src
), UNSPEC_LD1RQ
);
2872 emit_insn (gen_rtx_SET (dest
, src
));
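
/* For example (little-endian, illustrative): a VNx16QI constant whose
   bytes repeat the pattern { 1, 2, 3, 4 } can be moved as a VNx4SI
   duplicate of 0x04030201; if the repeating block is the full 128 bits,
   the LD1RQ path above is used instead.  */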
2876 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2877 isn't a simple duplicate or series. */
2880 aarch64_expand_sve_const_vector (rtx dest
, rtx src
)
2882 machine_mode mode
= GET_MODE (src
);
2883 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (src
);
2884 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (src
);
2885 gcc_assert (npatterns
> 1);
2887 if (nelts_per_pattern
== 1)
      /* The constant is a repeating sequence of at least two elements,
	 where the repeating elements occupy no more than 128 bits.
	 Get an integer representation of the replicated value.  */
2892 scalar_int_mode int_mode
;
2893 if (BYTES_BIG_ENDIAN
)
2894 /* For now, always use LD1RQ to load the value on big-endian
2895 targets, since the handling of smaller integers includes a
2896 subreg that is semantically an element reverse. */
2900 unsigned int int_bits
= GET_MODE_UNIT_BITSIZE (mode
) * npatterns
;
2901 gcc_assert (int_bits
<= 128);
2902 int_mode
= int_mode_for_size (int_bits
, 0).require ();
2904 rtx int_value
= simplify_gen_subreg (int_mode
, src
, mode
, 0);
2906 && aarch64_expand_sve_widened_duplicate (dest
, int_mode
, int_value
))
2910 /* Expand each pattern individually. */
2911 rtx_vector_builder builder
;
2912 auto_vec
<rtx
, 16> vectors (npatterns
);
2913 for (unsigned int i
= 0; i
< npatterns
; ++i
)
2915 builder
.new_vector (mode
, 1, nelts_per_pattern
);
2916 for (unsigned int j
= 0; j
< nelts_per_pattern
; ++j
)
2917 builder
.quick_push (CONST_VECTOR_ELT (src
, i
+ j
* npatterns
));
2918 vectors
.quick_push (force_reg (mode
, builder
.build ()));
2921 /* Use permutes to interleave the separate vectors. */
2922 while (npatterns
> 1)
2925 for (unsigned int i
= 0; i
< npatterns
; ++i
)
2927 rtx tmp
= (npatterns
== 1 ? dest
: gen_reg_rtx (mode
));
2928 rtvec v
= gen_rtvec (2, vectors
[i
], vectors
[i
+ npatterns
]);
2929 emit_set_insn (tmp
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
2933 gcc_assert (vectors
[0] == dest
);
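
/* For example, with two patterns the loop above builds one register for
   the even-indexed elements and one for the odd-indexed elements and
   emits a single ZIP1, giving { even[0], odd[0], even[1], odd[1], ... }.
   (Illustration only.)  */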
/* Set DEST to immediate IMM.  For SVE vector modes, GEN_VEC_DUPLICATE
   is a pattern that can be used to set DEST to a replicated scalar
   element.  */
2941 aarch64_expand_mov_immediate (rtx dest
, rtx imm
,
2942 rtx (*gen_vec_duplicate
) (rtx
, rtx
))
2944 machine_mode mode
= GET_MODE (dest
);
2946 /* Check on what type of symbol it is. */
2947 scalar_int_mode int_mode
;
2948 if ((GET_CODE (imm
) == SYMBOL_REF
2949 || GET_CODE (imm
) == LABEL_REF
2950 || GET_CODE (imm
) == CONST
2951 || GET_CODE (imm
) == CONST_POLY_INT
)
2952 && is_a
<scalar_int_mode
> (mode
, &int_mode
))
2956 HOST_WIDE_INT const_offset
;
2957 enum aarch64_symbol_type sty
;
2959 /* If we have (const (plus symbol offset)), separate out the offset
2960 before we start classifying the symbol. */
2961 rtx base
= strip_offset (imm
, &offset
);
2963 /* We must always add an offset involving VL separately, rather than
2964 folding it into the relocation. */
2965 if (!offset
.is_constant (&const_offset
))
2967 if (base
== const0_rtx
&& aarch64_sve_cnt_immediate_p (offset
))
2968 emit_insn (gen_rtx_SET (dest
, imm
));
	  /* Do arithmetic on 32-bit values if the result is smaller
	     than that.  */
2973 if (partial_subreg_p (int_mode
, SImode
))
2975 /* It is invalid to do symbol calculations in modes
2976 narrower than SImode. */
2977 gcc_assert (base
== const0_rtx
);
2978 dest
= gen_lowpart (SImode
, dest
);
2981 if (base
!= const0_rtx
)
2983 base
= aarch64_force_temporary (int_mode
, dest
, base
);
2984 aarch64_add_offset (int_mode
, dest
, base
, offset
,
2985 NULL_RTX
, NULL_RTX
, false);
2988 aarch64_add_offset (int_mode
, dest
, base
, offset
,
2989 dest
, NULL_RTX
, false);
2994 sty
= aarch64_classify_symbol (base
, const_offset
);
2997 case SYMBOL_FORCE_TO_MEM
:
2998 if (const_offset
!= 0
2999 && targetm
.cannot_force_const_mem (int_mode
, imm
))
3001 gcc_assert (can_create_pseudo_p ());
3002 base
= aarch64_force_temporary (int_mode
, dest
, base
);
3003 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
3004 NULL_RTX
, NULL_RTX
, false);
3008 mem
= force_const_mem (ptr_mode
, imm
);
3011 /* If we aren't generating PC relative literals, then
3012 we need to expand the literal pool access carefully.
3013 This is something that needs to be done in a number
3014 of places, so could well live as a separate function. */
3015 if (!aarch64_pcrelative_literal_loads
)
3017 gcc_assert (can_create_pseudo_p ());
3018 base
= gen_reg_rtx (ptr_mode
);
3019 aarch64_expand_mov_immediate (base
, XEXP (mem
, 0));
3020 if (ptr_mode
!= Pmode
)
3021 base
= convert_memory_address (Pmode
, base
);
3022 mem
= gen_rtx_MEM (ptr_mode
, base
);
3025 if (int_mode
!= ptr_mode
)
3026 mem
= gen_rtx_ZERO_EXTEND (int_mode
, mem
);
3028 emit_insn (gen_rtx_SET (dest
, mem
));
3032 case SYMBOL_SMALL_TLSGD
:
3033 case SYMBOL_SMALL_TLSDESC
:
3034 case SYMBOL_SMALL_TLSIE
:
3035 case SYMBOL_SMALL_GOT_28K
:
3036 case SYMBOL_SMALL_GOT_4G
:
3037 case SYMBOL_TINY_GOT
:
3038 case SYMBOL_TINY_TLSIE
:
3039 if (const_offset
!= 0)
3041 gcc_assert(can_create_pseudo_p ());
3042 base
= aarch64_force_temporary (int_mode
, dest
, base
);
3043 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
3044 NULL_RTX
, NULL_RTX
, false);
3049 case SYMBOL_SMALL_ABSOLUTE
:
3050 case SYMBOL_TINY_ABSOLUTE
:
3051 case SYMBOL_TLSLE12
:
3052 case SYMBOL_TLSLE24
:
3053 case SYMBOL_TLSLE32
:
3054 case SYMBOL_TLSLE48
:
3055 aarch64_load_symref_appropriately (dest
, imm
, sty
);
3063 if (!CONST_INT_P (imm
))
3065 rtx base
, step
, value
;
3066 if (GET_CODE (imm
) == HIGH
3067 || aarch64_simd_valid_immediate (imm
, NULL
))
3068 emit_insn (gen_rtx_SET (dest
, imm
));
3069 else if (const_vec_series_p (imm
, &base
, &step
))
3070 aarch64_expand_vec_series (dest
, base
, step
);
3071 else if (const_vec_duplicate_p (imm
, &value
))
3073 /* If the constant is out of range of an SVE vector move,
3074 load it from memory if we can, otherwise move it into
3075 a register and use a DUP. */
3076 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
3077 rtx op
= force_const_mem (inner_mode
, value
);
3079 op
= force_reg (inner_mode
, value
);
3080 else if (!aarch64_sve_ld1r_operand_p (op
))
3082 rtx addr
= force_reg (Pmode
, XEXP (op
, 0));
3083 op
= replace_equiv_address (op
, addr
);
3085 emit_insn (gen_vec_duplicate (dest
, op
));
3087 else if (GET_CODE (imm
) == CONST_VECTOR
3088 && !GET_MODE_NUNITS (GET_MODE (imm
)).is_constant ())
3089 aarch64_expand_sve_const_vector (dest
, imm
);
3092 rtx mem
= force_const_mem (mode
, imm
);
3094 emit_move_insn (dest
, mem
);
3100 aarch64_internal_mov_immediate (dest
, imm
, true,
3101 as_a
<scalar_int_mode
> (mode
));
3104 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3105 that is known to contain PTRUE. */
3108 aarch64_emit_sve_pred_move (rtx dest
, rtx pred
, rtx src
)
3110 emit_insn (gen_rtx_SET (dest
, gen_rtx_UNSPEC (GET_MODE (dest
),
3111 gen_rtvec (2, pred
, src
),
3112 UNSPEC_MERGE_PTRUE
)));
3115 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3116 operand is in memory. In this case we need to use the predicated LD1
3117 and ST1 instead of LDR and STR, both for correctness on big-endian
3118 targets and because LD1 and ST1 support a wider range of addressing modes.
3119 PRED_MODE is the mode of the predicate.
3121 See the comment at the head of aarch64-sve.md for details about the
3122 big-endian handling. */
3125 aarch64_expand_sve_mem_move (rtx dest
, rtx src
, machine_mode pred_mode
)
3127 machine_mode mode
= GET_MODE (dest
);
3128 rtx ptrue
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
3129 if (!register_operand (src
, mode
)
3130 && !register_operand (dest
, mode
))
3132 rtx tmp
= gen_reg_rtx (mode
);
3134 aarch64_emit_sve_pred_move (tmp
, ptrue
, src
);
3136 emit_move_insn (tmp
, src
);
3139 aarch64_emit_sve_pred_move (dest
, ptrue
, src
);
3142 /* Called only on big-endian targets. See whether an SVE vector move
3143 from SRC to DEST is effectively a REV[BHW] instruction, because at
3144 least one operand is a subreg of an SVE vector that has wider or
3145 narrower elements. Return true and emit the instruction if so.
3149 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3151 represents a VIEW_CONVERT between the following vectors, viewed
3154 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3155 R1: { [0], [1], [2], [3], ... }
3157 The high part of lane X in R2 should therefore correspond to lane X*2
3158 of R1, but the register representations are:
3161 R2: ...... [1].high [1].low [0].high [0].low
3162 R1: ...... [3] [2] [1] [0]
3164 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3165 We therefore need a reverse operation to swap the high and low values
3168 This is purely an optimization. Without it we would spill the
3169 subreg operand to the stack in one mode and reload it in the
3170 other mode, which has the same effect as the REV. */
3173 aarch64_maybe_expand_sve_subreg_move (rtx dest
, rtx src
)
3175 gcc_assert (BYTES_BIG_ENDIAN
);
3176 if (GET_CODE (dest
) == SUBREG
)
3177 dest
= SUBREG_REG (dest
);
3178 if (GET_CODE (src
) == SUBREG
)
3179 src
= SUBREG_REG (src
);
3181 /* The optimization handles two single SVE REGs with different element
3185 || aarch64_classify_vector_mode (GET_MODE (dest
)) != VEC_SVE_DATA
3186 || aarch64_classify_vector_mode (GET_MODE (src
)) != VEC_SVE_DATA
3187 || (GET_MODE_UNIT_SIZE (GET_MODE (dest
))
3188 == GET_MODE_UNIT_SIZE (GET_MODE (src
))))
3191 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3192 rtx ptrue
= force_reg (VNx16BImode
, CONSTM1_RTX (VNx16BImode
));
3193 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (dest
), gen_rtvec (2, ptrue
, src
),
3195 emit_insn (gen_rtx_SET (dest
, unspec
));
3199 /* Return a copy of X with mode MODE, without changing its other
3200 attributes. Unlike gen_lowpart, this doesn't care whether the
3201 mode change is valid. */
3204 aarch64_replace_reg_mode (rtx x
, machine_mode mode
)
3206 if (GET_MODE (x
) == mode
)
3209 x
= shallow_copy_rtx (x
);
3210 set_mode_and_regno (x
, mode
, REGNO (x
));
3214 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3218 aarch64_split_sve_subreg_move (rtx dest
, rtx ptrue
, rtx src
)
3220 /* Decide which REV operation we need. The mode with narrower elements
3221 determines the mode of the operands and the mode with the wider
3222 elements determines the reverse width. */
3223 machine_mode mode_with_wider_elts
= GET_MODE (dest
);
3224 machine_mode mode_with_narrower_elts
= GET_MODE (src
);
3225 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts
)
3226 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts
))
3227 std::swap (mode_with_wider_elts
, mode_with_narrower_elts
);
3229 unsigned int wider_bytes
= GET_MODE_UNIT_SIZE (mode_with_wider_elts
);
3230 unsigned int unspec
;
3231 if (wider_bytes
== 8)
3232 unspec
= UNSPEC_REV64
;
3233 else if (wider_bytes
== 4)
3234 unspec
= UNSPEC_REV32
;
3235 else if (wider_bytes
== 2)
3236 unspec
= UNSPEC_REV16
;
3239 machine_mode pred_mode
= aarch64_sve_pred_mode (wider_bytes
).require ();
3243 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3244 UNSPEC_MERGE_PTRUE))
3246 with the appropriate modes. */
3247 ptrue
= gen_lowpart (pred_mode
, ptrue
);
3248 dest
= aarch64_replace_reg_mode (dest
, mode_with_narrower_elts
);
3249 src
= aarch64_replace_reg_mode (src
, mode_with_narrower_elts
);
3250 src
= gen_rtx_UNSPEC (mode_with_narrower_elts
, gen_rtvec (1, src
), unspec
);
3251 src
= gen_rtx_UNSPEC (mode_with_narrower_elts
, gen_rtvec (2, ptrue
, src
),
3252 UNSPEC_MERGE_PTRUE
);
3253 emit_insn (gen_rtx_SET (dest
, src
));
3257 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED
,
3258 tree exp ATTRIBUTE_UNUSED
)
3260 /* Currently, always true. */
3264 /* Implement TARGET_PASS_BY_REFERENCE. */
3267 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED
,
3270 bool named ATTRIBUTE_UNUSED
)
3273 machine_mode dummymode
;
3276 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3277 if (mode
== BLKmode
&& type
)
3278 size
= int_size_in_bytes (type
);
3280 /* No frontends can create types with variable-sized modes, so we
3281 shouldn't be asked to pass or return them. */
3282 size
= GET_MODE_SIZE (mode
).to_constant ();
3284 /* Aggregates are passed by reference based on their size. */
3285 if (type
&& AGGREGATE_TYPE_P (type
))
3287 size
= int_size_in_bytes (type
);
3290 /* Variable sized arguments are always returned by reference. */
3294 /* Can this be a candidate to be passed in fp/simd register(s)? */
3295 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
  /* Arguments which are variable sized or larger than 2 registers are
     passed by reference unless they are a homogeneous floating-point
     aggregate.  */
3303 return size
> 2 * UNITS_PER_WORD
;
3306 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3308 aarch64_return_in_msb (const_tree valtype
)
3310 machine_mode dummy_mode
;
3313 /* Never happens in little-endian mode. */
3314 if (!BYTES_BIG_ENDIAN
)
3317 /* Only composite types smaller than or equal to 16 bytes can
3318 be potentially returned in registers. */
3319 if (!aarch64_composite_type_p (valtype
, TYPE_MODE (valtype
))
3320 || int_size_in_bytes (valtype
) <= 0
3321 || int_size_in_bytes (valtype
) > 16)
  /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
     or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
     is always passed/returned in the least significant bits of fp/simd
     register(s).  */
3328 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype
), valtype
,
3329 &dummy_mode
, &dummy_int
, NULL
))
3335 /* Implement TARGET_FUNCTION_VALUE.
3336 Define how to find the value returned by a function. */
3339 aarch64_function_value (const_tree type
, const_tree func
,
3340 bool outgoing ATTRIBUTE_UNUSED
)
3345 machine_mode ag_mode
;
3347 mode
= TYPE_MODE (type
);
3348 if (INTEGRAL_TYPE_P (type
))
3349 mode
= promote_function_mode (type
, mode
, &unsignedp
, func
, 1);
3351 if (aarch64_return_in_msb (type
))
3353 HOST_WIDE_INT size
= int_size_in_bytes (type
);
3355 if (size
% UNITS_PER_WORD
!= 0)
3357 size
+= UNITS_PER_WORD
- size
% UNITS_PER_WORD
;
3358 mode
= int_mode_for_size (size
* BITS_PER_UNIT
, 0).require ();
3362 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
3363 &ag_mode
, &count
, NULL
))
3365 if (!aarch64_composite_type_p (type
, mode
))
3367 gcc_assert (count
== 1 && mode
== ag_mode
);
3368 return gen_rtx_REG (mode
, V0_REGNUM
);
3375 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (count
));
3376 for (i
= 0; i
< count
; i
++)
3378 rtx tmp
= gen_rtx_REG (ag_mode
, V0_REGNUM
+ i
);
3379 rtx offset
= gen_int_mode (i
* GET_MODE_SIZE (ag_mode
), Pmode
);
3380 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
3381 XVECEXP (par
, 0, i
) = tmp
;
3387 return gen_rtx_REG (mode
, R0_REGNUM
);
3390 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3391 Return true if REGNO is the number of a hard register in which the values
3392 of called function may come back. */
3395 aarch64_function_value_regno_p (const unsigned int regno
)
  /* Maximum of 16 bytes can be returned in the general registers.  Examples
     of 16-byte return values are: 128-bit integers and 16-byte small
     structures (excluding homogeneous floating-point aggregates).  */
3400 if (regno
== R0_REGNUM
|| regno
== R1_REGNUM
)
3403 /* Up to four fp/simd registers can return a function value, e.g. a
3404 homogeneous floating-point aggregate having four members. */
3405 if (regno
>= V0_REGNUM
&& regno
< V0_REGNUM
+ HA_MAX_NUM_FLDS
)
3406 return TARGET_FLOAT
;
3411 /* Implement TARGET_RETURN_IN_MEMORY.
3413 If the type T of the result of a function is such that
3415 would require that arg be passed as a value in a register (or set of
3416 registers) according to the parameter passing rules, then the result
3417 is returned in the same registers as would be used for such an
3421 aarch64_return_in_memory (const_tree type
, const_tree fndecl ATTRIBUTE_UNUSED
)
3424 machine_mode ag_mode
;
3427 if (!AGGREGATE_TYPE_P (type
)
3428 && TREE_CODE (type
) != COMPLEX_TYPE
3429 && TREE_CODE (type
) != VECTOR_TYPE
)
3430 /* Simple scalar types always returned in registers. */
3433 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
),
3440 /* Types larger than 2 registers returned in memory. */
3441 size
= int_size_in_bytes (type
);
3442 return (size
< 0 || size
> 2 * UNITS_PER_WORD
);
3446 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v
, machine_mode mode
,
3447 const_tree type
, int *nregs
)
3449 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
3450 return aarch64_vfp_is_call_or_return_candidate (mode
,
3452 &pcum
->aapcs_vfp_rmode
,
3457 /* Given MODE and TYPE of a function argument, return the alignment in
3458 bits. The idea is to suppress any stronger alignment requested by
3459 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3460 This is a helper function for local use only. */
3463 aarch64_function_arg_alignment (machine_mode mode
, const_tree type
)
3466 return GET_MODE_ALIGNMENT (mode
);
3468 if (integer_zerop (TYPE_SIZE (type
)))
3471 gcc_assert (TYPE_MODE (type
) == mode
);
3473 if (!AGGREGATE_TYPE_P (type
))
3474 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type
));
3476 if (TREE_CODE (type
) == ARRAY_TYPE
)
3477 return TYPE_ALIGN (TREE_TYPE (type
));
3479 unsigned int alignment
= 0;
3480 for (tree field
= TYPE_FIELDS (type
); field
; field
= DECL_CHAIN (field
))
3481 if (TREE_CODE (field
) == FIELD_DECL
)
3482 alignment
= std::max (alignment
, DECL_ALIGN (field
));
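
/* For example, a struct containing a single __int128 field yields an
   alignment of 128 bits here, which later makes rule C.8 round the NGRN
   up to an even register and keeps the NSAA 16-byte aligned; a struct of
   two ints yields 32 bits and no extra rounding.  (Illustrative.)  */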
3487 /* Layout a function argument according to the AAPCS64 rules. The rule
3488 numbers refer to the rule numbers in the AAPCS64. */
3491 aarch64_layout_arg (cumulative_args_t pcum_v
, machine_mode mode
,
3493 bool named ATTRIBUTE_UNUSED
)
3495 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
3496 int ncrn
, nvrn
, nregs
;
3497 bool allocate_ncrn
, allocate_nvrn
;
3500 /* We need to do this once per argument. */
3501 if (pcum
->aapcs_arg_processed
)
3504 pcum
->aapcs_arg_processed
= true;
3506 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
3508 size
= int_size_in_bytes (type
);
3510 /* No frontends can create types with variable-sized modes, so we
3511 shouldn't be asked to pass or return them. */
3512 size
= GET_MODE_SIZE (mode
).to_constant ();
3513 size
= ROUND_UP (size
, UNITS_PER_WORD
);
3515 allocate_ncrn
= (type
) ? !(FLOAT_TYPE_P (type
)) : !FLOAT_MODE_P (mode
);
3516 allocate_nvrn
= aarch64_vfp_is_call_candidate (pcum_v
,
3521 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
3522 The following code thus handles passing by SIMD/FP registers first. */
3524 nvrn
= pcum
->aapcs_nvrn
;
  /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
     and homogeneous short-vector aggregates (HVA).  */
3531 aarch64_err_no_fpadvsimd (mode
);
3533 if (nvrn
+ nregs
<= NUM_FP_ARG_REGS
)
3535 pcum
->aapcs_nextnvrn
= nvrn
+ nregs
;
3536 if (!aarch64_composite_type_p (type
, mode
))
3538 gcc_assert (nregs
== 1);
3539 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
3545 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
3546 for (i
= 0; i
< nregs
; i
++)
3548 rtx tmp
= gen_rtx_REG (pcum
->aapcs_vfp_rmode
,
3549 V0_REGNUM
+ nvrn
+ i
);
3550 rtx offset
= gen_int_mode
3551 (i
* GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
), Pmode
);
3552 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
3553 XVECEXP (par
, 0, i
) = tmp
;
3555 pcum
->aapcs_reg
= par
;
3561 /* C.3 NSRN is set to 8. */
3562 pcum
->aapcs_nextnvrn
= NUM_FP_ARG_REGS
;
3567 ncrn
= pcum
->aapcs_ncrn
;
3568 nregs
= size
/ UNITS_PER_WORD
;
  /* C6 - C9, though the sign and zero extension semantics are
     handled elsewhere.  This is the case where the argument fits
     entirely in general registers.  */
3573 if (allocate_ncrn
&& (ncrn
+ nregs
<= NUM_ARG_REGS
))
3576 gcc_assert (nregs
== 0 || nregs
== 1 || nregs
== 2);
3578 /* C.8 if the argument has an alignment of 16 then the NGRN is
3579 rounded up to the next even number. */
3582 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3583 comparison is there because for > 16 * BITS_PER_UNIT
3584 alignment nregs should be > 2 and therefore it should be
3585 passed by reference rather than value. */
3586 && aarch64_function_arg_alignment (mode
, type
) == 16 * BITS_PER_UNIT
)
3589 gcc_assert (ncrn
+ nregs
<= NUM_ARG_REGS
);
3592 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3593 A reg is still generated for it, but the caller should be smart
3594 enough not to use it. */
3595 if (nregs
== 0 || nregs
== 1 || GET_MODE_CLASS (mode
) == MODE_INT
)
3596 pcum
->aapcs_reg
= gen_rtx_REG (mode
, R0_REGNUM
+ ncrn
);
3602 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
3603 for (i
= 0; i
< nregs
; i
++)
3605 rtx tmp
= gen_rtx_REG (word_mode
, R0_REGNUM
+ ncrn
+ i
);
3606 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
3607 GEN_INT (i
* UNITS_PER_WORD
));
3608 XVECEXP (par
, 0, i
) = tmp
;
3610 pcum
->aapcs_reg
= par
;
3613 pcum
->aapcs_nextncrn
= ncrn
+ nregs
;
3618 pcum
->aapcs_nextncrn
= NUM_ARG_REGS
;
3620 /* The argument is passed on stack; record the needed number of words for
3621 this argument and align the total size if necessary. */
3623 pcum
->aapcs_stack_words
= size
/ UNITS_PER_WORD
;
3625 if (aarch64_function_arg_alignment (mode
, type
) == 16 * BITS_PER_UNIT
)
3626 pcum
->aapcs_stack_size
= ROUND_UP (pcum
->aapcs_stack_size
,
3627 16 / UNITS_PER_WORD
);
3631 /* Implement TARGET_FUNCTION_ARG. */
3634 aarch64_function_arg (cumulative_args_t pcum_v
, machine_mode mode
,
3635 const_tree type
, bool named
)
3637 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
3638 gcc_assert (pcum
->pcs_variant
== ARM_PCS_AAPCS64
);
3640 if (mode
== VOIDmode
)
3643 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
3644 return pcum
->aapcs_reg
;
3648 aarch64_init_cumulative_args (CUMULATIVE_ARGS
*pcum
,
3649 const_tree fntype ATTRIBUTE_UNUSED
,
3650 rtx libname ATTRIBUTE_UNUSED
,
3651 const_tree fndecl ATTRIBUTE_UNUSED
,
3652 unsigned n_named ATTRIBUTE_UNUSED
)
3654 pcum
->aapcs_ncrn
= 0;
3655 pcum
->aapcs_nvrn
= 0;
3656 pcum
->aapcs_nextncrn
= 0;
3657 pcum
->aapcs_nextnvrn
= 0;
3658 pcum
->pcs_variant
= ARM_PCS_AAPCS64
;
3659 pcum
->aapcs_reg
= NULL_RTX
;
3660 pcum
->aapcs_arg_processed
= false;
3661 pcum
->aapcs_stack_words
= 0;
3662 pcum
->aapcs_stack_size
= 0;
3665 && fndecl
&& TREE_PUBLIC (fndecl
)
3666 && fntype
&& fntype
!= error_mark_node
)
3668 const_tree type
= TREE_TYPE (fntype
);
3669 machine_mode mode ATTRIBUTE_UNUSED
; /* To pass pointer as argument. */
3670 int nregs ATTRIBUTE_UNUSED
; /* Likewise. */
3671 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
), type
,
3672 &mode
, &nregs
, NULL
))
3673 aarch64_err_no_fpadvsimd (TYPE_MODE (type
));
3679 aarch64_function_arg_advance (cumulative_args_t pcum_v
,
3684 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
3685 if (pcum
->pcs_variant
== ARM_PCS_AAPCS64
)
3687 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
3688 gcc_assert ((pcum
->aapcs_reg
!= NULL_RTX
)
3689 != (pcum
->aapcs_stack_words
!= 0));
3690 pcum
->aapcs_arg_processed
= false;
3691 pcum
->aapcs_ncrn
= pcum
->aapcs_nextncrn
;
3692 pcum
->aapcs_nvrn
= pcum
->aapcs_nextnvrn
;
3693 pcum
->aapcs_stack_size
+= pcum
->aapcs_stack_words
;
3694 pcum
->aapcs_stack_words
= 0;
3695 pcum
->aapcs_reg
= NULL_RTX
;
3700 aarch64_function_arg_regno_p (unsigned regno
)
3702 return ((GP_REGNUM_P (regno
) && regno
< R0_REGNUM
+ NUM_ARG_REGS
)
3703 || (FP_REGNUM_P (regno
) && regno
< V0_REGNUM
+ NUM_FP_ARG_REGS
));
3706 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3707 PARM_BOUNDARY bits of alignment, but will be given anything up
3708 to STACK_BOUNDARY bits if the type requires it. This makes sure
3709 that both before and after the layout of each argument, the Next
3710 Stacked Argument Address (NSAA) will have a minimum alignment of
3714 aarch64_function_arg_boundary (machine_mode mode
, const_tree type
)
3716 unsigned int alignment
= aarch64_function_arg_alignment (mode
, type
);
3717 return MIN (MAX (alignment
, PARM_BOUNDARY
), STACK_BOUNDARY
);
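
/* For example, an argument with 32-bit natural alignment is still given
   PARM_BOUNDARY (64 bits), while an over-aligned argument requesting 256
   bits is capped at STACK_BOUNDARY (128 bits).  (Illustrative values.)  */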
3720 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3722 static fixed_size_mode
3723 aarch64_get_reg_raw_mode (int regno
)
3725 if (TARGET_SVE
&& FP_REGNUM_P (regno
))
3726 /* Don't use the SVE part of the register for __builtin_apply and
3727 __builtin_return. The SVE registers aren't used by the normal PCS,
3728 so using them there would be a waste of time. The PCS extensions
3729 for SVE types are fundamentally incompatible with the
3730 __builtin_return/__builtin_apply interface. */
3731 return as_a
<fixed_size_mode
> (V16QImode
);
3732 return default_get_reg_raw_mode (regno
);
3735 /* Implement TARGET_FUNCTION_ARG_PADDING.
3737 Small aggregate types are placed in the lowest memory address.
3739 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3741 static pad_direction
3742 aarch64_function_arg_padding (machine_mode mode
, const_tree type
)
3744 /* On little-endian targets, the least significant byte of every stack
3745 argument is passed at the lowest byte address of the stack slot. */
3746 if (!BYTES_BIG_ENDIAN
)
3749 /* Otherwise, integral, floating-point and pointer types are padded downward:
3750 the least significant byte of a stack argument is passed at the highest
3751 byte address of the stack slot. */
3753 ? (INTEGRAL_TYPE_P (type
) || SCALAR_FLOAT_TYPE_P (type
)
3754 || POINTER_TYPE_P (type
))
3755 : (SCALAR_INT_MODE_P (mode
) || SCALAR_FLOAT_MODE_P (mode
)))
3756 return PAD_DOWNWARD
;
3758 /* Everything else padded upward, i.e. data in first byte of stack slot. */
3762 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3764 It specifies padding for the last (may also be the only)
3765 element of a block move between registers and memory. If
3766 assuming the block is in the memory, padding upward means that
3767 the last element is padded after its highest significant byte,
   while in downward padding, the last element is padded at its
   least significant byte side.
3771 Small aggregates and small complex types are always padded
3774 We don't need to worry about homogeneous floating-point or
3775 short-vector aggregates; their move is not affected by the
3776 padding direction determined here. Regardless of endianness,
3777 each element of such an aggregate is put in the least
3778 significant bits of a fp/simd register.
3780 Return !BYTES_BIG_ENDIAN if the least significant byte of the
3781 register has useful data, and return the opposite if the most
3782 significant byte does. */
3785 aarch64_pad_reg_upward (machine_mode mode
, const_tree type
,
3786 bool first ATTRIBUTE_UNUSED
)
3789 /* Small composite types are always padded upward. */
3790 if (BYTES_BIG_ENDIAN
&& aarch64_composite_type_p (type
, mode
))
3794 size
= int_size_in_bytes (type
);
3796 /* No frontends can create types with variable-sized modes, so we
3797 shouldn't be asked to pass or return them. */
3798 size
= GET_MODE_SIZE (mode
).to_constant ();
3799 if (size
< 2 * UNITS_PER_WORD
)
3803 /* Otherwise, use the default padding. */
3804 return !BYTES_BIG_ENDIAN
;
3807 static scalar_int_mode
3808 aarch64_libgcc_cmp_return_mode (void)
3813 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3815 /* We use the 12-bit shifted immediate arithmetic instructions so values
3816 must be multiple of (1 << 12), i.e. 4096. */
3817 #define ARITH_FACTOR 4096
3819 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3820 #error Cannot use simple address calculation for stack probing
3823 /* The pair of scratch registers used for stack probing. */
3824 #define PROBE_STACK_FIRST_REG 9
3825 #define PROBE_STACK_SECOND_REG 10
3827 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3828 inclusive. These are offsets from the current stack pointer. */
3831 aarch64_emit_probe_stack_range (HOST_WIDE_INT first
, poly_int64 poly_size
)
3834 if (!poly_size
.is_constant (&size
))
3836 sorry ("stack probes for SVE frames");
3840 rtx reg1
= gen_rtx_REG (Pmode
, PROBE_STACK_FIRST_REG
);
3842 /* See the same assertion on PROBE_INTERVAL above. */
3843 gcc_assert ((first
% ARITH_FACTOR
) == 0);
3845 /* See if we have a constant small number of probes to generate. If so,
3846 that's the easy case. */
3847 if (size
<= PROBE_INTERVAL
)
3849 const HOST_WIDE_INT base
= ROUND_UP (size
, ARITH_FACTOR
);
3851 emit_set_insn (reg1
,
3852 plus_constant (Pmode
,
3853 stack_pointer_rtx
, -(first
+ base
)));
3854 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- size
));
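
  /* For example, with FIRST 0 and a SIZE of 2000 bytes this emits a single
     probe: reg1 = sp - 4096 followed by a store to reg1 + 2096, i.e. a
     probe exactly SIZE bytes below the incoming stack pointer.
     (Illustrative numbers.)  */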
3857 /* The run-time loop is made up of 8 insns in the generic case while the
3858 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
3859 else if (size
<= 4 * PROBE_INTERVAL
)
3861 HOST_WIDE_INT i
, rem
;
3863 emit_set_insn (reg1
,
3864 plus_constant (Pmode
,
3866 -(first
+ PROBE_INTERVAL
)));
3867 emit_stack_probe (reg1
);
3869 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3870 it exceeds SIZE. If only two probes are needed, this will not
3871 generate any code. Then probe at FIRST + SIZE. */
3872 for (i
= 2 * PROBE_INTERVAL
; i
< size
; i
+= PROBE_INTERVAL
)
3874 emit_set_insn (reg1
,
3875 plus_constant (Pmode
, reg1
, -PROBE_INTERVAL
));
3876 emit_stack_probe (reg1
);
3879 rem
= size
- (i
- PROBE_INTERVAL
);
3882 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
3884 emit_set_insn (reg1
, plus_constant (Pmode
, reg1
, -base
));
3885 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- rem
));
3888 emit_stack_probe (plus_constant (Pmode
, reg1
, -rem
));
3891 /* Otherwise, do the same as above, but in a loop. Note that we must be
3892 extra careful with variables wrapping around because we might be at
3893 the very top (or the very bottom) of the address space and we have
3894 to be able to handle this case properly; in particular, we use an
3895 equality test for the loop condition. */
3898 rtx reg2
= gen_rtx_REG (Pmode
, PROBE_STACK_SECOND_REG
);
3900 /* Step 1: round SIZE to the previous multiple of the interval. */
3902 HOST_WIDE_INT rounded_size
= size
& -PROBE_INTERVAL
;
3905 /* Step 2: compute initial and final value of the loop counter. */
3907 /* TEST_ADDR = SP + FIRST. */
3908 emit_set_insn (reg1
,
3909 plus_constant (Pmode
, stack_pointer_rtx
, -first
));
3911 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
3912 HOST_WIDE_INT adjustment
= - (first
+ rounded_size
);
3913 if (! aarch64_uimm12_shift (adjustment
))
3915 aarch64_internal_mov_immediate (reg2
, GEN_INT (adjustment
),
3917 emit_set_insn (reg2
, gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, reg2
));
3920 emit_set_insn (reg2
,
3921 plus_constant (Pmode
, stack_pointer_rtx
, adjustment
));
3927 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3930 while (TEST_ADDR != LAST_ADDR)
3932 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3933 until it is equal to ROUNDED_SIZE. */
3935 emit_insn (gen_probe_stack_range (reg1
, reg1
, reg2
));
3938 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3939 that SIZE is equal to ROUNDED_SIZE. */
3941 if (size
!= rounded_size
)
3943 HOST_WIDE_INT rem
= size
- rounded_size
;
3947 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
3949 emit_set_insn (reg2
, plus_constant (Pmode
, reg2
, -base
));
3950 emit_stack_probe (plus_constant (Pmode
, reg2
, base
- rem
));
3953 emit_stack_probe (plus_constant (Pmode
, reg2
, -rem
));
3957 /* Make sure nothing is scheduled before we are done. */
3958 emit_insn (gen_blockage ());
3961 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
3962 absolute addresses. */
3965 aarch64_output_probe_stack_range (rtx reg1
, rtx reg2
)
3967 static int labelno
= 0;
3971 ASM_GENERATE_INTERNAL_LABEL (loop_lab
, "LPSRL", labelno
++);
3974 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_lab
);
3976 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
3978 xops
[1] = GEN_INT (PROBE_INTERVAL
);
3979 output_asm_insn ("sub\t%0, %0, %1", xops
);
3981 /* Probe at TEST_ADDR. */
3982 output_asm_insn ("str\txzr, [%0]", xops
);
3984 /* Test if TEST_ADDR == LAST_ADDR. */
3986 output_asm_insn ("cmp\t%0, %1", xops
);
3989 fputs ("\tb.ne\t", asm_out_file
);
3990 assemble_name_raw (asm_out_file
, loop_lab
);
3991 fputc ('\n', asm_out_file
);
3996 /* Determine whether a frame chain needs to be generated. */
3998 aarch64_needs_frame_chain (void)
4000 /* Force a frame chain for EH returns so the return address is at FP+8. */
4001 if (frame_pointer_needed
|| crtl
->calls_eh_return
)
4004 /* A leaf function cannot have calls or write LR. */
4005 bool is_leaf
= crtl
->is_leaf
&& !df_regs_ever_live_p (LR_REGNUM
);
4007 /* Don't use a frame chain in leaf functions if leaf frame pointers
4009 if (flag_omit_leaf_frame_pointer
&& is_leaf
)
4012 return aarch64_use_frame_pointer
;
4015 /* Mark the registers that need to be saved by the callee and calculate
4016 the size of the callee-saved registers area and frame record (both FP
4017 and LR may be omitted). */
4019 aarch64_layout_frame (void)
4021 HOST_WIDE_INT offset
= 0;
4022 int regno
, last_fp_reg
= INVALID_REGNUM
;
4024 if (reload_completed
&& cfun
->machine
->frame
.laid_out
)
4027 cfun
->machine
->frame
.emit_frame_chain
= aarch64_needs_frame_chain ();
4029 #define SLOT_NOT_REQUIRED (-2)
4030 #define SLOT_REQUIRED (-1)
4032 cfun
->machine
->frame
.wb_candidate1
= INVALID_REGNUM
;
4033 cfun
->machine
->frame
.wb_candidate2
= INVALID_REGNUM
;
4035 /* First mark all the registers that really need to be saved... */
4036 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
4037 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
4039 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
4040 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
4042 /* ... that includes the eh data registers (if needed)... */
4043 if (crtl
->calls_eh_return)
    for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
      cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
	= SLOT_REQUIRED;

  /* ... and any callee saved register that dataflow says is live.  */
  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
    if (df_regs_ever_live_p (regno)
	&& (regno == R30_REGNUM
	    || !call_used_regs[regno]))
      cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;

  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
    if (df_regs_ever_live_p (regno)
	&& !call_used_regs[regno])
      {
	cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
	last_fp_reg = regno;
      }

  if (cfun->machine->frame.emit_frame_chain)
    {
      /* FP and LR are placed in the linkage record.  */
      cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
      cfun->machine->frame.wb_candidate1 = R29_REGNUM;
      cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
      cfun->machine->frame.wb_candidate2 = R30_REGNUM;
      offset = 2 * UNITS_PER_WORD;
    }

  /* Now assign stack slots for them.  */
  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
    if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
      {
	cfun->machine->frame.reg_offset[regno] = offset;
	if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
	  cfun->machine->frame.wb_candidate1 = regno;
	else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
	  cfun->machine->frame.wb_candidate2 = regno;
	offset += UNITS_PER_WORD;
      }

  HOST_WIDE_INT max_int_offset = offset;
  offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
  bool has_align_gap = offset != max_int_offset;

  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
    if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
      {
	/* If there is an alignment gap between integer and fp callee-saves,
	   allocate the last fp register to it if possible.  */
	if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
	  {
	    cfun->machine->frame.reg_offset[regno] = max_int_offset;
	    break;
	  }

	cfun->machine->frame.reg_offset[regno] = offset;
	if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
	  cfun->machine->frame.wb_candidate1 = regno;
	else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
		 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
	  cfun->machine->frame.wb_candidate2 = regno;
	offset += UNITS_PER_WORD;
      }

  offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);

  cfun->machine->frame.saved_regs_size = offset;

  HOST_WIDE_INT varargs_and_saved_regs_size
    = offset + cfun->machine->frame.saved_varargs_size;

  cfun->machine->frame.hard_fp_offset
    = aligned_upper_bound (varargs_and_saved_regs_size
			   + get_frame_size (),
			   STACK_BOUNDARY / BITS_PER_UNIT);

  /* Both these values are already aligned.  */
  gcc_assert (multiple_p (crtl->outgoing_args_size,
			  STACK_BOUNDARY / BITS_PER_UNIT));
  cfun->machine->frame.frame_size
    = (cfun->machine->frame.hard_fp_offset
       + crtl->outgoing_args_size);

  cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;

  cfun->machine->frame.initial_adjust = 0;
  cfun->machine->frame.final_adjust = 0;
  cfun->machine->frame.callee_adjust = 0;
  cfun->machine->frame.callee_offset = 0;

  HOST_WIDE_INT max_push_offset = 0;
  if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
    max_push_offset = 512;
  else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
    max_push_offset = 256;

  HOST_WIDE_INT const_size, const_fp_offset;
  if (cfun->machine->frame.frame_size.is_constant (&const_size)
      && const_size < max_push_offset
      && known_eq (crtl->outgoing_args_size, 0))
    {
      /* Simple, small frame with no outgoing arguments:
	 stp reg1, reg2, [sp, -frame_size]!
	 stp reg3, reg4, [sp, 16]  */
      cfun->machine->frame.callee_adjust = const_size;
    }
  else if (known_lt (crtl->outgoing_args_size
		     + cfun->machine->frame.saved_regs_size, 512)
	   && !(cfun->calls_alloca
		&& known_lt (cfun->machine->frame.hard_fp_offset,
			     max_push_offset)))
    {
      /* Frame with small outgoing arguments:
	 sub sp, sp, frame_size
	 stp reg1, reg2, [sp, outgoing_args_size]
	 stp reg3, reg4, [sp, outgoing_args_size + 16]  */
      cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
      cfun->machine->frame.callee_offset
	= cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
    }
  else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
	   && const_fp_offset < max_push_offset)
    {
      /* Frame with large outgoing arguments but a small local area:
	 stp reg1, reg2, [sp, -hard_fp_offset]!
	 stp reg3, reg4, [sp, 16]
	 sub sp, sp, outgoing_args_size  */
      cfun->machine->frame.callee_adjust = const_fp_offset;
      cfun->machine->frame.final_adjust
	= cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
    }
  else
    {
      /* Frame with large local area and outgoing arguments using frame pointer:
	 sub sp, sp, hard_fp_offset
	 stp x29, x30, [sp, 0]
	 add x29, sp, 0
	 stp reg3, reg4, [sp, 16]
	 sub sp, sp, outgoing_args_size  */
      cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
      cfun->machine->frame.final_adjust
	= cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
    }

  cfun->machine->frame.laid_out = true;
}
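/* Worked example (not from the original source; register choices and sizes
   are assumed purely for illustration): a function with 32 bytes of locals,
   two live callee-saved GPRs (say x19 and x20) and no outgoing arguments
   gets saved_regs_size = 16, hard_fp_offset = 48 and frame_size = 48.
   Since 48 is below max_push_offset and there are no outgoing arguments,
   the first case above applies and the whole frame is allocated by a
   single writeback store such as "stp x19, x20, [sp, -48]!".  */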
/* Return true if the register REGNO is saved on entry to
   the current function.  */

static bool
aarch64_register_saved_on_entry (int regno)
{
  return cfun->machine->frame.reg_offset[regno] >= 0;
}

/* Return the next register up from REGNO up to LIMIT for the callee
   to save.  */

static unsigned
aarch64_next_callee_save (unsigned regno, unsigned limit)
{
  while (regno <= limit && !aarch64_register_saved_on_entry (regno))
    regno++;
  return regno;
}

/* Push the register number REGNO of mode MODE to the stack with write-back
   adjusting the stack by ADJUSTMENT.  */

static void
aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
			   HOST_WIDE_INT adjustment)
{
  rtx base_rtx = stack_pointer_rtx;
  rtx insn, reg, mem;

  reg = gen_rtx_REG (mode, regno);
  mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
			    plus_constant (Pmode, base_rtx, -adjustment));
  mem = gen_frame_mem (mode, mem);

  insn = emit_move_insn (mem, reg);
  RTX_FRAME_RELATED_P (insn) = 1;
}

/* Generate and return an instruction to store the pair of registers
   REG and REG2 of mode MODE to location BASE with write-back adjusting
   the stack location BASE by ADJUSTMENT.  */

static rtx
aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
			  HOST_WIDE_INT adjustment)
{
  switch (mode)
    {
    case E_DImode:
      return gen_storewb_pairdi_di (base, base, reg, reg2,
				    GEN_INT (-adjustment),
				    GEN_INT (UNITS_PER_WORD - adjustment));
    case E_DFmode:
      return gen_storewb_pairdf_di (base, base, reg, reg2,
				    GEN_INT (-adjustment),
				    GEN_INT (UNITS_PER_WORD - adjustment));
    default:
      gcc_unreachable ();
    }
}
/* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
   stack pointer by ADJUSTMENT.  */

static void
aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
{
  rtx_insn *insn;
  machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;

  if (regno2 == INVALID_REGNUM)
    return aarch64_pushwb_single_reg (mode, regno1, adjustment);

  rtx reg1 = gen_rtx_REG (mode, regno1);
  rtx reg2 = gen_rtx_REG (mode, regno2);

  insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
					      reg2, adjustment));
  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
  RTX_FRAME_RELATED_P (insn) = 1;
}

/* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
   adjusting it by ADJUSTMENT afterwards.  */

static rtx
aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
			 HOST_WIDE_INT adjustment)
{
  switch (mode)
    {
    case E_DImode:
      return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
				   GEN_INT (UNITS_PER_WORD));
    case E_DFmode:
      return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
				   GEN_INT (UNITS_PER_WORD));
    default:
      gcc_unreachable ();
    }
}
/* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
   afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
   into CFI_OPS.  */

static void
aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
		  rtx *cfi_ops)
{
  machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
  rtx reg1 = gen_rtx_REG (mode, regno1);

  *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);

  if (regno2 == INVALID_REGNUM)
    {
      rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
      mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
      emit_move_insn (reg1, gen_frame_mem (mode, mem));
    }
  else
    {
      rtx reg2 = gen_rtx_REG (mode, regno2);
      *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
      emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
					  reg2, adjustment));
    }
}
/* Generate and return a store pair instruction of mode MODE to store
   register REG1 to MEM1 and register REG2 to MEM2.  */

static rtx
aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
			rtx reg2)
{
  switch (mode)
    {
    case E_DImode:
      return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);

    case E_DFmode:
      return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);

    default:
      gcc_unreachable ();
    }
}

/* Generate and return a load pair instruction of mode MODE to load register
   REG1 from MEM1 and register REG2 from MEM2.  */

static rtx
aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
		       rtx mem2)
{
  switch (mode)
    {
    case E_DImode:
      return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);

    case E_DFmode:
      return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);

    default:
      gcc_unreachable ();
    }
}
/* Return TRUE if return address signing should be enabled for the current
   function, otherwise return FALSE.  */

bool
aarch64_return_address_signing_enabled (void)
{
  /* This function should only be called after frame laid out.  */
  gcc_assert (cfun->machine->frame.laid_out);

  /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
     if its LR is pushed onto stack.  */
  return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
	  || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
	      && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
}
/* Emit code to save the callee-saved registers from register number START
   to LIMIT to the stack at the location starting at offset START_OFFSET,
   skipping any write-back candidates if SKIP_WB is true.  */

static void
aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
			   unsigned start, unsigned limit, bool skip_wb)
{
  rtx_insn *insn;
  unsigned regno;
  unsigned regno2;

  for (regno = aarch64_next_callee_save (start, limit);
       regno <= limit;
       regno = aarch64_next_callee_save (regno + 1, limit))
    {
      rtx reg, mem;
      poly_int64 offset;

      if (skip_wb
	  && (regno == cfun->machine->frame.wb_candidate1
	      || regno == cfun->machine->frame.wb_candidate2))
	continue;

      if (cfun->machine->reg_is_wrapped_separately[regno])
	continue;

      reg = gen_rtx_REG (mode, regno);
      offset = start_offset + cfun->machine->frame.reg_offset[regno];
      mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
						offset));

      regno2 = aarch64_next_callee_save (regno + 1, limit);

      if (regno2 <= limit
	  && !cfun->machine->reg_is_wrapped_separately[regno2]
	  && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
	      == cfun->machine->frame.reg_offset[regno2]))
	{
	  rtx reg2 = gen_rtx_REG (mode, regno2);
	  rtx mem2;

	  offset = start_offset + cfun->machine->frame.reg_offset[regno2];
	  mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
						     offset));
	  insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
						    reg2));

	  /* The first part of a frame-related parallel insn is
	     always assumed to be relevant to the frame
	     calculations; subsequent parts, are only
	     frame-related if explicitly marked.  */
	  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
	  regno = regno2;
	}
      else
	insn = emit_move_insn (mem, reg);

      RTX_FRAME_RELATED_P (insn) = 1;
    }
}
/* Emit code to restore the callee registers of mode MODE from register
   number START up to and including LIMIT.  Restore from the stack offset
   START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
   Write the appropriate REG_CFA_RESTORE notes into CFI_OPS.  */

static void
aarch64_restore_callee_saves (machine_mode mode,
			      poly_int64 start_offset, unsigned start,
			      unsigned limit, bool skip_wb, rtx *cfi_ops)
{
  rtx base_rtx = stack_pointer_rtx;
  unsigned regno;
  unsigned regno2;
  poly_int64 offset;

  for (regno = aarch64_next_callee_save (start, limit);
       regno <= limit;
       regno = aarch64_next_callee_save (regno + 1, limit))
    {
      if (cfun->machine->reg_is_wrapped_separately[regno])
	continue;

      rtx reg, mem;

      if (skip_wb
	  && (regno == cfun->machine->frame.wb_candidate1
	      || regno == cfun->machine->frame.wb_candidate2))
	continue;

      reg = gen_rtx_REG (mode, regno);
      offset = start_offset + cfun->machine->frame.reg_offset[regno];
      mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));

      regno2 = aarch64_next_callee_save (regno + 1, limit);

      if (regno2 <= limit
	  && !cfun->machine->reg_is_wrapped_separately[regno2]
	  && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
	      == cfun->machine->frame.reg_offset[regno2]))
	{
	  rtx reg2 = gen_rtx_REG (mode, regno2);
	  rtx mem2;

	  offset = start_offset + cfun->machine->frame.reg_offset[regno2];
	  mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
	  emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));

	  *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
	  regno = regno2;
	}
      else
	emit_move_insn (reg, mem);
      *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
    }
}
/* Return true if OFFSET is a signed 4-bit value multiplied by the size
   of mode MODE.  */

static inline bool
offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, -8, 7));
}

/* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
   of mode MODE.  */

static inline bool
offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, 0, 63));
}

/* Return true if OFFSET is a signed 7-bit value multiplied by the size
   of mode MODE.  */

bool
aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, -64, 63));
}

/* Return true if OFFSET is a signed 9-bit value.  */

static inline bool
offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
			       poly_int64 offset)
{
  HOST_WIDE_INT const_offset;
  return (offset.is_constant (&const_offset)
	  && IN_RANGE (const_offset, -256, 255));
}

/* Return true if OFFSET is a signed 9-bit value multiplied by the size
   of mode MODE.  */

static inline bool
offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, -256, 255));
}

/* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
   of mode MODE.  */

static inline bool
offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, 0, 4095));
}
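/* Worked example (illustrative, not part of the original source): for
   DImode, GET_MODE_SIZE is 8, so aarch64_offset_7bit_signed_scaled_p
   accepts exactly the byte offsets that are multiples of 8 in [-512, 504]
   (scaled multiples -64..63), matching the LDP/STP immediate range, while
   offset_9bit_signed_unscaled_p accepts any byte offset in [-256, 255] as
   used by LDUR/STUR.  */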
/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */

static sbitmap
aarch64_get_separate_components (void)
{
  aarch64_layout_frame ();

  sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
  bitmap_clear (components);

  /* The registers we need saved to the frame.  */
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if (aarch64_register_saved_on_entry (regno))
      {
	poly_int64 offset = cfun->machine->frame.reg_offset[regno];
	if (!frame_pointer_needed)
	  offset += cfun->machine->frame.frame_size
		    - cfun->machine->frame.hard_fp_offset;
	/* Check that we can access the stack slot of the register with one
	   direct load with no adjustments needed.  */
	if (offset_12bit_unsigned_scaled_p (DImode, offset))
	  bitmap_set_bit (components, regno);
      }

  /* Don't mess with the hard frame pointer.  */
  if (frame_pointer_needed)
    bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);

  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  /* If aarch64_layout_frame has chosen registers to store/restore with
     writeback don't interfere with them to avoid having to output explicit
     stack adjustment instructions.  */
  if (reg2 != INVALID_REGNUM)
    bitmap_clear_bit (components, reg2);
  if (reg1 != INVALID_REGNUM)
    bitmap_clear_bit (components, reg1);

  bitmap_clear_bit (components, LR_REGNUM);
  bitmap_clear_bit (components, SP_REGNUM);

  return components;
}
/* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */

static sbitmap
aarch64_components_for_bb (basic_block bb)
{
  bitmap in = DF_LIVE_IN (bb);
  bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
  bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;

  sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
  bitmap_clear (components);

  /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if ((!call_used_regs[regno])
	&& (bitmap_bit_p (in, regno)
	    || bitmap_bit_p (gen, regno)
	    || bitmap_bit_p (kill, regno)))
      {
	unsigned regno2, offset, offset2;
	bitmap_set_bit (components, regno);

	/* If there is a callee-save at an adjacent offset, add it too
	   to increase the use of LDP/STP.  */
	offset = cfun->machine->frame.reg_offset[regno];
	regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;

	if (regno2 <= LAST_SAVED_REGNUM)
	  {
	    offset2 = cfun->machine->frame.reg_offset[regno2];
	    if ((offset & ~8) == (offset2 & ~8))
	      bitmap_set_bit (components, regno2);
	  }
      }

  return components;
}
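/* Illustrative example (assumed register numbers and offsets): suppose x22
   and x23 are callee-saved with frame offsets 32 and 40.  If only x23 is
   used in a block, its offset has bit 3 set, so regno2 is computed as x22;
   the two offsets agree outside bit 3, so x22 is added to the block's
   components as well, allowing a single STP/LDP to cover both slots.  */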
/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
   Nothing to do for aarch64.  */

static void
aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
{
}
/* Return the next set bit in BMP from START onwards.  Return the total number
   of bits in BMP if no set bit is found at or after START.  */

static unsigned int
aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
{
  unsigned int nbits = SBITMAP_SIZE (bmp);
  if (start == nbits)
    return start;

  gcc_assert (start < nbits);
  for (unsigned int i = start; i < nbits; i++)
    if (bitmap_bit_p (bmp, i))
      return i;

  return nbits;
}
/* Do the work for aarch64_emit_prologue_components and
   aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
   to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
   for these components or the epilogue sequence.  That is, it determines
   whether we should emit stores or loads and what kind of CFA notes to attach
   to the insns.  Otherwise the logic for the two sequences is very
   similar.  */

static void
aarch64_process_components (sbitmap components, bool prologue_p)
{
  rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
			     ? HARD_FRAME_POINTER_REGNUM
			     : STACK_POINTER_REGNUM);

  unsigned last_regno = SBITMAP_SIZE (components);
  unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
  rtx_insn *insn = NULL;

  while (regno != last_regno)
    {
      /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
	 so DFmode for the vector registers is enough.  */
      machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
      rtx reg = gen_rtx_REG (mode, regno);
      poly_int64 offset = cfun->machine->frame.reg_offset[regno];
      if (!frame_pointer_needed)
	offset += cfun->machine->frame.frame_size
		  - cfun->machine->frame.hard_fp_offset;
      rtx addr = plus_constant (Pmode, ptr_reg, offset);
      rtx mem = gen_frame_mem (mode, addr);

      rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
      unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
      /* No more registers to handle after REGNO.
	 Emit a single save/restore and exit.  */
      if (regno2 == last_regno)
	{
	  insn = emit_insn (set);
	  RTX_FRAME_RELATED_P (insn) = 1;
	  if (prologue_p)
	    add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
	  else
	    add_reg_note (insn, REG_CFA_RESTORE, reg);
	  break;
	}

      poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
      /* The next register is not of the same class or its offset is not
	 mergeable with the current one into a pair.  */
      if (!satisfies_constraint_Ump (mem)
	  || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
	  || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
		       GET_MODE_SIZE (mode)))
	{
	  insn = emit_insn (set);
	  RTX_FRAME_RELATED_P (insn) = 1;
	  if (prologue_p)
	    add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
	  else
	    add_reg_note (insn, REG_CFA_RESTORE, reg);

	  regno = regno2;
	  continue;
	}

      /* REGNO2 can be saved/restored in a pair with REGNO.  */
      rtx reg2 = gen_rtx_REG (mode, regno2);
      if (!frame_pointer_needed)
	offset2 += cfun->machine->frame.frame_size
		   - cfun->machine->frame.hard_fp_offset;
      rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
      rtx mem2 = gen_frame_mem (mode, addr2);
      rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
			    : gen_rtx_SET (reg2, mem2);

      if (prologue_p)
	insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
      else
	insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));

      RTX_FRAME_RELATED_P (insn) = 1;
      if (prologue_p)
	{
	  add_reg_note (insn, REG_CFA_OFFSET, set);
	  add_reg_note (insn, REG_CFA_OFFSET, set2);
	}
      else
	{
	  add_reg_note (insn, REG_CFA_RESTORE, reg);
	  add_reg_note (insn, REG_CFA_RESTORE, reg2);
	}

      regno = aarch64_get_next_set_bit (components, regno2 + 1);
    }
}
/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */

static void
aarch64_emit_prologue_components (sbitmap components)
{
  aarch64_process_components (components, true);
}

/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */

static void
aarch64_emit_epilogue_components (sbitmap components)
{
  aarch64_process_components (components, false);
}

/* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */

static void
aarch64_set_handled_components (sbitmap components)
{
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if (bitmap_bit_p (components, regno))
      cfun->machine->reg_is_wrapped_separately[regno] = true;
}

/* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
   is saved at BASE + OFFSET.  */

static void
aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
			    rtx base, poly_int64 offset)
{
  rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
  add_reg_note (insn, REG_CFA_EXPRESSION,
		gen_rtx_SET (mem, regno_reg_rtx[reg]));
}
/* AArch64 stack frames generated by this compiler look like:

	+-------------------------------+
	|                               |
	|  incoming stack arguments     |
	|                               |
	+-------------------------------+
	|                               | <-- incoming stack pointer (aligned)
	|  callee-allocated save area   |
	|  for register varargs         |
	|                               |
	+-------------------------------+
	|  local variables              | <-- frame_pointer_rtx
	|                               |
	+-------------------------------+
	|  padding0                     | \
	+-------------------------------+  |
	|  callee-saved registers       |  | frame.saved_regs_size
	+-------------------------------+  |
	|  LR'                          |  |
	+-------------------------------+  |
	|  FP'                          | / <- hard_frame_pointer_rtx (aligned)
	+-------------------------------+
	|  dynamic allocation           |
	+-------------------------------+
	|  padding                      |
	+-------------------------------+
	|  outgoing stack arguments     | <-- arg_pointer
	|                               |
	+-------------------------------+
	|                               | <-- stack_pointer_rtx (aligned)

   Dynamic stack allocations via alloca() decrease stack_pointer_rtx
   but leave frame_pointer_rtx and hard_frame_pointer_rtx
   unchanged.  */
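/* Illustrative mapping of the fields above (assumed numbers, not part of
   the original comment): with a 16-byte varargs save area, 24 bytes of
   locals and 32 bytes of callee saves, locals_offset is 16,
   varargs_and_saved_regs_size is 48, hard_fp_offset is 48 + 24 rounded up
   to 16-byte alignment = 80, and frame_size is hard_fp_offset plus
   outgoing_args_size.  The numbers only show how the quantities nest.  */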
/* Generate the prologue instructions for entry into a function.
   Establish the stack frame by decreasing the stack pointer with a
   properly calculated size and, if necessary, create a frame record
   filled with the values of LR and previous frame pointer.  The
   current FP is also set up if it is in use.  */

void
aarch64_expand_prologue (void)
{
  aarch64_layout_frame ();

  poly_int64 frame_size = cfun->machine->frame.frame_size;
  poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
  poly_int64 final_adjust = cfun->machine->frame.final_adjust;
  poly_int64 callee_offset = cfun->machine->frame.callee_offset;
  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
  rtx_insn *insn;

  /* Sign return address for functions.  */
  if (aarch64_return_address_signing_enabled ())
    {
      insn = emit_insn (gen_pacisp ());
      add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  if (flag_stack_usage_info)
    current_function_static_stack_size = constant_lower_bound (frame_size);

  if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
    {
      if (crtl->is_leaf && !cfun->calls_alloca)
	{
	  if (maybe_gt (frame_size, PROBE_INTERVAL)
	      && maybe_gt (frame_size, get_stack_check_protect ()))
	    aarch64_emit_probe_stack_range (get_stack_check_protect (),
					    (frame_size
					     - get_stack_check_protect ()));
	}
      else if (maybe_gt (frame_size, 0))
	aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
    }

  rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
  rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);

  aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);

  if (callee_adjust != 0)
    aarch64_push_regs (reg1, reg2, callee_adjust);

  if (emit_frame_chain)
    {
      poly_int64 reg_offset = callee_adjust;
      if (callee_adjust == 0)
	{
	  reg1 = R29_REGNUM;
	  reg2 = R30_REGNUM;
	  reg_offset = callee_offset;
	  aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
	}
      aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
			  stack_pointer_rtx, callee_offset,
			  ip1_rtx, ip0_rtx, frame_pointer_needed);
      if (frame_pointer_needed && !frame_size.is_constant ())
	{
	  /* Variable-sized frames need to describe the save slot
	     address using DW_CFA_expression rather than DW_CFA_offset.
	     This means that, without taking further action, the
	     locations of the registers that we've already saved would
	     remain based on the stack pointer even after we redefine
	     the CFA based on the frame pointer.  We therefore need new
	     DW_CFA_expressions to re-express the save slots with addresses
	     based on the frame pointer.  */
	  rtx_insn *insn = get_last_insn ();
	  gcc_assert (RTX_FRAME_RELATED_P (insn));

	  /* Add an explicit CFA definition if this was previously
	     implicit.  */
	  if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
	    {
	      rtx src = plus_constant (Pmode, stack_pointer_rtx,
				       callee_offset);
	      add_reg_note (insn, REG_CFA_ADJUST_CFA,
			    gen_rtx_SET (hard_frame_pointer_rtx, src));
	    }

	  /* Change the save slot expressions for the registers that
	     we've already saved.  */
	  reg_offset -= callee_offset;
	  aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
				      reg_offset + UNITS_PER_WORD);
	  aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
				      reg_offset);
	}
      emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
    }

  aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
			     callee_adjust != 0 || emit_frame_chain);
  aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
			     callee_adjust != 0 || emit_frame_chain);
  aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
}
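/* Sketch of a possible sequence produced by the code above for the
   frame-chain case (assumed registers and offsets, shown for illustration
   only, not emitted verbatim):
	sub	sp, sp, #initial_adjust
	stp	x29, x30, [sp, #callee_offset]
	add	x29, sp, #callee_offset
	stp	x19, x20, [sp, #callee_offset + 16]
	sub	sp, sp, #final_adjust
   The exact registers and offsets depend on aarch64_layout_frame.  */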
/* Return TRUE if we can use a simple_return insn.

   This function checks whether the callee saved stack is empty, which
   means no restore actions are needed.  The pro_and_epilogue will use
   this to check whether shrink-wrapping opt is feasible.  */

bool
aarch64_use_return_insn_p (void)
{
  if (!reload_completed)
    return false;

  if (crtl->profile)
    return false;

  aarch64_layout_frame ();

  return known_eq (cfun->machine->frame.frame_size, 0);
}
/* Generate the epilogue instructions for returning from a function.
   This is almost exactly the reverse of the prolog sequence, except
   that we need to insert barriers to avoid scheduling loads that read
   from a deallocated stack, and we optimize the unwind records by
   emitting them all together if possible.  */
void
aarch64_expand_epilogue (bool for_sibcall)
{
  aarch64_layout_frame ();

  poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
  poly_int64 final_adjust = cfun->machine->frame.final_adjust;
  poly_int64 callee_offset = cfun->machine->frame.callee_offset;
  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  rtx cfi_ops = NULL;
  rtx_insn *insn;
  /* A stack clash protection prologue may not have left IP0_REGNUM or
     IP1_REGNUM in a usable state.  The same is true for allocations
     with an SVE component, since we then need both temporary registers
     for each allocation.  */
  bool can_inherit_p = (initial_adjust.is_constant ()
			&& final_adjust.is_constant ()
			&& !flag_stack_clash_protection);

  /* We need to add memory barrier to prevent read from deallocated stack.  */
  bool need_barrier_p
    = maybe_ne (get_frame_size ()
		+ cfun->machine->frame.saved_varargs_size, 0);

  /* Emit a barrier to prevent loads from a deallocated stack.  */
  if (maybe_gt (final_adjust, crtl->outgoing_args_size)
      || cfun->calls_alloca
      || crtl->calls_eh_return)
    {
      emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
      need_barrier_p = false;
    }

  /* Restore the stack pointer from the frame pointer if it may not
     be the same as the stack pointer.  */
  rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
  rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
  if (frame_pointer_needed
      && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
    /* If writeback is used when restoring callee-saves, the CFA
       is restored on the instruction doing the writeback.  */
    aarch64_add_offset (Pmode, stack_pointer_rtx,
			hard_frame_pointer_rtx, -callee_offset,
			ip1_rtx, ip0_rtx, callee_adjust == 0);
  else
    aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
		    !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));

  aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
				callee_adjust != 0, &cfi_ops);
  aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
				callee_adjust != 0, &cfi_ops);

  if (need_barrier_p)
    emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));

  if (callee_adjust != 0)
    aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);

  if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
    {
      /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
      insn = get_last_insn ();
      rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
      REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
      RTX_FRAME_RELATED_P (insn) = 1;
      cfi_ops = NULL;
    }

  aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
		  !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));

  if (cfi_ops)
    {
      /* Emit delayed restores and reset the CFA to be SP.  */
      insn = get_last_insn ();
      cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
      REG_NOTES (insn) = cfi_ops;
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  /* We prefer to emit the combined return/authenticate instruction RETAA,
     however there are three cases in which we must instead emit an explicit
     authentication instruction.

	1) Sibcalls don't return in a normal way, so if we're about to call one
	   we must authenticate.

	2) The RETAA instruction is not available before ARMv8.3-A, so if we are
	   generating code for !TARGET_ARMV8_3 we can't use it and must
	   explicitly authenticate.

	3) On an eh_return path we make extra stack adjustments to update the
	   canonical frame address to be the exception handler's CFA.  We want
	   to authenticate using the CFA of the function which calls eh_return.
	*/
  if (aarch64_return_address_signing_enabled ()
      && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
    {
      insn = emit_insn (gen_autisp ());
      add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  /* Stack adjustment for exception handler.  */
  if (crtl->calls_eh_return)
    {
      /* We need to unwind the stack by the offset computed by
	 EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
	 to be SP; letting the CFA move during this adjustment
	 is just as correct as retaining the CFA from the body
	 of the function.  Therefore, do nothing special.  */
      emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
    }

  emit_use (gen_rtx_REG (DImode, LR_REGNUM));
  if (!for_sibcall)
    emit_jump_insn (ret_rtx);
}
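/* Sketch of a matching epilogue for the frame-chain case (assumed registers
   and offsets, for illustration only):
	add	sp, sp, #final_adjust
	ldp	x19, x20, [sp, #callee_offset + 16]
	ldp	x29, x30, [sp, #callee_offset]
	add	sp, sp, #initial_adjust
	ret
   When SP may differ from the frame pointer (alloca or a non-constant
   frame), SP is first recomputed from x29 as done above.  */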
/* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
   normally or return to a previous frame after unwinding.

   An EH return uses a single shared return sequence.  The epilogue is
   exactly like a normal epilogue except that it has an extra input
   register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
   that must be applied after the frame has been destroyed.  An extra label
   is inserted before the epilogue which initializes this register to zero,
   and this is the entry point for a normal return.

   An actual EH return updates the return address, initializes the stack
   adjustment and jumps directly into the epilogue (bypassing the zeroing
   of the adjustment).  Since the return address is typically saved on the
   stack when a function makes a call, the saved LR must be updated outside
   the epilogue.

   This poses problems as the store is generated well before the epilogue,
   so the offset of LR is not known yet.  Also optimizations will remove the
   store as it appears dead, even after the epilogue is generated (as the
   base or offset for loading LR is different in many cases).

   To avoid these problems this implementation forces the frame pointer
   in eh_return functions so that the location of LR is fixed and known early.
   It also marks the store volatile, so no optimization is permitted to
   remove the store.  */
rtx
aarch64_eh_return_handler_rtx (void)
{
  rtx tmp = gen_frame_mem (Pmode,
    plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));

  /* Mark the store volatile, so no optimization is permitted to remove it.  */
  MEM_VOLATILE_P (tmp) = true;
  return tmp;
}
/* Output code to add DELTA to the first argument, and then jump
   to FUNCTION.  Used for C++ multiple inheritance.  */
static void
aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
			 HOST_WIDE_INT delta,
			 HOST_WIDE_INT vcall_offset,
			 tree function)
{
  /* The this pointer is always in x0.  Note that this differs from
     Arm where the this pointer may be bumped to r1 if r0 is required
     to return a pointer to an aggregate.  On AArch64 a result value
     pointer will be in x8.  */
  int this_regno = R0_REGNUM;
  rtx this_rtx, temp0, temp1, addr, funexp;
  rtx_insn *insn;

  reload_completed = 1;
  emit_note (NOTE_INSN_PROLOGUE_END);

  this_rtx = gen_rtx_REG (Pmode, this_regno);
  temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
  temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);

  if (vcall_offset == 0)
    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
  else
    {
      gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);

      addr = this_rtx;
      if (delta != 0)
	{
	  if (delta >= -256 && delta < 256)
	    addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
				       plus_constant (Pmode, this_rtx, delta));
	  else
	    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
				temp1, temp0, false);
	}

      if (Pmode == ptr_mode)
	aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
      else
	aarch64_emit_move (temp0,
			   gen_rtx_ZERO_EXTEND (Pmode,
						gen_rtx_MEM (ptr_mode, addr)));

      if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
	addr = plus_constant (Pmode, temp0, vcall_offset);
      else
	{
	  aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
					  Pmode);
	  addr = gen_rtx_PLUS (Pmode, temp0, temp1);
	}

      if (Pmode == ptr_mode)
	aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
      else
	aarch64_emit_move (temp1,
			   gen_rtx_SIGN_EXTEND (Pmode,
						gen_rtx_MEM (ptr_mode, addr)));

      emit_insn (gen_add2_insn (this_rtx, temp1));
    }

  /* Generate a tail call to the target function.  */
  if (!TREE_USED (function))
    {
      assemble_external (function);
      TREE_USED (function) = 1;
    }
  funexp = XEXP (DECL_RTL (function), 0);
  funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
  insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
  SIBLING_CALL_P (insn) = 1;

  insn = get_insns ();
  shorten_branches (insn);
  final_start_function (insn, file, 1);
  final (insn, file, 1);
  final_end_function ();

  /* Stop pretending to be a post-reload pass.  */
  reload_completed = 0;
}
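/* Example of the kind of thunk this can produce (assumed values, shown for
   illustration only): for delta == 8 and vcall_offset == 0 the emitted
   code reduces to
	add	x0, x0, #8
	b	<function>
   i.e. the this pointer in x0 is adjusted and control tail-calls the real
   method.  */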
static bool
aarch64_tls_referenced_p (rtx x)
{
  if (!TARGET_HAVE_TLS)
    return false;
  subrtx_iterator::array_type array;
  FOR_EACH_SUBRTX (iter, array, x, ALL)
    {
      const_rtx x = *iter;
      if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
	return true;
      /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
	 TLS offsets, not real symbol references.  */
      if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
	iter.skip_subrtxes ();
    }
  return false;
}

/* Return true if val can be encoded as a 12-bit unsigned immediate with
   a left shift of 0 or 12 bits.  */
bool
aarch64_uimm12_shift (HOST_WIDE_INT val)
{
  return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
	  || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val);
}

/* Return true if val is an immediate that can be loaded into a
   register by a MOVZ instruction.  */
static bool
aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
{
  if (GET_MODE_SIZE (mode) > 4)
    {
      if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
	  || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
	return 1;
    }
  else
    {
      /* Ignore sign extension.  */
      val &= (HOST_WIDE_INT) 0xffffffff;
    }
  return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
	  || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
}
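/* Worked examples (illustrative only): 0x123000 == 0x123 << 12, so
   aarch64_uimm12_shift accepts it and an ADD/SUB can encode it directly;
   0xabcd0000 has all of its set bits in bits [16,31], so aarch64_movw_imm
   accepts it and it can be built with a single
   "movz w0, #0xabcd, lsl #16".  */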
/* VAL is a value with the inner mode of MODE.  Replicate it to fill a
   64-bit (DImode) integer.  */

static unsigned HOST_WIDE_INT
aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
{
  unsigned int size = GET_MODE_UNIT_PRECISION (mode);

  /* Duplicate the element until the whole 64-bit value is filled.  */
  while (size < 64)
    {
      val &= (HOST_WIDE_INT_1U << size) - 1;
      val |= val << size;
      size *= 2;
    }
  return val;
}
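/* Worked example (illustrative): replicating the HImode value 0x00f0 gives
   0x00f000f000f000f0 - the 16-bit element is masked and OR-ed into the
   upper half at widths 16 and 32 until all 64 bits are filled.  */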
/* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */

static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
  {
    0x0000000100000001ull,
    0x0001000100010001ull,
    0x0101010101010101ull,
    0x1111111111111111ull,
    0x5555555555555555ull,
  };


/* Return true if val is a valid bitmask immediate.  */

bool
aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
{
  unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
  int bits;

  /* Check for a single sequence of one bits and return quickly if so.
     The special cases of all ones and all zeroes returns false.  */
  val = aarch64_replicate_bitmask_imm (val_in, mode);
  tmp = val + (val & -val);

  if (tmp == (tmp & -tmp))
    return (val + 1) > 1;

  /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
  if (mode == SImode)
    val = (val << 32) | (val & 0xffffffff);

  /* Invert if the immediate doesn't start with a zero bit - this means we
     only need to search for sequences of one bits.  */
  if (val & 1)
    val = ~val;

  /* Find the first set bit and set tmp to val with the first sequence of one
     bits removed.  Return success if there is a single sequence of ones.  */
  first_one = val & -val;
  tmp = val & (val + first_one);

  if (tmp == 0)
    return true;

  /* Find the next set bit and compute the difference in bit position.  */
  next_one = tmp & -tmp;
  bits = clz_hwi (first_one) - clz_hwi (next_one);
  mask = val ^ tmp;

  /* Check the bit position difference is a power of 2, and that the first
     sequence of one bits fits within 'bits' bits.  */
  if ((mask >> bits) != 0 || bits != (bits & -bits))
    return false;

  /* Check the sequence of one bits is repeated 64/bits times.  */
  return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
}
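/* Worked example (illustrative only): val_in = 0x0f0f0f0f0f0f0f0f.  The
   quick test fails, the value is inverted to 0xf0f0...f0, the lowest run
   of ones gives mask = 0xf0, the distance to the next run is bits = 8, and
   mask * bitmask_imm_mul[2] == 0xf0f0f0f0f0f0f0f0 == val, so the constant
   is accepted as a repeating 8-bit element.  */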
/* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
   Assumed precondition: VAL_IN is not zero.  */

unsigned HOST_WIDE_INT
aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
{
  int lowest_bit_set = ctz_hwi (val_in);
  int highest_bit_set = floor_log2 (val_in);
  gcc_assert (val_in != 0);

  return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
	  (HOST_WIDE_INT_1U << lowest_bit_set));
}

/* Create constant where bits outside of lowest bit set to highest bit set
   are set to 1.  */

unsigned HOST_WIDE_INT
aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
{
  return val_in | ~aarch64_and_split_imm1 (val_in);
}

/* Return true if VAL_IN is a valid 'and' bitmask immediate.  */

bool
aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
{
  scalar_int_mode int_mode;
  if (!is_a <scalar_int_mode> (mode, &int_mode))
    return false;

  if (aarch64_bitmask_imm (val_in, int_mode))
    return false;

  if (aarch64_move_imm (val_in, int_mode))
    return false;

  unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);

  return aarch64_bitmask_imm (imm2, int_mode);
}

/* Return true if val is an immediate that can be loaded into a
   register in a single instruction.  */
bool
aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
{
  scalar_int_mode int_mode;
  if (!is_a <scalar_int_mode> (mode, &int_mode))
    return false;

  if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
    return 1;
  return aarch64_bitmask_imm (val, int_mode);
}
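/* Worked example (illustrative only): val_in = 0x303000 (bits 12, 13, 20
   and 21) is neither a bitmask immediate nor a MOV immediate.
   aarch64_and_split_imm1 gives 0x3ff000 (ones from bit 12 to bit 21) and
   aarch64_and_split_imm2 gives ~0xfc000 (all ones except bits 14-19); both
   are valid bitmask immediates and their intersection is 0x303000, so an
   AND with this constant can be split into two AND instructions.  */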
static bool
aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
{
  rtx base, offset;

  if (GET_CODE (x) == HIGH)
    return true;

  /* There's no way to calculate VL-based values using relocations.  */
  subrtx_iterator::array_type array;
  FOR_EACH_SUBRTX (iter, array, x, ALL)
    if (GET_CODE (*iter) == CONST_POLY_INT)
      return true;

  split_const (x, &base, &offset);
  if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
    {
      if (aarch64_classify_symbol (base, INTVAL (offset))
	  != SYMBOL_FORCE_TO_MEM)
	return true;
      else
	/* Avoid generating a 64-bit relocation in ILP32; leave
	   to aarch64_expand_mov_immediate to handle it properly.  */
	return mode != ptr_mode;
    }

  return aarch64_tls_referenced_p (x);
}
/* Implement TARGET_CASE_VALUES_THRESHOLD.
   The expansion for a table switch is quite expensive due to the number
   of instructions, the table lookup and hard to predict indirect jump.
   When optimizing for speed, and -O3 enabled, use the per-core tuning if
   set, otherwise use tables for > 16 cases as a tradeoff between size and
   performance.  When optimizing for size, use the default setting.  */

static unsigned int
aarch64_case_values_threshold (void)
{
  /* Use the specified limit for the number of cases before using jump
     tables at higher optimization levels.  */
  if (optimize > 2
      && selected_cpu->tune->max_case_values != 0)
    return selected_cpu->tune->max_case_values;
  else
    return optimize_size ? default_case_values_threshold () : 17;
}
/* Return true if register REGNO is a valid index register.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

bool
aarch64_regno_ok_for_index_p (int regno, bool strict_p)
{
  if (!HARD_REGISTER_NUM_P (regno))
    {
      if (!strict_p)
	return true;

      if (!reg_renumber)
	return false;

      regno = reg_renumber[regno];
    }
  return GP_REGNUM_P (regno);
}

/* Return true if register REGNO is a valid base register for mode MODE.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

bool
aarch64_regno_ok_for_base_p (int regno, bool strict_p)
{
  if (!HARD_REGISTER_NUM_P (regno))
    {
      if (!strict_p)
	return true;

      if (!reg_renumber)
	return false;

      regno = reg_renumber[regno];
    }

  /* The fake registers will be eliminated to either the stack or
     hard frame pointer, both of which are usually valid base registers.
     Reload deals with the cases where the eliminated form isn't valid.  */
  return (GP_REGNUM_P (regno)
	  || regno == SP_REGNUM
	  || regno == FRAME_POINTER_REGNUM
	  || regno == ARG_POINTER_REGNUM);
}

/* Return true if X is a valid base register for mode MODE.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

static bool
aarch64_base_register_rtx_p (rtx x, bool strict_p)
{
  if (!strict_p
      && GET_CODE (x) == SUBREG
      && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
    x = SUBREG_REG (x);

  return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
}
5512 /* Return true if address offset is a valid index. If it is, fill in INFO
5513 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
5516 aarch64_classify_index (struct aarch64_address_info
*info
, rtx x
,
5517 machine_mode mode
, bool strict_p
)
5519 enum aarch64_address_type type
;
5524 if ((REG_P (x
) || GET_CODE (x
) == SUBREG
)
5525 && GET_MODE (x
) == Pmode
)
5527 type
= ADDRESS_REG_REG
;
5531 /* (sign_extend:DI (reg:SI)) */
5532 else if ((GET_CODE (x
) == SIGN_EXTEND
5533 || GET_CODE (x
) == ZERO_EXTEND
)
5534 && GET_MODE (x
) == DImode
5535 && GET_MODE (XEXP (x
, 0)) == SImode
)
5537 type
= (GET_CODE (x
) == SIGN_EXTEND
)
5538 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
5539 index
= XEXP (x
, 0);
5542 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5543 else if (GET_CODE (x
) == MULT
5544 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
5545 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
5546 && GET_MODE (XEXP (x
, 0)) == DImode
5547 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
5548 && CONST_INT_P (XEXP (x
, 1)))
5550 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
5551 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
5552 index
= XEXP (XEXP (x
, 0), 0);
5553 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
5555 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5556 else if (GET_CODE (x
) == ASHIFT
5557 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
5558 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
5559 && GET_MODE (XEXP (x
, 0)) == DImode
5560 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
5561 && CONST_INT_P (XEXP (x
, 1)))
5563 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
5564 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
5565 index
= XEXP (XEXP (x
, 0), 0);
5566 shift
= INTVAL (XEXP (x
, 1));
5568 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5569 else if ((GET_CODE (x
) == SIGN_EXTRACT
5570 || GET_CODE (x
) == ZERO_EXTRACT
)
5571 && GET_MODE (x
) == DImode
5572 && GET_CODE (XEXP (x
, 0)) == MULT
5573 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
5574 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
5576 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
5577 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
5578 index
= XEXP (XEXP (x
, 0), 0);
5579 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
5580 if (INTVAL (XEXP (x
, 1)) != 32 + shift
5581 || INTVAL (XEXP (x
, 2)) != 0)
5584 /* (and:DI (mult:DI (reg:DI) (const_int scale))
5585 (const_int 0xffffffff<<shift)) */
5586 else if (GET_CODE (x
) == AND
5587 && GET_MODE (x
) == DImode
5588 && GET_CODE (XEXP (x
, 0)) == MULT
5589 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
5590 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
5591 && CONST_INT_P (XEXP (x
, 1)))
5593 type
= ADDRESS_REG_UXTW
;
5594 index
= XEXP (XEXP (x
, 0), 0);
5595 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
5596 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
5599 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5600 else if ((GET_CODE (x
) == SIGN_EXTRACT
5601 || GET_CODE (x
) == ZERO_EXTRACT
)
5602 && GET_MODE (x
) == DImode
5603 && GET_CODE (XEXP (x
, 0)) == ASHIFT
5604 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
5605 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
5607 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
5608 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
5609 index
= XEXP (XEXP (x
, 0), 0);
5610 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
5611 if (INTVAL (XEXP (x
, 1)) != 32 + shift
5612 || INTVAL (XEXP (x
, 2)) != 0)
5615 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5616 (const_int 0xffffffff<<shift)) */
5617 else if (GET_CODE (x
) == AND
5618 && GET_MODE (x
) == DImode
5619 && GET_CODE (XEXP (x
, 0)) == ASHIFT
5620 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
5621 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
5622 && CONST_INT_P (XEXP (x
, 1)))
5624 type
= ADDRESS_REG_UXTW
;
5625 index
= XEXP (XEXP (x
, 0), 0);
5626 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
5627 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
5630 /* (mult:P (reg:P) (const_int scale)) */
5631 else if (GET_CODE (x
) == MULT
5632 && GET_MODE (x
) == Pmode
5633 && GET_MODE (XEXP (x
, 0)) == Pmode
5634 && CONST_INT_P (XEXP (x
, 1)))
5636 type
= ADDRESS_REG_REG
;
5637 index
= XEXP (x
, 0);
5638 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
5640 /* (ashift:P (reg:P) (const_int shift)) */
5641 else if (GET_CODE (x
) == ASHIFT
5642 && GET_MODE (x
) == Pmode
5643 && GET_MODE (XEXP (x
, 0)) == Pmode
5644 && CONST_INT_P (XEXP (x
, 1)))
5646 type
= ADDRESS_REG_REG
;
5647 index
= XEXP (x
, 0);
5648 shift
= INTVAL (XEXP (x
, 1));
5654 && GET_CODE (index
) == SUBREG
5655 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (index
))])
5656 index
= SUBREG_REG (index
);
5658 if (aarch64_sve_data_mode_p (mode
))
5660 if (type
!= ADDRESS_REG_REG
5661 || (1 << shift
) != GET_MODE_UNIT_SIZE (mode
))
5667 && !(IN_RANGE (shift
, 1, 3)
5668 && known_eq (1 << shift
, GET_MODE_SIZE (mode
))))
5673 && aarch64_regno_ok_for_index_p (REGNO (index
), strict_p
))
5676 info
->offset
= index
;
5677 info
->shift
= shift
;
/* Return true if MODE is one of the modes for which we
   support LDP/STP operations.  */

static bool
aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
{
  return mode == SImode || mode == DImode
	 || mode == SFmode || mode == DFmode
	 || (aarch64_vector_mode_supported_p (mode)
	     && (known_eq (GET_MODE_SIZE (mode), 8)
		 || (known_eq (GET_MODE_SIZE (mode), 16)
		     && (aarch64_tune_params.extra_tuning_flags
			 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
}

/* Return true if REGNO is a virtual pointer register, or an eliminable
   "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
   include stack_pointer or hard_frame_pointer.  */
static bool
virt_or_elim_regno_p (unsigned regno)
{
  return ((regno >= FIRST_VIRTUAL_REGISTER
	   && regno <= LAST_VIRTUAL_POINTER_REGISTER)
	  || regno == FRAME_POINTER_REGNUM
	  || regno == ARG_POINTER_REGNUM);
}
5711 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5712 If it is, fill in INFO appropriately. STRICT_P is true if
5713 REG_OK_STRICT is in effect. */
5716 aarch64_classify_address (struct aarch64_address_info
*info
,
5717 rtx x
, machine_mode mode
, bool strict_p
,
5718 aarch64_addr_query_type type
= ADDR_QUERY_M
)
5720 enum rtx_code code
= GET_CODE (x
);
5724 HOST_WIDE_INT const_size
;
5726 /* On BE, we use load/store pair for all large int mode load/stores.
5727 TI/TFmode may also use a load/store pair. */
5728 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
5729 bool advsimd_struct_p
= (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
));
5730 bool load_store_pair_p
= (type
== ADDR_QUERY_LDP_STP
5733 || (BYTES_BIG_ENDIAN
&& advsimd_struct_p
));
5735 bool allow_reg_index_p
= (!load_store_pair_p
5736 && (known_lt (GET_MODE_SIZE (mode
), 16)
5737 || vec_flags
== VEC_ADVSIMD
5738 || vec_flags
== VEC_SVE_DATA
));
5740 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5741 [Rn, #offset, MUL VL]. */
5742 if ((vec_flags
& (VEC_SVE_DATA
| VEC_SVE_PRED
)) != 0
5743 && (code
!= REG
&& code
!= PLUS
))
5746 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5748 if (advsimd_struct_p
5749 && !BYTES_BIG_ENDIAN
5750 && (code
!= POST_INC
&& code
!= REG
))
5753 gcc_checking_assert (GET_MODE (x
) == VOIDmode
5754 || SCALAR_INT_MODE_P (GET_MODE (x
)));
5760 info
->type
= ADDRESS_REG_IMM
;
5762 info
->offset
= const0_rtx
;
5763 info
->const_offset
= 0;
5764 return aarch64_base_register_rtx_p (x
, strict_p
);
5772 && virt_or_elim_regno_p (REGNO (op0
))
5773 && poly_int_rtx_p (op1
, &offset
))
5775 info
->type
= ADDRESS_REG_IMM
;
5778 info
->const_offset
= offset
;
5783 if (maybe_ne (GET_MODE_SIZE (mode
), 0)
5784 && aarch64_base_register_rtx_p (op0
, strict_p
)
5785 && poly_int_rtx_p (op1
, &offset
))
5787 info
->type
= ADDRESS_REG_IMM
;
5790 info
->const_offset
= offset
;
5792 /* TImode and TFmode values are allowed in both pairs of X
5793 registers and individual Q registers. The available
5795 X,X: 7-bit signed scaled offset
5796 Q: 9-bit signed offset
5797 We conservatively require an offset representable in either mode.
5798 When performing the check for pairs of X registers i.e. LDP/STP
5799 pass down DImode since that is the natural size of the LDP/STP
5800 instruction memory accesses. */
5801 if (mode
== TImode
|| mode
== TFmode
)
5802 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
5803 && (offset_9bit_signed_unscaled_p (mode
, offset
)
5804 || offset_12bit_unsigned_scaled_p (mode
, offset
)));
5806 /* A 7bit offset check because OImode will emit a ldp/stp
5807 instruction (only big endian will get here).
5808 For ldp/stp instructions, the offset is scaled for the size of a
5809 single element of the pair. */
5811 return aarch64_offset_7bit_signed_scaled_p (TImode
, offset
);
5813 /* Three 9/12 bit offsets checks because CImode will emit three
5814 ldr/str instructions (only big endian will get here). */
5816 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
5817 && (offset_9bit_signed_unscaled_p (V16QImode
, offset
+ 32)
5818 || offset_12bit_unsigned_scaled_p (V16QImode
,
5821 /* Two 7bit offsets checks because XImode will emit two ldp/stp
5822 instructions (only big endian will get here). */
5824 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
5825 && aarch64_offset_7bit_signed_scaled_p (TImode
,
5828 /* Make "m" use the LD1 offset range for SVE data modes, so
5829 that pre-RTL optimizers like ivopts will work to that
5830 instead of the wider LDR/STR range. */
5831 if (vec_flags
== VEC_SVE_DATA
)
5832 return (type
== ADDR_QUERY_M
5833 ? offset_4bit_signed_scaled_p (mode
, offset
)
5834 : offset_9bit_signed_scaled_p (mode
, offset
));
5836 if (vec_flags
== (VEC_SVE_DATA
| VEC_STRUCT
))
5838 poly_int64 end_offset
= (offset
5839 + GET_MODE_SIZE (mode
)
5840 - BYTES_PER_SVE_VECTOR
);
5841 return (type
== ADDR_QUERY_M
5842 ? offset_4bit_signed_scaled_p (mode
, offset
)
5843 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE
, offset
)
5844 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE
,
5848 if (vec_flags
== VEC_SVE_PRED
)
5849 return offset_9bit_signed_scaled_p (mode
, offset
);
5851 if (load_store_pair_p
)
5852 return ((known_eq (GET_MODE_SIZE (mode
), 4)
5853 || known_eq (GET_MODE_SIZE (mode
), 8)
5854 || known_eq (GET_MODE_SIZE (mode
), 16))
5855 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
5857 return (offset_9bit_signed_unscaled_p (mode
, offset
)
5858 || offset_12bit_unsigned_scaled_p (mode
, offset
));
5861 if (allow_reg_index_p
)
5863 /* Look for base + (scaled/extended) index register. */
5864 if (aarch64_base_register_rtx_p (op0
, strict_p
)
5865 && aarch64_classify_index (info
, op1
, mode
, strict_p
))
5870 if (aarch64_base_register_rtx_p (op1
, strict_p
)
5871 && aarch64_classify_index (info
, op0
, mode
, strict_p
))
5884 info
->type
= ADDRESS_REG_WB
;
5885 info
->base
= XEXP (x
, 0);
5886 info
->offset
= NULL_RTX
;
5887 return aarch64_base_register_rtx_p (info
->base
, strict_p
);
5891 info
->type
= ADDRESS_REG_WB
;
5892 info
->base
= XEXP (x
, 0);
5893 if (GET_CODE (XEXP (x
, 1)) == PLUS
5894 && poly_int_rtx_p (XEXP (XEXP (x
, 1), 1), &offset
)
5895 && rtx_equal_p (XEXP (XEXP (x
, 1), 0), info
->base
)
5896 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
5898 info
->offset
= XEXP (XEXP (x
, 1), 1);
5899 info
->const_offset
= offset
;
5901 /* TImode and TFmode values are allowed in both pairs of X
5902 registers and individual Q registers. The available
5904 X,X: 7-bit signed scaled offset
5905 Q: 9-bit signed offset
5906 We conservatively require an offset representable in either mode.
5908 if (mode
== TImode
|| mode
== TFmode
)
5909 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
5910 && offset_9bit_signed_unscaled_p (mode
, offset
));
5912 if (load_store_pair_p
)
5913 return ((known_eq (GET_MODE_SIZE (mode
), 4)
5914 || known_eq (GET_MODE_SIZE (mode
), 8)
5915 || known_eq (GET_MODE_SIZE (mode
), 16))
5916 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
5918 return offset_9bit_signed_unscaled_p (mode
, offset
);
5925 /* load literal: pc-relative constant pool entry. Only supported
5926 for SI mode or larger. */
5927 info
->type
= ADDRESS_SYMBOLIC
;
5929 if (!load_store_pair_p
5930 && GET_MODE_SIZE (mode
).is_constant (&const_size
)
5935 split_const (x
, &sym
, &addend
);
5936 return ((GET_CODE (sym
) == LABEL_REF
5937 || (GET_CODE (sym
) == SYMBOL_REF
5938 && CONSTANT_POOL_ADDRESS_P (sym
)
5939 && aarch64_pcrelative_literal_loads
)));
5944 info
->type
= ADDRESS_LO_SUM
;
5945 info
->base
= XEXP (x
, 0);
5946 info
->offset
= XEXP (x
, 1);
5947 if (allow_reg_index_p
5948 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
5951 split_const (info
->offset
, &sym
, &offs
);
5952 if (GET_CODE (sym
) == SYMBOL_REF
5953 && (aarch64_classify_symbol (sym
, INTVAL (offs
))
5954 == SYMBOL_SMALL_ABSOLUTE
))
5956 /* The symbol and offset must be aligned to the access size. */
5959 if (CONSTANT_POOL_ADDRESS_P (sym
))
5960 align
= GET_MODE_ALIGNMENT (get_pool_mode (sym
));
5961 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym
))
5963 tree exp
= SYMBOL_REF_DECL (sym
);
5964 align
= TYPE_ALIGN (TREE_TYPE (exp
));
5965 align
= aarch64_constant_alignment (exp
, align
);
5967 else if (SYMBOL_REF_DECL (sym
))
5968 align
= DECL_ALIGN (SYMBOL_REF_DECL (sym
));
5969 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym
)
5970 && SYMBOL_REF_BLOCK (sym
) != NULL
)
5971 align
= SYMBOL_REF_BLOCK (sym
)->alignment
;
5973 align
= BITS_PER_UNIT
;
5975 poly_int64 ref_size
= GET_MODE_SIZE (mode
);
5976 if (known_eq (ref_size
, 0))
5977 ref_size
= GET_MODE_SIZE (DImode
);
5979 return (multiple_p (INTVAL (offs
), ref_size
)
5980 && multiple_p (align
/ BITS_PER_UNIT
, ref_size
));
/* Return true if the address X is valid for a PRFM instruction.
   STRICT_P is true if we should do strict checking with
   aarch64_classify_address.  */

bool
aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
{
  struct aarch64_address_info addr;

  /* PRFM accepts the same addresses as DImode...  */
  bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
  if (!res)
    return false;

  /* ... except writeback forms.  */
  return addr.type != ADDRESS_REG_WB;
}

bool
aarch64_symbolic_address_p (rtx x)
{
  rtx offset;

  split_const (x, &x, &offset);
  return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
}
/* Classify the base of symbolic expression X.  */

enum aarch64_symbol_type
aarch64_classify_symbolic_expression (rtx x)
{
  rtx offset;

  split_const (x, &x, &offset);
  return aarch64_classify_symbol (x, INTVAL (offset));
}


/* Return TRUE if X is a legitimate address for accessing memory in
   mode MODE.  */
static bool
aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
{
  struct aarch64_address_info addr;

  return aarch64_classify_address (&addr, x, mode, strict_p);
}

/* Return TRUE if X is a legitimate address of type TYPE for accessing
   memory in mode MODE.  STRICT_P is true if REG_OK_STRICT is in effect.  */
bool
aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
			      aarch64_addr_query_type type)
{
  struct aarch64_address_info addr;

  return aarch64_classify_address (&addr, x, mode, strict_p, type);
}
/* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT.  */

static bool
aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
					 poly_int64 orig_offset,
					 machine_mode mode)
{
  HOST_WIDE_INT size;
  if (GET_MODE_SIZE (mode).is_constant (&size))
    {
      HOST_WIDE_INT const_offset, second_offset;

      /* A general SVE offset is A * VQ + B.  Remove the A component from
	 coefficient 0 in order to get the constant B.  */
      const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];

      /* Split an out-of-range address displacement into a base and
	 offset.  Use 4KB range for 1- and 2-byte accesses and a 16KB
	 range otherwise to increase opportunities for sharing the base
	 address of different sizes.  Unaligned accesses use the signed
	 9-bit range; TImode/TFmode use the intersection of signed
	 scaled 7-bit and signed 9-bit offset.  */
      if (mode == TImode || mode == TFmode)
	second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
      else if ((const_offset & (size - 1)) != 0)
	second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
      else
	second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);

      if (second_offset == 0 || known_eq (orig_offset, second_offset))
	return false;

      /* Split the offset into second_offset and the rest.  */
      *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
      *offset2 = gen_int_mode (second_offset, Pmode);
      return true;
    }
  else
    {
      /* Get the mode we should use as the basis of the range.  For structure
	 modes this is the mode of one vector.  */
      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
      machine_mode step_mode
	= (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;

      /* Get the "mul vl" multiplier we'd like to use.  */
      HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
      HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
      if (vec_flags & VEC_SVE_DATA)
	/* LDR supports a 9-bit range, but the move patterns for
	   structure modes require all vectors to be in range of the
	   same base.  The simplest way of accommodating that while still
	   promoting reuse of anchor points between different modes is
	   to use an 8-bit range unconditionally.  */
	vnum = ((vnum + 128) & 255) - 128;
      else
	/* Predicates are only handled singly, so we might as well use
	   the full range.  */
	vnum = ((vnum + 256) & 511) - 256;
      if (vnum == 0)
	return false;

      /* Convert the "mul vl" multiplier into a byte offset.  */
      poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
      if (known_eq (second_offset, orig_offset))
	return false;

      /* Split the offset into second_offset and the rest.  */
      *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
      *offset2 = gen_int_mode (second_offset, Pmode);
      return true;
    }
}
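
/* For instance (illustrative numbers), a naturally aligned 4-byte access
   at constant offset 0x12348 is split as 0x10000 + 0x2348: the residual
   0x2348 fits the scaled unsigned 12-bit LDR/STR offset range (at most
   0x3ffc for word accesses), while the 0x10000 base can be shared with
   neighbouring accesses.  */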
/* Return the binary representation of floating point constant VALUE in INTVAL.
   If the value cannot be converted, return false without setting INTVAL.
   The conversion is done in the given MODE.  */

bool
aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
{
  /* We make a general exception for 0.  */
  if (aarch64_float_const_zero_rtx_p (value))
    {
      *intval = 0;
      return true;
    }

  scalar_float_mode mode;
  if (GET_CODE (value) != CONST_DOUBLE
      || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
      || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
      /* Only support up to DF mode.  */
      || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
    return false;

  unsigned HOST_WIDE_INT ival = 0;

  long res[2];
  real_to_target (res,
		  CONST_DOUBLE_REAL_VALUE (value),
		  REAL_MODE_FORMAT (mode));

  if (mode == DFmode)
    {
      int order = BYTES_BIG_ENDIAN ? 1 : 0;
      ival = zext_hwi (res[order], 32);
      ival |= (zext_hwi (res[1 - order], 32) << 32);
    }
  else
    ival = zext_hwi (res[0], 32);

  *intval = ival;
  return true;
}
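
/* For instance, (const_double:SF 1.0) yields 0x3f800000 and
   (const_double:DF 1.0) yields 0x3ff0000000000000, i.e. the IEEE
   single and double encodings of 1.0.  */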
/* Return TRUE if rtx X is an immediate constant that can be moved using a
   single MOV(+MOVK) followed by an FMOV.  */

bool
aarch64_float_const_rtx_p (rtx x)
{
  machine_mode mode = GET_MODE (x);
  if (mode == VOIDmode)
    return false;

  /* Determine whether it's cheaper to write float constants as
     mov/movk pairs over ldr/adrp pairs.  */
  unsigned HOST_WIDE_INT ival;

  if (GET_CODE (x) == CONST_DOUBLE
      && SCALAR_FLOAT_MODE_P (mode)
      && aarch64_reinterpret_float_as_int (x, &ival))
    {
      scalar_int_mode imode = (mode == HFmode
			       ? SImode
			       : int_mode_for_mode (mode).require ());
      int num_instr = aarch64_internal_mov_immediate
			(NULL_RTX, gen_int_mode (ival, imode), false, imode);
      return num_instr < 3;
    }

  return false;
}
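
/* For instance, 1.0 in DFmode has the bit pattern 0x3ff0000000000000,
   which a single MOVZ (immediate 0x3ff0 shifted left by 48) can build,
   so it is considered cheaper than an adrp/ldr literal-pool load.  */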
/* Return TRUE if rtx X is immediate constant 0.0.  */

bool
aarch64_float_const_zero_rtx_p (rtx x)
{
  if (GET_MODE (x) == VOIDmode)
    return false;

  if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
    return !HONOR_SIGNED_ZEROS (GET_MODE (x));
  return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
}
/* Return TRUE if rtx X is immediate constant that fits in a single
   MOVI immediate operation.  */

bool
aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
{
  machine_mode vmode;
  scalar_int_mode imode;
  unsigned HOST_WIDE_INT ival;

  if (GET_CODE (x) == CONST_DOUBLE
      && SCALAR_FLOAT_MODE_P (mode))
    {
      if (!aarch64_reinterpret_float_as_int (x, &ival))
	return false;

      /* We make a general exception for 0.  */
      if (aarch64_float_const_zero_rtx_p (x))
	return true;

      imode = int_mode_for_mode (mode).require ();
    }
  else if (GET_CODE (x) == CONST_INT
	   && is_a <scalar_int_mode> (mode, &imode))
    ival = INTVAL (x);
  else
    return false;

  /* Use a 64-bit mode for everything except for DI/DF mode, where we use
     a 128-bit vector mode.  */
  int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;

  vmode = aarch64_simd_container_mode (imode, width);
  rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);

  return aarch64_simd_valid_immediate (v_op, NULL);
}
/* Return the fixed registers used for condition codes.  */

static bool
aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
{
  *p1 = CC_REGNUM;
  *p2 = INVALID_REGNUM;
  return true;
}
/* This function is used by the call expanders of the machine description.
   RESULT is the register in which the result is returned.  It's NULL for
   "call" and "sibcall".
   MEM is the location of the function call.
   SIBCALL indicates whether this function call is normal call or sibling call.
   It will generate different pattern accordingly.  */

void
aarch64_expand_call (rtx result, rtx mem, bool sibcall)
{
  rtx call, callee, tmp;
  rtvec vec;
  machine_mode mode;

  gcc_assert (MEM_P (mem));
  callee = XEXP (mem, 0);
  mode = GET_MODE (callee);
  gcc_assert (mode == Pmode);

  /* Decide if we should generate indirect calls by loading the
     address of the callee into a register before performing
     the branch-and-link.  */
  if (SYMBOL_REF_P (callee)
      ? (aarch64_is_long_call_p (callee)
	 || aarch64_is_noplt_call_p (callee))
      : !REG_P (callee))
    XEXP (mem, 0) = force_reg (mode, callee);

  call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);

  if (result != NULL_RTX)
    call = gen_rtx_SET (result, call);

  if (sibcall)
    tmp = ret_rtx;
  else
    tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));

  vec = gen_rtvec (2, call, tmp);
  call = gen_rtx_PARALLEL (VOIDmode, vec);

  aarch64_emit_call_insn (call);
}
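
/* Illustrative shape of the emitted pattern: a normal call that returns a
   value in X0 ends up roughly as
     (parallel [(set (reg R0) (call (mem addr) (const_int 0)))
		(clobber (reg LR))])
   while a sibling call carries a (return) in place of the LR clobber.  */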
/* Emit call insn with PAT and do aarch64-specific handling.  */

void
aarch64_emit_call_insn (rtx pat)
{
  rtx insn = emit_call_insn (pat);

  rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
  clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
  clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
}
machine_mode
aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
{
  /* All floating point compares return CCFP if it is an equality
     comparison, and CCFPE otherwise.  */
  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
    {
      switch (code)
	{
	case EQ:
	case NE:
	case UNORDERED:
	case ORDERED:
	case UNLT:
	case UNLE:
	case UNGT:
	case UNGE:
	case UNEQ:
	  return CCFPmode;

	case LT:
	case LE:
	case GT:
	case GE:
	case LTGT:
	  return CCFPEmode;

	default:
	  gcc_unreachable ();
	}
    }

  /* Equality comparisons of short modes against zero can be performed
     using the TST instruction with the appropriate bitmask.  */
  if (y == const0_rtx && REG_P (x)
      && (code == EQ || code == NE)
      && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
    return CC_NZmode;

  /* Similarly, comparisons of zero_extends from shorter modes can
     be performed using an ANDS with an immediate mask.  */
  if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
      && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
      && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
      && (code == EQ || code == NE))
    return CC_NZmode;

  if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
      && y == const0_rtx
      && (code == EQ || code == NE || code == LT || code == GE)
      && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
	  || GET_CODE (x) == NEG
	  || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
	      && CONST_INT_P (XEXP (x, 2)))))
    return CC_NZmode;

  /* A compare with a shifted operand.  Because of canonicalization,
     the comparison will have to be swapped when we emit the assembly
     code.  */
  if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
      && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
      && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
	  || GET_CODE (x) == LSHIFTRT
	  || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
    return CC_SWPmode;

  /* Similarly for a negated operand, but we can only do this for
     equalities.  */
  if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
      && (REG_P (y) || GET_CODE (y) == SUBREG)
      && (code == EQ || code == NE)
      && GET_CODE (x) == NEG)
    return CC_Zmode;

  /* A test for unsigned overflow.  */
  if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
      && code == NE
      && GET_CODE (x) == PLUS
      && GET_CODE (y) == ZERO_EXTEND)
    return CC_Cmode;

  /* For everything else, return CCmode.  */
  return CCmode;
}
static int
aarch64_get_condition_code_1 (machine_mode, enum rtx_code);

int
aarch64_get_condition_code (rtx x)
{
  machine_mode mode = GET_MODE (XEXP (x, 0));
  enum rtx_code comp_code = GET_CODE (x);

  if (GET_MODE_CLASS (mode) != MODE_CC)
    mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
  return aarch64_get_condition_code_1 (mode, comp_code);
}

static int
aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
{
  switch (mode)
    {
    case E_CCFPmode:
    case E_CCFPEmode:
      switch (comp_code)
	{
	case GE: return AARCH64_GE;
	case GT: return AARCH64_GT;
	case LE: return AARCH64_LS;
	case LT: return AARCH64_MI;
	case NE: return AARCH64_NE;
	case EQ: return AARCH64_EQ;
	case ORDERED: return AARCH64_VC;
	case UNORDERED: return AARCH64_VS;
	case UNLT: return AARCH64_LT;
	case UNLE: return AARCH64_LE;
	case UNGT: return AARCH64_HI;
	case UNGE: return AARCH64_PL;
	default: return -1;
	}
      break;

    case E_CCmode:
      switch (comp_code)
	{
	case NE: return AARCH64_NE;
	case EQ: return AARCH64_EQ;
	case GE: return AARCH64_GE;
	case GT: return AARCH64_GT;
	case LE: return AARCH64_LE;
	case LT: return AARCH64_LT;
	case GEU: return AARCH64_CS;
	case GTU: return AARCH64_HI;
	case LEU: return AARCH64_LS;
	case LTU: return AARCH64_CC;
	default: return -1;
	}
      break;

    case E_CC_SWPmode:
      switch (comp_code)
	{
	case NE: return AARCH64_NE;
	case EQ: return AARCH64_EQ;
	case GE: return AARCH64_LE;
	case GT: return AARCH64_LT;
	case LE: return AARCH64_GE;
	case LT: return AARCH64_GT;
	case GEU: return AARCH64_LS;
	case GTU: return AARCH64_CC;
	case LEU: return AARCH64_CS;
	case LTU: return AARCH64_HI;
	default: return -1;
	}
      break;

    case E_CC_NZmode:
      switch (comp_code)
	{
	case NE: return AARCH64_NE;
	case EQ: return AARCH64_EQ;
	case GE: return AARCH64_PL;
	case LT: return AARCH64_MI;
	default: return -1;
	}
      break;

    case E_CC_Zmode:
      switch (comp_code)
	{
	case NE: return AARCH64_NE;
	case EQ: return AARCH64_EQ;
	default: return -1;
	}
      break;

    case E_CC_Cmode:
      switch (comp_code)
	{
	case NE: return AARCH64_CS;
	case EQ: return AARCH64_CC;
	default: return -1;
	}
      break;

    default:
      return -1;
    }

  return -1;
}
static bool
aarch64_const_vec_all_same_in_range_p (rtx x,
				       HOST_WIDE_INT minval,
				       HOST_WIDE_INT maxval)
{
  rtx elt;
  return (const_vec_duplicate_p (x, &elt)
	  && CONST_INT_P (elt)
	  && IN_RANGE (INTVAL (elt), minval, maxval));
}

bool
aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
{
  return aarch64_const_vec_all_same_in_range_p (x, val, val);
}

/* Return true if VEC is a constant in which every element is in the range
   [MINVAL, MAXVAL].  The elements do not need to have the same value.  */

static bool
aarch64_const_vec_all_in_range_p (rtx vec,
				  HOST_WIDE_INT minval,
				  HOST_WIDE_INT maxval)
{
  if (GET_CODE (vec) != CONST_VECTOR
      || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
    return false;

  int nunits;
  if (!CONST_VECTOR_STEPPED_P (vec))
    nunits = const_vector_encoded_nelts (vec);
  else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
    return false;

  for (int i = 0; i < nunits; i++)
    {
      rtx vec_elem = CONST_VECTOR_ELT (vec, i);
      if (!CONST_INT_P (vec_elem)
	  || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
	return false;
    }
  return true;
}
#define AARCH64_CC_V 1
#define AARCH64_CC_C (1 << 1)
#define AARCH64_CC_Z (1 << 2)
#define AARCH64_CC_N (1 << 3)

/* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
static const int aarch64_nzcv_codes[] =
{
  0,		/* EQ, Z == 1.  */
  AARCH64_CC_Z,	/* NE, Z == 0.  */
  0,		/* CS, C == 1.  */
  AARCH64_CC_C,	/* CC, C == 0.  */
  0,		/* MI, N == 1.  */
  AARCH64_CC_N, /* PL, N == 0.  */
  0,		/* VS, V == 1.  */
  AARCH64_CC_V, /* VC, V == 0.  */
  0,		/* HI, C == 1 && Z == 0.  */
  AARCH64_CC_C,	/* LS, !(C == 1 && Z == 0).  */
  AARCH64_CC_V,	/* GE, N == V.  */
  0,		/* LT, N != V.  */
  AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
  0,		/* LE, !(Z == 0 && N == V).  */
  0,		/* AL, Any.  */
  0		/* NV, Any.  */
};
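
/* Note that each entry above (other than AL/NV) is an NZCV value under
   which the condition named in its comment evaluates to false; the 'k'
   operand modifier below prints it as the flag-setting immediate of a
   conditional compare (CCMP) instruction.  For example, the GE entry is
   AARCH64_CC_V (N == 0, V == 1); with N != V, a GE test on those flags
   fails.  */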
/* Print floating-point vector immediate operand X to F, negating it
   first if NEGATE is true.  Return true on success, false if it isn't
   a constant we can handle.  */

static bool
aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
{
  rtx elt;

  if (!const_vec_duplicate_p (x, &elt))
    return false;

  REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
  if (negate)
    r = real_value_negate (&r);

  /* We only handle the SVE single-bit immediates here.  */
  if (real_equal (&r, &dconst0))
    asm_fprintf (f, "0.0");
  else if (real_equal (&r, &dconst1))
    asm_fprintf (f, "1.0");
  else if (real_equal (&r, &dconsthalf))
    asm_fprintf (f, "0.5");
  else
    return false;

  return true;
}

/* Return the equivalent letter for size.  */
static char
sizetochar (int size)
{
  switch (size)
    {
    case 64: return 'd';
    case 32: return 's';
    case 16: return 'h';
    case 8 : return 'b';
    default: gcc_unreachable ();
    }
}
/* Print operand X to file F in a target specific manner according to CODE.
   The acceptable formatting commands given by CODE are:
     'c':		An integer or symbol address without a preceding #
			sign.
     'C':		Take the duplicated element in a vector constant
			and print it in hex.
     'D':		Take the duplicated element in a vector constant
			and print it as an unsigned integer, in decimal.
     'e':		Print the sign/zero-extend size as a character 8->b,
			16->h, 32->w.
     'p':		Prints N such that 2^N == X (X must be power of 2 and
			const int).
     'P':		Print the number of non-zero bits in X (a const_int).
     'H':		Print the higher numbered register of a pair (TImode)
			of regs.
     'm':		Print a condition (eq, ne, etc).
     'M':		Same as 'm', but invert condition.
     'N':		Take the duplicated element in a vector constant
			and print the negative of it in decimal.
     'b/h/s/d/q':	Print a scalar FP/SIMD register name.
     'S/T/U/V':		Print a FP/SIMD register name for a register list.
			The register printed is the FP/SIMD register name
			of X + 0/1/2/3 for S/T/U/V.
     'R':		Print a scalar FP/SIMD register name + 1.
     'X':		Print bottom 16 bits of integer constant in hex.
     'w/x':		Print a general register name or the zero register
			(32-bit or 64-bit).
     '0':		Print a normal operand, if it's a general register,
			then we assume DImode.
     'k':		Print NZCV for conditional compare instructions.
     'A':		Output address constant representing the first
			argument of X, specifying a relocation offset
			if appropriate.
     'L':		Output constant address specified by X
			with a relocation offset if appropriate.
     'G':		Prints address of X, specifying a PC relative
			relocation mode if appropriate.
     'y':		Output address of LDP or STP - this is used for
			some LDP/STPs which don't use a PARALLEL in their
			pattern (so the mode needs to be adjusted).
     'z':		Output address of a typical LDP or STP.  */
6665 aarch64_print_operand (FILE *f
, rtx x
, int code
)
6671 switch (GET_CODE (x
))
6674 fprintf (f
, HOST_WIDE_INT_PRINT_DEC
, INTVAL (x
));
6678 output_addr_const (f
, x
);
6682 if (GET_CODE (XEXP (x
, 0)) == PLUS
6683 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
)
6685 output_addr_const (f
, x
);
6691 output_operand_lossage ("unsupported operand for code '%c'", code
);
6699 if (!CONST_INT_P (x
)
6700 || (n
= exact_log2 (INTVAL (x
) & ~7)) <= 0)
6702 output_operand_lossage ("invalid operand for '%%%c'", code
);
6718 output_operand_lossage ("invalid operand for '%%%c'", code
);
6728 if (!CONST_INT_P (x
) || (n
= exact_log2 (INTVAL (x
))) < 0)
6730 output_operand_lossage ("invalid operand for '%%%c'", code
);
6734 asm_fprintf (f
, "%d", n
);
6739 if (!CONST_INT_P (x
))
6741 output_operand_lossage ("invalid operand for '%%%c'", code
);
6745 asm_fprintf (f
, "%u", popcount_hwi (INTVAL (x
)));
6749 if (!REG_P (x
) || !GP_REGNUM_P (REGNO (x
) + 1))
6751 output_operand_lossage ("invalid operand for '%%%c'", code
);
6755 asm_fprintf (f
, "%s", reg_names
[REGNO (x
) + 1]);
6762 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
6763 if (x
== const_true_rtx
)
6770 if (!COMPARISON_P (x
))
6772 output_operand_lossage ("invalid operand for '%%%c'", code
);
6776 cond_code
= aarch64_get_condition_code (x
);
6777 gcc_assert (cond_code
>= 0);
6779 cond_code
= AARCH64_INVERSE_CONDITION_CODE (cond_code
);
6780 fputs (aarch64_condition_codes
[cond_code
], f
);
6785 if (!const_vec_duplicate_p (x
, &elt
))
6787 output_operand_lossage ("invalid vector constant");
6791 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
6792 asm_fprintf (f
, "%wd", -INTVAL (elt
));
6793 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
6794 && aarch64_print_vector_float_operand (f
, x
, true))
6798 output_operand_lossage ("invalid vector constant");
6808 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
6810 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
6813 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - V0_REGNUM
);
6820 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
6822 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
6825 asm_fprintf (f
, "%c%d",
6826 aarch64_sve_data_mode_p (GET_MODE (x
)) ? 'z' : 'v',
6827 REGNO (x
) - V0_REGNUM
+ (code
- 'S'));
6831 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
6833 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
6836 asm_fprintf (f
, "q%d", REGNO (x
) - V0_REGNUM
+ 1);
6840 if (!CONST_INT_P (x
))
6842 output_operand_lossage ("invalid operand for '%%%c'", code
);
6845 asm_fprintf (f
, "0x%wx", UINTVAL (x
) & 0xffff);
6850 /* Print a replicated constant in hex. */
6851 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
6853 output_operand_lossage ("invalid operand for '%%%c'", code
);
6856 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
6857 asm_fprintf (f
, "0x%wx", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
6863 /* Print a replicated constant in decimal, treating it as
6865 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
6867 output_operand_lossage ("invalid operand for '%%%c'", code
);
6870 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
6871 asm_fprintf (f
, "%wd", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
6878 || (CONST_DOUBLE_P (x
) && aarch64_float_const_zero_rtx_p (x
)))
6880 asm_fprintf (f
, "%czr", code
);
6884 if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
6886 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - R0_REGNUM
);
6890 if (REG_P (x
) && REGNO (x
) == SP_REGNUM
)
6892 asm_fprintf (f
, "%ssp", code
== 'w' ? "w" : "");
6901 output_operand_lossage ("missing operand");
6905 switch (GET_CODE (x
))
6908 if (aarch64_sve_data_mode_p (GET_MODE (x
)))
6910 if (REG_NREGS (x
) == 1)
6911 asm_fprintf (f
, "z%d", REGNO (x
) - V0_REGNUM
);
6915 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x
)));
6916 asm_fprintf (f
, "{z%d.%c - z%d.%c}",
6917 REGNO (x
) - V0_REGNUM
, suffix
,
6918 END_REGNO (x
) - V0_REGNUM
- 1, suffix
);
6922 asm_fprintf (f
, "%s", reg_names
[REGNO (x
)]);
6926 output_address (GET_MODE (x
), XEXP (x
, 0));
6931 output_addr_const (asm_out_file
, x
);
6935 asm_fprintf (f
, "%wd", INTVAL (x
));
6939 if (!VECTOR_MODE_P (GET_MODE (x
)))
6941 output_addr_const (asm_out_file
, x
);
6947 if (!const_vec_duplicate_p (x
, &elt
))
6949 output_operand_lossage ("invalid vector constant");
6953 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
6954 asm_fprintf (f
, "%wd", INTVAL (elt
));
6955 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
6956 && aarch64_print_vector_float_operand (f
, x
, false))
6960 output_operand_lossage ("invalid vector constant");
6966 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6967 be getting CONST_DOUBLEs holding integers. */
6968 gcc_assert (GET_MODE (x
) != VOIDmode
);
6969 if (aarch64_float_const_zero_rtx_p (x
))
6974 else if (aarch64_float_const_representable_p (x
))
6977 char float_buf
[buf_size
] = {'\0'};
6978 real_to_decimal_for_mode (float_buf
,
6979 CONST_DOUBLE_REAL_VALUE (x
),
6982 asm_fprintf (asm_out_file
, "%s", float_buf
);
6986 output_operand_lossage ("invalid constant");
6989 output_operand_lossage ("invalid operand");
6995 if (GET_CODE (x
) == HIGH
)
6998 switch (aarch64_classify_symbolic_expression (x
))
7000 case SYMBOL_SMALL_GOT_4G
:
7001 asm_fprintf (asm_out_file
, ":got:");
7004 case SYMBOL_SMALL_TLSGD
:
7005 asm_fprintf (asm_out_file
, ":tlsgd:");
7008 case SYMBOL_SMALL_TLSDESC
:
7009 asm_fprintf (asm_out_file
, ":tlsdesc:");
7012 case SYMBOL_SMALL_TLSIE
:
7013 asm_fprintf (asm_out_file
, ":gottprel:");
7016 case SYMBOL_TLSLE24
:
7017 asm_fprintf (asm_out_file
, ":tprel:");
7020 case SYMBOL_TINY_GOT
:
7027 output_addr_const (asm_out_file
, x
);
7031 switch (aarch64_classify_symbolic_expression (x
))
7033 case SYMBOL_SMALL_GOT_4G
:
7034 asm_fprintf (asm_out_file
, ":lo12:");
7037 case SYMBOL_SMALL_TLSGD
:
7038 asm_fprintf (asm_out_file
, ":tlsgd_lo12:");
7041 case SYMBOL_SMALL_TLSDESC
:
7042 asm_fprintf (asm_out_file
, ":tlsdesc_lo12:");
7045 case SYMBOL_SMALL_TLSIE
:
7046 asm_fprintf (asm_out_file
, ":gottprel_lo12:");
7049 case SYMBOL_TLSLE12
:
7050 asm_fprintf (asm_out_file
, ":tprel_lo12:");
7053 case SYMBOL_TLSLE24
:
7054 asm_fprintf (asm_out_file
, ":tprel_lo12_nc:");
7057 case SYMBOL_TINY_GOT
:
7058 asm_fprintf (asm_out_file
, ":got:");
7061 case SYMBOL_TINY_TLSIE
:
7062 asm_fprintf (asm_out_file
, ":gottprel:");
7068 output_addr_const (asm_out_file
, x
);
7072 switch (aarch64_classify_symbolic_expression (x
))
7074 case SYMBOL_TLSLE24
:
7075 asm_fprintf (asm_out_file
, ":tprel_hi12:");
7080 output_addr_const (asm_out_file
, x
);
7085 HOST_WIDE_INT cond_code
;
7087 if (!CONST_INT_P (x
))
7089 output_operand_lossage ("invalid operand for '%%%c'", code
);
7093 cond_code
= INTVAL (x
);
7094 gcc_assert (cond_code
>= 0 && cond_code
<= AARCH64_NV
);
7095 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
]);
7102 machine_mode mode
= GET_MODE (x
);
7104 if (GET_CODE (x
) != MEM
7105 || (code
== 'y' && maybe_ne (GET_MODE_SIZE (mode
), 16)))
7107 output_operand_lossage ("invalid operand for '%%%c'", code
);
7112 /* LDP/STP which uses a single double-width memory operand.
7113 Adjust the mode to appear like a typical LDP/STP.
7114 Currently this is supported for 16-byte accesses only. */
7117 if (!aarch64_print_ldpstp_address (f
, mode
, XEXP (x
, 0)))
7118 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
7123 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
7128 /* Print address 'x' of a memory access with mode 'mode'.
7129 'op' is the context required by aarch64_classify_address. It can either be
7130 MEM for a normal memory access or PARALLEL for LDP/STP. */
7132 aarch64_print_address_internal (FILE *f
, machine_mode mode
, rtx x
,
7133 aarch64_addr_query_type type
)
7135 struct aarch64_address_info addr
;
7138 /* Check all addresses are Pmode - including ILP32. */
7139 if (GET_MODE (x
) != Pmode
)
7140 output_operand_lossage ("invalid address mode");
7142 if (aarch64_classify_address (&addr
, x
, mode
, true, type
))
7145 case ADDRESS_REG_IMM
:
7146 if (known_eq (addr
.const_offset
, 0))
7147 asm_fprintf (f
, "[%s]", reg_names
[REGNO (addr
.base
)]);
7148 else if (aarch64_sve_data_mode_p (mode
))
7151 = exact_div (addr
.const_offset
,
7152 BYTES_PER_SVE_VECTOR
).to_constant ();
7153 asm_fprintf (f
, "[%s, #%wd, mul vl]",
7154 reg_names
[REGNO (addr
.base
)], vnum
);
7156 else if (aarch64_sve_pred_mode_p (mode
))
7159 = exact_div (addr
.const_offset
,
7160 BYTES_PER_SVE_PRED
).to_constant ();
7161 asm_fprintf (f
, "[%s, #%wd, mul vl]",
7162 reg_names
[REGNO (addr
.base
)], vnum
);
7165 asm_fprintf (f
, "[%s, %wd]", reg_names
[REGNO (addr
.base
)],
7166 INTVAL (addr
.offset
));
7169 case ADDRESS_REG_REG
:
7170 if (addr
.shift
== 0)
7171 asm_fprintf (f
, "[%s, %s]", reg_names
[REGNO (addr
.base
)],
7172 reg_names
[REGNO (addr
.offset
)]);
7174 asm_fprintf (f
, "[%s, %s, lsl %u]", reg_names
[REGNO (addr
.base
)],
7175 reg_names
[REGNO (addr
.offset
)], addr
.shift
);
7178 case ADDRESS_REG_UXTW
:
7179 if (addr
.shift
== 0)
7180 asm_fprintf (f
, "[%s, w%d, uxtw]", reg_names
[REGNO (addr
.base
)],
7181 REGNO (addr
.offset
) - R0_REGNUM
);
7183 asm_fprintf (f
, "[%s, w%d, uxtw %u]", reg_names
[REGNO (addr
.base
)],
7184 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
7187 case ADDRESS_REG_SXTW
:
7188 if (addr
.shift
== 0)
7189 asm_fprintf (f
, "[%s, w%d, sxtw]", reg_names
[REGNO (addr
.base
)],
7190 REGNO (addr
.offset
) - R0_REGNUM
);
7192 asm_fprintf (f
, "[%s, w%d, sxtw %u]", reg_names
[REGNO (addr
.base
)],
7193 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
7196 case ADDRESS_REG_WB
:
7197 /* Writeback is only supported for fixed-width modes. */
7198 size
= GET_MODE_SIZE (mode
).to_constant ();
7199 switch (GET_CODE (x
))
7202 asm_fprintf (f
, "[%s, %d]!", reg_names
[REGNO (addr
.base
)], size
);
7205 asm_fprintf (f
, "[%s], %d", reg_names
[REGNO (addr
.base
)], size
);
7208 asm_fprintf (f
, "[%s, -%d]!", reg_names
[REGNO (addr
.base
)], size
);
7211 asm_fprintf (f
, "[%s], -%d", reg_names
[REGNO (addr
.base
)], size
);
7214 asm_fprintf (f
, "[%s, %wd]!", reg_names
[REGNO (addr
.base
)],
7215 INTVAL (addr
.offset
));
7218 asm_fprintf (f
, "[%s], %wd", reg_names
[REGNO (addr
.base
)],
7219 INTVAL (addr
.offset
));
7226 case ADDRESS_LO_SUM
:
7227 asm_fprintf (f
, "[%s, #:lo12:", reg_names
[REGNO (addr
.base
)]);
7228 output_addr_const (f
, addr
.offset
);
7229 asm_fprintf (f
, "]");
7232 case ADDRESS_SYMBOLIC
:
7233 output_addr_const (f
, x
);
7240 /* Print address 'x' of a LDP/STP with mode 'mode'. */
7242 aarch64_print_ldpstp_address (FILE *f
, machine_mode mode
, rtx x
)
7244 return aarch64_print_address_internal (f
, mode
, x
, ADDR_QUERY_LDP_STP
);
7247 /* Print address 'x' of a memory access with mode 'mode'. */
7249 aarch64_print_operand_address (FILE *f
, machine_mode mode
, rtx x
)
7251 if (!aarch64_print_address_internal (f
, mode
, x
, ADDR_QUERY_ANY
))
7252 output_addr_const (f
, x
);
bool
aarch64_label_mentioned_p (rtx x)
{
  const char *fmt;
  int i;

  if (GET_CODE (x) == LABEL_REF)
    return true;

  /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
     referencing instruction, but they are constant offsets, not
     symbols.  */
  if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
    return false;

  fmt = GET_RTX_FORMAT (GET_CODE (x));
  for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
    {
      if (fmt[i] == 'E')
	{
	  int j;

	  for (j = XVECLEN (x, i) - 1; j >= 0; j--)
	    if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
	      return true;
	}
      else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
	return true;
    }

  return false;
}

/* Implement REGNO_REG_CLASS.  */

enum reg_class
aarch64_regno_regclass (unsigned regno)
{
  if (GP_REGNUM_P (regno))
    return GENERAL_REGS;

  if (regno == SP_REGNUM)
    return STACK_REG;

  if (regno == FRAME_POINTER_REGNUM
      || regno == ARG_POINTER_REGNUM)
    return POINTER_REGS;

  if (FP_REGNUM_P (regno))
    return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;

  if (PR_REGNUM_P (regno))
    return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;

  return NO_REGS;
}
/* OFFSET is an address offset for mode MODE, which has SIZE bytes.
   If OFFSET is out of range, return an offset of an anchor point
   that is in range.  Return 0 otherwise.  */

static HOST_WIDE_INT
aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
		       machine_mode mode)
{
  /* Does it look like we'll need a 16-byte load/store-pair operation?  */
  if (size > 16)
    return (offset + 0x400) & ~0x7f0;

  /* For offsets that aren't a multiple of the access size, the limit is
     -256...255.  */
  if (offset & (size - 1))
    {
      /* BLKmode typically uses LDP of X-registers.  */
      if (mode == BLKmode)
	return (offset + 512) & ~0x3ff;
      return (offset + 0x100) & ~0x1ff;
    }

  /* Small negative offsets are supported.  */
  if (IN_RANGE (offset, -256, 0))
    return 0;

  if (mode == TImode || mode == TFmode)
    return (offset + 0x100) & ~0x1ff;

  /* Use 12-bit offset by access size.  */
  return offset & (~0xfff * size);
}
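
/* For instance (illustrative numbers), an aligned SImode access at offset
   0x12344 is given the anchor 0x10000 (offset & (~0xfff * 4)); the
   remaining 0x2344 is reachable with a scaled unsigned 12-bit LDR/STR
   offset.  A misaligned access at a nearby offset would instead be
   anchored so that the residual falls in the signed 9-bit range.  */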
static rtx
aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
{
  /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
     where mask is selected by alignment and size of the offset.
     We try to pick as large a range for the offset as possible to
     maximize the chance of a CSE.  However, for aligned addresses
     we limit the range to 4k so that structures with different sized
     elements are likely to use the same base.  We need to be careful
     not to split a CONST for some forms of address expression, otherwise
     it will generate sub-optimal code.  */

  if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
    {
      rtx base = XEXP (x, 0);
      rtx offset_rtx = XEXP (x, 1);
      HOST_WIDE_INT offset = INTVAL (offset_rtx);

      if (GET_CODE (base) == PLUS)
	{
	  rtx op0 = XEXP (base, 0);
	  rtx op1 = XEXP (base, 1);

	  /* Force any scaling into a temp for CSE.  */
	  op0 = force_reg (Pmode, op0);
	  op1 = force_reg (Pmode, op1);

	  /* Let the pointer register be in op0.  */
	  if (REG_POINTER (op1))
	    std::swap (op0, op1);

	  /* If the pointer is virtual or frame related, then we know that
	     virtual register instantiation or register elimination is going
	     to apply a second constant.  We want the two constants folded
	     together easily.  Therefore, emit as (OP0 + CONST) + OP1.  */
	  if (virt_or_elim_regno_p (REGNO (op0)))
	    {
	      base = expand_binop (Pmode, add_optab, op0, offset_rtx,
				   NULL_RTX, true, OPTAB_DIRECT);
	      return gen_rtx_PLUS (Pmode, base, op1);
	    }

	  /* Otherwise, in order to encourage CSE (and thence loop strength
	     reduce) scaled addresses, emit as (OP0 + OP1) + CONST.  */
	  base = expand_binop (Pmode, add_optab, op0, op1,
			       NULL_RTX, true, OPTAB_DIRECT);
	  x = gen_rtx_PLUS (Pmode, base, offset_rtx);
	}

      HOST_WIDE_INT size;
      if (GET_MODE_SIZE (mode).is_constant (&size))
	{
	  HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
							     mode);
	  if (base_offset != 0)
	    {
	      base = plus_constant (Pmode, base, base_offset);
	      base = force_operand (base, NULL_RTX);
	      return plus_constant (Pmode, base, offset - base_offset);
	    }
	}
    }

  return x;
}
/* Return the reload icode required for a constant pool in mode.  */
static enum insn_code
aarch64_constant_pool_reload_icode (machine_mode mode)
{
  switch (mode)
    {
    case E_SFmode:
      return CODE_FOR_aarch64_reload_movcpsfdi;

    case E_DFmode:
      return CODE_FOR_aarch64_reload_movcpdfdi;

    case E_TFmode:
      return CODE_FOR_aarch64_reload_movcptfdi;

    case E_V8QImode:
      return CODE_FOR_aarch64_reload_movcpv8qidi;

    case E_V16QImode:
      return CODE_FOR_aarch64_reload_movcpv16qidi;

    case E_V4HImode:
      return CODE_FOR_aarch64_reload_movcpv4hidi;

    case E_V8HImode:
      return CODE_FOR_aarch64_reload_movcpv8hidi;

    case E_V2SImode:
      return CODE_FOR_aarch64_reload_movcpv2sidi;

    case E_V4SImode:
      return CODE_FOR_aarch64_reload_movcpv4sidi;

    case E_V2DImode:
      return CODE_FOR_aarch64_reload_movcpv2didi;

    case E_V2DFmode:
      return CODE_FOR_aarch64_reload_movcpv2dfdi;

    default:
      gcc_unreachable ();
    }
}
static reg_class_t
aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
			  reg_class_t rclass,
			  machine_mode mode,
			  secondary_reload_info *sri)
{
  /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
     directly by the *aarch64_sve_mov<mode>_be move pattern.  See the
     comment at the head of aarch64-sve.md for more details about the
     big-endian handling.  */
  if (BYTES_BIG_ENDIAN
      && reg_class_subset_p (rclass, FP_REGS)
      && !((REG_P (x) && HARD_REGISTER_P (x))
	   || aarch64_simd_valid_immediate (x, NULL))
      && aarch64_sve_data_mode_p (mode))
    {
      sri->icode = CODE_FOR_aarch64_sve_reload_be;
      return NO_REGS;
    }

  /* If we have to disable direct literal pool loads and stores because the
     function is too big, then we need a scratch register.  */
  if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
      && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
	  || targetm.vector_mode_supported_p (GET_MODE (x)))
      && !aarch64_pcrelative_literal_loads)
    {
      sri->icode = aarch64_constant_pool_reload_icode (mode);
      return NO_REGS;
    }

  /* Without the TARGET_SIMD instructions we cannot move a Q register
     to a Q register directly.  We need a scratch.  */
  if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
      && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
      && reg_class_subset_p (rclass, FP_REGS))
    {
      if (mode == TFmode)
	sri->icode = CODE_FOR_aarch64_reload_movtf;
      else if (mode == TImode)
	sri->icode = CODE_FOR_aarch64_reload_movti;
      return NO_REGS;
    }

  /* A TFmode or TImode memory access should be handled via an FP_REGS
     because AArch64 has richer addressing modes for LDR/STR instructions
     than LDP/STP instructions.  */
  if (TARGET_FLOAT && rclass == GENERAL_REGS
      && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
    return FP_REGS;

  if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P (x))
    return GENERAL_REGS;

  return NO_REGS;
}
static bool
aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
{
  gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);

  /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
     can only eliminate to HARD_FRAME_POINTER_REGNUM.  */
  if (frame_pointer_needed)
    return to == HARD_FRAME_POINTER_REGNUM;
  return true;
}

poly_int64
aarch64_initial_elimination_offset (unsigned from, unsigned to)
{
  aarch64_layout_frame ();

  if (to == HARD_FRAME_POINTER_REGNUM)
    {
      if (from == ARG_POINTER_REGNUM)
	return cfun->machine->frame.hard_fp_offset;

      if (from == FRAME_POINTER_REGNUM)
	return cfun->machine->frame.hard_fp_offset
	       - cfun->machine->frame.locals_offset;
    }

  if (to == STACK_POINTER_REGNUM)
    {
      if (from == FRAME_POINTER_REGNUM)
	return cfun->machine->frame.frame_size
	       - cfun->machine->frame.locals_offset;
    }

  return cfun->machine->frame.frame_size;
}
/* Implement RETURN_ADDR_RTX.  We do not support moving back to a
   previous frame.  */

rtx
aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
{
  if (count != 0)
    return const0_rtx;
  return get_hard_reg_initial_val (Pmode, LR_REGNUM);
}

static void
aarch64_asm_trampoline_template (FILE *f)
{
  if (TARGET_ILP32)
    {
      asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
      asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
    }
  else
    {
      asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
      asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
    }
  asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
  assemble_aligned_integer (4, const0_rtx);
  assemble_aligned_integer (POINTER_BYTES, const0_rtx);
  assemble_aligned_integer (POINTER_BYTES, const0_rtx);
}
static void
aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
{
  rtx fnaddr, mem, a_tramp;
  const int tramp_code_sz = 16;

  /* Don't need to copy the trailing D-words, we fill those in below.  */
  emit_block_move (m_tramp, assemble_trampoline_template (),
		   GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
  mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
  fnaddr = XEXP (DECL_RTL (fndecl), 0);
  if (GET_MODE (fnaddr) != ptr_mode)
    fnaddr = convert_memory_address (ptr_mode, fnaddr);
  emit_move_insn (mem, fnaddr);

  mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
  emit_move_insn (mem, chain_value);

  /* XXX We should really define a "clear_cache" pattern and use
     gen_clear_cache().  */
  a_tramp = XEXP (m_tramp, 0);
  emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
		     LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
		     plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
		     ptr_mode);
}
static unsigned char
aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
{
  /* ??? Logically we should only need to provide a value when
     HARD_REGNO_MODE_OK says that at least one register in REGCLASS
     can hold MODE, but at the moment we need to handle all modes.
     Just ignore any runtime parts for registers that can't store them.  */
  HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
  unsigned int nregs;
  switch (regclass)
    {
    case TAILCALL_ADDR_REGS:
    case POINTER_REGS:
    case GENERAL_REGS:
    case ALL_REGS:
    case POINTER_AND_FP_REGS:
    case FP_REGS:
    case FP_LO_REGS:
      if (aarch64_sve_data_mode_p (mode)
	  && constant_multiple_p (GET_MODE_SIZE (mode),
				  BYTES_PER_SVE_VECTOR, &nregs))
	return nregs;
      return (aarch64_vector_data_mode_p (mode)
	      ? CEIL (lowest_size, UNITS_PER_VREG)
	      : CEIL (lowest_size, UNITS_PER_WORD));

    case STACK_REG:
    case PR_REGS:
    case PR_LO_REGS:
    case PR_HI_REGS:
      return 1;

    case NO_REGS:
      return 0;

    default:
      break;
    }
  gcc_unreachable ();
}
static reg_class_t
aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
{
  if (regclass == POINTER_REGS)
    return GENERAL_REGS;

  if (regclass == STACK_REG)
    {
      if (REG_P (x)
	  && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
	return regclass;

      return NO_REGS;
    }

  /* Register elimination can result in a request for
     SP+constant->FP_REGS.  We cannot support such operations which
     use SP as source and an FP_REG as destination, so reject out
     right now.  */
  if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
    {
      rtx lhs = XEXP (x, 0);

      /* Look through a possible SUBREG introduced by ILP32.  */
      if (GET_CODE (lhs) == SUBREG)
	lhs = SUBREG_REG (lhs);

      gcc_assert (REG_P (lhs));
      gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
				      POINTER_REGS));
      return NO_REGS;
    }

  return regclass;
}
7685 aarch64_asm_output_labelref (FILE* f
, const char *name
)
7687 asm_fprintf (f
, "%U%s", name
);
7691 aarch64_elf_asm_constructor (rtx symbol
, int priority
)
7693 if (priority
== DEFAULT_INIT_PRIORITY
)
7694 default_ctor_section_asm_out_constructor (symbol
, priority
);
7698 /* While priority is known to be in range [0, 65535], so 18 bytes
7699 would be enough, the compiler might not know that. To avoid
7700 -Wformat-truncation false positive, use a larger size. */
7702 snprintf (buf
, sizeof (buf
), ".init_array.%.5u", priority
);
7703 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
7704 switch_to_section (s
);
7705 assemble_align (POINTER_SIZE
);
7706 assemble_aligned_integer (POINTER_BYTES
, symbol
);
7711 aarch64_elf_asm_destructor (rtx symbol
, int priority
)
7713 if (priority
== DEFAULT_INIT_PRIORITY
)
7714 default_dtor_section_asm_out_destructor (symbol
, priority
);
7718 /* While priority is known to be in range [0, 65535], so 18 bytes
7719 would be enough, the compiler might not know that. To avoid
7720 -Wformat-truncation false positive, use a larger size. */
7722 snprintf (buf
, sizeof (buf
), ".fini_array.%.5u", priority
);
7723 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
7724 switch_to_section (s
);
7725 assemble_align (POINTER_SIZE
);
7726 assemble_aligned_integer (POINTER_BYTES
, symbol
);
7731 aarch64_output_casesi (rtx
*operands
)
7735 rtx diff_vec
= PATTERN (NEXT_INSN (as_a
<rtx_insn
*> (operands
[2])));
7737 static const char *const patterns
[4][2] =
7740 "ldrb\t%w3, [%0,%w1,uxtw]",
7741 "add\t%3, %4, %w3, sxtb #2"
7744 "ldrh\t%w3, [%0,%w1,uxtw #1]",
7745 "add\t%3, %4, %w3, sxth #2"
7748 "ldr\t%w3, [%0,%w1,uxtw #2]",
7749 "add\t%3, %4, %w3, sxtw #2"
7751 /* We assume that DImode is only generated when not optimizing and
7752 that we don't really need 64-bit address offsets. That would
7753 imply an object file with 8GB of code in a single function! */
7755 "ldr\t%w3, [%0,%w1,uxtw #2]",
7756 "add\t%3, %4, %w3, sxtw #2"
7760 gcc_assert (GET_CODE (diff_vec
) == ADDR_DIFF_VEC
);
7762 scalar_int_mode mode
= as_a
<scalar_int_mode
> (GET_MODE (diff_vec
));
7763 index
= exact_log2 (GET_MODE_SIZE (mode
));
7765 gcc_assert (index
>= 0 && index
<= 3);
7767 /* Need to implement table size reduction, by chaning the code below. */
7768 output_asm_insn (patterns
[index
][0], operands
);
7769 ASM_GENERATE_INTERNAL_LABEL (label
, "Lrtx", CODE_LABEL_NUMBER (operands
[2]));
7770 snprintf (buf
, sizeof (buf
),
7771 "adr\t%%4, %s", targetm
.strip_name_encoding (label
));
7772 output_asm_insn (buf
, operands
);
7773 output_asm_insn (patterns
[index
][1], operands
);
7774 output_asm_insn ("br\t%3", operands
);
7775 assemble_label (asm_out_file
, label
);
/* Return size in bits of an arithmetic operand which is shifted/scaled and
   masked such that it is suitable for a UXTB, UXTH, or UXTW extend
   operator.  */

int
aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
{
  if (shift >= 0 && shift <= 3)
    {
      int size;
      for (size = 8; size <= 32; size *= 2)
	{
	  HOST_WIDE_INT bits = ((HOST_WIDE_INT) 1U << size) - 1;
	  if (mask == bits << shift)
	    return size;
	}
    }
  return 0;
}
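
/* For instance, aarch64_uxt_size (1, 0x1fe) is 8: the mask 0x1fe is 0xff
   shifted left by one, so the operand can use a UXTB extend.  */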
/* Constant pools are per function only when PC relative
   literal loads are true or we are in the large memory
   model.  */

static inline bool
aarch64_can_use_per_function_literal_pools_p (void)
{
  return (aarch64_pcrelative_literal_loads
	  || aarch64_cmodel == AARCH64_CMODEL_LARGE);
}
static bool
aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
{
  /* We can't use blocks for constants when we're using a per-function
     constant pool.  */
  return !aarch64_can_use_per_function_literal_pools_p ();
}

/* Select appropriate section for constants depending
   on where we place literal pools.  */

static section *
aarch64_select_rtx_section (machine_mode mode,
			    rtx x,
			    unsigned HOST_WIDE_INT align)
{
  if (aarch64_can_use_per_function_literal_pools_p ())
    return function_section (current_function_decl);

  return default_elf_select_rtx_section (mode, x, align);
}

/* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
void
aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
				  HOST_WIDE_INT offset)
{
  /* When using per-function literal pools, we must ensure that any code
     section is aligned to the minimal instruction length, lest we get
     errors from the assembler re "unaligned instructions".  */
  if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
    ASM_OUTPUT_ALIGN (f, 2);
}
/* Helper function for rtx cost calculation.  Strip a shift expression
   from X.  Returns the inner operand if successful, or the original
   expression on failure.  */
static rtx
aarch64_strip_shift (rtx x)
{
  rtx op = x;

  /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
     we can convert both to ROR during final output.  */
  if ((GET_CODE (op) == ASHIFT
       || GET_CODE (op) == ASHIFTRT
       || GET_CODE (op) == LSHIFTRT
       || GET_CODE (op) == ROTATERT
       || GET_CODE (op) == ROTATE)
      && CONST_INT_P (XEXP (op, 1)))
    return XEXP (op, 0);

  if (GET_CODE (op) == MULT
      && CONST_INT_P (XEXP (op, 1))
      && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
    return XEXP (op, 0);

  return x;
}

/* Helper function for rtx cost calculation.  Strip an extend
   expression from X.  Returns the inner operand if successful, or the
   original expression on failure.  We deal with a number of possible
   canonicalization variations here.  If STRIP_SHIFT is true, then
   we can strip off a shift also.  */
static rtx
aarch64_strip_extend (rtx x, bool strip_shift)
{
  scalar_int_mode mode;
  rtx op = x;

  if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
    return op;

  /* Zero and sign extraction of a widened value.  */
  if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
      && XEXP (op, 2) == const0_rtx
      && GET_CODE (XEXP (op, 0)) == MULT
      && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
					 XEXP (op, 1)))
    return XEXP (XEXP (op, 0), 0);

  /* It can also be represented (for zero-extend) as an AND with an
     immediate.  */
  if (GET_CODE (op) == AND
      && GET_CODE (XEXP (op, 0)) == MULT
      && CONST_INT_P (XEXP (XEXP (op, 0), 1))
      && CONST_INT_P (XEXP (op, 1))
      && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
			   INTVAL (XEXP (op, 1))) != 0)
    return XEXP (XEXP (op, 0), 0);

  /* Now handle extended register, as this may also have an optional
     left shift by 1..4.  */
  if (strip_shift
      && GET_CODE (op) == ASHIFT
      && CONST_INT_P (XEXP (op, 1))
      && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
    op = XEXP (op, 0);

  if (GET_CODE (op) == ZERO_EXTEND
      || GET_CODE (op) == SIGN_EXTEND)
    op = XEXP (op, 0);

  if (op != x)
    return op;

  return x;
}
/* Return true iff CODE is a shift supported in combination
   with arithmetic instructions.  */
static bool
aarch64_shift_p (enum rtx_code code)
{
  return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
}


/* Return true iff X is a cheap shift without a sign extend.  */

static bool
aarch64_cheap_mult_shift_p (rtx x)
{
  rtx op0, op1;

  op0 = XEXP (x, 0);
  op1 = XEXP (x, 1);

  if (!(aarch64_tune_params.extra_tuning_flags
	& AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
    return false;

  if (GET_CODE (op0) == SIGN_EXTEND)
    return false;

  if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
      && UINTVAL (op1) <= 4)
    return true;

  if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
    return false;

  HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));

  if (l2 > 0 && l2 <= 4)
    return true;

  return false;
}
7965 /* Helper function for rtx cost calculation. Calculate the cost of
7966 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7967 Return the calculated cost of the expression, recursing manually in to
7968 operands where needed. */
7971 aarch64_rtx_mult_cost (rtx x
, enum rtx_code code
, int outer
, bool speed
)
7974 const struct cpu_cost_table
*extra_cost
7975 = aarch64_tune_params
.insn_extra_cost
;
7977 bool compound_p
= (outer
== PLUS
|| outer
== MINUS
);
7978 machine_mode mode
= GET_MODE (x
);
7980 gcc_checking_assert (code
== MULT
);
7985 if (VECTOR_MODE_P (mode
))
7986 mode
= GET_MODE_INNER (mode
);
7988 /* Integer multiply/fma. */
7989 if (GET_MODE_CLASS (mode
) == MODE_INT
)
7991 /* The multiply will be canonicalized as a shift, cost it as such. */
7992 if (aarch64_shift_p (GET_CODE (x
))
7993 || (CONST_INT_P (op1
)
7994 && exact_log2 (INTVAL (op1
)) > 0))
7996 bool is_extend
= GET_CODE (op0
) == ZERO_EXTEND
7997 || GET_CODE (op0
) == SIGN_EXTEND
;
8002 /* If the shift is considered cheap,
8003 then don't add any cost. */
8004 if (aarch64_cheap_mult_shift_p (x
))
8006 else if (REG_P (op1
))
8007 /* ARITH + shift-by-register. */
8008 cost
+= extra_cost
->alu
.arith_shift_reg
;
8010 /* ARITH + extended register. We don't have a cost field
8011 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
8012 cost
+= extra_cost
->alu
.extend_arith
;
8014 /* ARITH + shift-by-immediate. */
8015 cost
+= extra_cost
->alu
.arith_shift
;
8018 /* LSL (immediate). */
8019 cost
+= extra_cost
->alu
.shift
;
8022 /* Strip extends as we will have costed them in the case above. */
8024 op0
= aarch64_strip_extend (op0
, true);
8026 cost
+= rtx_cost (op0
, VOIDmode
, code
, 0, speed
);
8031 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
8032 compound and let the below cases handle it. After all, MNEG is a
8033 special-case alias of MSUB. */
8034 if (GET_CODE (op0
) == NEG
)
8036 op0
= XEXP (op0
, 0);
8040 /* Integer multiplies or FMAs have zero/sign extending variants. */
8041 if ((GET_CODE (op0
) == ZERO_EXTEND
8042 && GET_CODE (op1
) == ZERO_EXTEND
)
8043 || (GET_CODE (op0
) == SIGN_EXTEND
8044 && GET_CODE (op1
) == SIGN_EXTEND
))
8046 cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, MULT
, 0, speed
);
8047 cost
+= rtx_cost (XEXP (op1
, 0), VOIDmode
, MULT
, 1, speed
);
8052 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
8053 cost
+= extra_cost
->mult
[0].extend_add
;
8055 /* MUL/SMULL/UMULL. */
8056 cost
+= extra_cost
->mult
[0].extend
;
8062 /* This is either an integer multiply or a MADD. In both cases
8063 we want to recurse and cost the operands. */
8064 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
8065 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
8071 cost
+= extra_cost
->mult
[mode
== DImode
].add
;
8074 cost
+= extra_cost
->mult
[mode
== DImode
].simple
;
8083 /* Floating-point FMA/FMUL can also support negations of the
8084 operands, unless the rounding mode is upward or downward in
8085 which case FNMUL is different than FMUL with operand negation. */
8086 bool neg0
= GET_CODE (op0
) == NEG
;
8087 bool neg1
= GET_CODE (op1
) == NEG
;
8088 if (compound_p
|| !flag_rounding_math
|| (neg0
&& neg1
))
8091 op0
= XEXP (op0
, 0);
8093 op1
= XEXP (op1
, 0);
8097 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8098 cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
8101 cost
+= extra_cost
->fp
[mode
== DFmode
].mult
;
8104 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
8105 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
8111 aarch64_address_cost (rtx x
,
8113 addr_space_t as ATTRIBUTE_UNUSED
,
8116 enum rtx_code c
= GET_CODE (x
);
8117 const struct cpu_addrcost_table
*addr_cost
= aarch64_tune_params
.addr_cost
;
8118 struct aarch64_address_info info
;
8122 if (!aarch64_classify_address (&info
, x
, mode
, false))
8124 if (GET_CODE (x
) == CONST
|| GET_CODE (x
) == SYMBOL_REF
)
8126 /* This is a CONST or SYMBOL ref which will be split
8127 in a different way depending on the code model in use.
8128 Cost it through the generic infrastructure. */
8129 int cost_symbol_ref
= rtx_cost (x
, Pmode
, MEM
, 1, speed
);
8130 /* Divide through by the cost of one instruction to
8131 bring it to the same units as the address costs. */
8132 cost_symbol_ref
/= COSTS_N_INSNS (1);
8133 /* The cost is then the cost of preparing the address,
8134 followed by an immediate (possibly 0) offset. */
8135 return cost_symbol_ref
+ addr_cost
->imm_offset
;
8139 /* This is most likely a jump table from a case
8141 return addr_cost
->register_offset
;
8147 case ADDRESS_LO_SUM
:
8148 case ADDRESS_SYMBOLIC
:
8149 case ADDRESS_REG_IMM
:
8150 cost
+= addr_cost
->imm_offset
;
8153 case ADDRESS_REG_WB
:
8154 if (c
== PRE_INC
|| c
== PRE_DEC
|| c
== PRE_MODIFY
)
8155 cost
+= addr_cost
->pre_modify
;
8156 else if (c
== POST_INC
|| c
== POST_DEC
|| c
== POST_MODIFY
)
8157 cost
+= addr_cost
->post_modify
;
8163 case ADDRESS_REG_REG
:
8164 cost
+= addr_cost
->register_offset
;
8167 case ADDRESS_REG_SXTW
:
8168 cost
+= addr_cost
->register_sextend
;
8171 case ADDRESS_REG_UXTW
:
8172 cost
+= addr_cost
->register_zextend
;
8182 /* For the sake of calculating the cost of the shifted register
8183 component, we can treat same sized modes in the same way. */
8184 if (known_eq (GET_MODE_BITSIZE (mode
), 16))
8185 cost
+= addr_cost
->addr_scale_costs
.hi
;
8186 else if (known_eq (GET_MODE_BITSIZE (mode
), 32))
8187 cost
+= addr_cost
->addr_scale_costs
.si
;
8188 else if (known_eq (GET_MODE_BITSIZE (mode
), 64))
8189 cost
+= addr_cost
->addr_scale_costs
.di
;
8191 /* We can't tell, or this is a 128-bit vector. */
8192 cost
+= addr_cost
->addr_scale_costs
.ti
;
/* Return the cost of a branch.  If SPEED_P is true then the compiler is
   optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
   to be taken.  */

int
aarch64_branch_cost (bool speed_p, bool predictable_p)
{
  /* When optimizing for speed, use the cost of unpredictable branches.  */
  const struct cpu_branch_cost *branch_costs =
    aarch64_tune_params.branch_costs;

  if (!speed_p || predictable_p)
    return branch_costs->predictable;
  else
    return branch_costs->unpredictable;
}
/* Return true if the RTX X in mode MODE is a zero or sign extract
   usable in an ADD or SUB (extended register) instruction.  */
static bool
aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
{
  /* Catch add with a sign extract.
     This is add_<optab><mode>_multp2.  */
  if (GET_CODE (x) == SIGN_EXTRACT
      || GET_CODE (x) == ZERO_EXTRACT)
    {
      rtx op0 = XEXP (x, 0);
      rtx op1 = XEXP (x, 1);
      rtx op2 = XEXP (x, 2);

      if (GET_CODE (op0) == MULT
	  && CONST_INT_P (op1)
	  && op2 == const0_rtx
	  && CONST_INT_P (XEXP (op0, 1))
	  && aarch64_is_extend_from_extract (mode,
					     XEXP (op0, 1),
					     op1))
	return true;
    }
  /* The simple case <ARITH>, XD, XN, XM, [us]xt.
     No shift.  */
  else if (GET_CODE (x) == SIGN_EXTEND
	   || GET_CODE (x) == ZERO_EXTEND)
    return REG_P (XEXP (x, 0));

  return false;
}
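
/* For instance, in (plus:DI (sign_extend:DI (reg:SI w1)) (reg:DI x0)) the
   sign_extend operand satisfies this test, and the addition can be emitted
   as an extended-register form such as add x0, x0, w1, sxtw.  */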
static bool
aarch64_frint_unspec_p (unsigned int u)
{
  switch (u)
    {
      case UNSPEC_FRINTZ:
      case UNSPEC_FRINTP:
      case UNSPEC_FRINTM:
      case UNSPEC_FRINTA:
      case UNSPEC_FRINTN:
      case UNSPEC_FRINTX:
      case UNSPEC_FRINTI:
	return true;

      default:
	return false;
    }
}

/* Return true iff X is an rtx that will match an extr instruction
   i.e. as described in the *extr<mode>5_insn family of patterns.
   OP0 and OP1 will be set to the operands of the shifts involved
   on success and will be NULL_RTX otherwise.  */

static bool
aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
{
  rtx op0, op1;
  scalar_int_mode mode;
  if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
    return false;

  *res_op0 = NULL_RTX;
  *res_op1 = NULL_RTX;

  if (GET_CODE (x) != IOR)
    return false;

  op0 = XEXP (x, 0);
  op1 = XEXP (x, 1);

  if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
      || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
    {
      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
      if (GET_CODE (op1) == ASHIFT)
	std::swap (op0, op1);

      if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
	return false;

      unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
      unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));

      if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
	  && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
	{
	  *res_op0 = XEXP (op0, 0);
	  *res_op1 = XEXP (op1, 0);
	  return true;
	}
    }

  return false;
}
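
/* For instance, (ior:DI (ashift:DI (reg x1) (const_int 48))
		          (lshiftrt:DI (reg x2) (const_int 16)))
   satisfies this test because 48 + 16 == 64, and corresponds to
   extr x0, x1, x2, #16.  */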
8315 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8316 storing it in *COST. Result is true if the total cost of the operation
8317 has now been calculated. */
8319 aarch64_if_then_else_costs (rtx op0
, rtx op1
, rtx op2
, int *cost
, bool speed
)
8323 enum rtx_code cmpcode
;
8325 if (COMPARISON_P (op0
))
8327 inner
= XEXP (op0
, 0);
8328 comparator
= XEXP (op0
, 1);
8329 cmpcode
= GET_CODE (op0
);
8334 comparator
= const0_rtx
;
8338 if (GET_CODE (op1
) == PC
|| GET_CODE (op2
) == PC
)
8340 /* Conditional branch. */
8341 if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
8345 if (cmpcode
== NE
|| cmpcode
== EQ
)
8347 if (comparator
== const0_rtx
)
8349 /* TBZ/TBNZ/CBZ/CBNZ. */
8350 if (GET_CODE (inner
) == ZERO_EXTRACT
)
8352 *cost
+= rtx_cost (XEXP (inner
, 0), VOIDmode
,
8353 ZERO_EXTRACT
, 0, speed
);
8356 *cost
+= rtx_cost (inner
, VOIDmode
, cmpcode
, 0, speed
);
8361 else if (cmpcode
== LT
|| cmpcode
== GE
)
8364 if (comparator
== const0_rtx
)
8369 else if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
8372 if (GET_CODE (op1
) == COMPARE
)
8374 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
8375 if (XEXP (op1
, 1) == const0_rtx
)
8379 machine_mode mode
= GET_MODE (XEXP (op1
, 0));
8380 const struct cpu_cost_table
*extra_cost
8381 = aarch64_tune_params
.insn_extra_cost
;
8383 if (GET_MODE_CLASS (mode
) == MODE_INT
)
8384 *cost
+= extra_cost
->alu
.arith
;
8386 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
8391 /* It's a conditional operation based on the status flags,
8392 so it must be some flavor of CSEL. */
8394 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
8395 if (GET_CODE (op1
) == NEG
8396 || GET_CODE (op1
) == NOT
8397 || (GET_CODE (op1
) == PLUS
&& XEXP (op1
, 1) == const1_rtx
))
8398 op1
= XEXP (op1
, 0);
8399 else if (GET_CODE (op1
) == ZERO_EXTEND
&& GET_CODE (op2
) == ZERO_EXTEND
)
8401 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
8402 op1
= XEXP (op1
, 0);
8403 op2
= XEXP (op2
, 0);
8406 *cost
+= rtx_cost (op1
, VOIDmode
, IF_THEN_ELSE
, 1, speed
);
8407 *cost
+= rtx_cost (op2
, VOIDmode
, IF_THEN_ELSE
, 2, speed
);
8411 /* We don't know what this is, cost all operands. */
/* Check whether X is a bitfield operation of the form shift + extend that
   maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
   operand to which the bitfield operation is applied.  Otherwise return
   NULL_RTX.  */

static rtx
aarch64_extend_bitfield_pattern_p (rtx x)
{
  rtx_code outer_code = GET_CODE (x);
  machine_mode outer_mode = GET_MODE (x);

  if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
      && outer_mode != SImode && outer_mode != DImode)
    return NULL_RTX;

  rtx inner = XEXP (x, 0);
  rtx_code inner_code = GET_CODE (inner);
  machine_mode inner_mode = GET_MODE (inner);
  rtx op = NULL_RTX;

  switch (inner_code)
    {
      case ASHIFT:
	if (CONST_INT_P (XEXP (inner, 1))
	    && (inner_mode == QImode || inner_mode == HImode))
	  op = XEXP (inner, 0);
	break;
      case LSHIFTRT:
	if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
	    && (inner_mode == QImode || inner_mode == HImode))
	  op = XEXP (inner, 0);
	break;
      case ASHIFTRT:
	if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
	    && (inner_mode == QImode || inner_mode == HImode))
	  op = XEXP (inner, 0);
	break;
      default:
	break;
    }

  return op;
}

/* Return true if the mask and a shift amount from an RTX of the form
   (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
   mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */
bool
aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
				    rtx shft_amnt)
{
  return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
	 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
	 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
	 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
}
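
/* Worked example (added for illustration only): with SImode, mask == 0x1fe
   and shft_amnt == 1 the predicate above holds, since 1 < 32,
   (0x1fe >> 1) + 1 == 0x100 is a power of two and the low shft_amnt bits of
   the mask are zero, so (x << 1) & 0x1fe can be emitted as a single UBFIZ.  */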

/* Calculate the cost of calculating X, storing it in *COST.  Result
   is true if the total cost of the operation has now been calculated.  */
static bool
aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
		   int param ATTRIBUTE_UNUSED, int *cost, bool speed)
{
  rtx op0, op1, op2;
  const struct cpu_cost_table *extra_cost
    = aarch64_tune_params.insn_extra_cost;
  int code = GET_CODE (x);
  scalar_int_mode int_mode;

  /* By default, assume that everything has equivalent cost to the
     cheapest instruction.  Any additional costs are applied as a delta
     above this default.  */
  *cost = COSTS_N_INSNS (1);
8493 /* The cost depends entirely on the operands to SET. */
8498 switch (GET_CODE (op0
))
8503 rtx address
= XEXP (op0
, 0);
8504 if (VECTOR_MODE_P (mode
))
8505 *cost
+= extra_cost
->ldst
.storev
;
8506 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
8507 *cost
+= extra_cost
->ldst
.store
;
8508 else if (mode
== SFmode
)
8509 *cost
+= extra_cost
->ldst
.storef
;
8510 else if (mode
== DFmode
)
8511 *cost
+= extra_cost
->ldst
.stored
;
8514 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
8518 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
8522 if (! REG_P (SUBREG_REG (op0
)))
8523 *cost
+= rtx_cost (SUBREG_REG (op0
), VOIDmode
, SET
, 0, speed
);
8527 /* The cost is one per vector-register copied. */
8528 if (VECTOR_MODE_P (GET_MODE (op0
)) && REG_P (op1
))
8530 int nregs
= aarch64_hard_regno_nregs (V0_REGNUM
, GET_MODE (op0
));
8531 *cost
= COSTS_N_INSNS (nregs
);
8533 /* const0_rtx is in general free, but we will use an
8534 instruction to set a register to 0. */
8535 else if (REG_P (op1
) || op1
== const0_rtx
)
8537 /* The cost is 1 per register copied. */
8538 int nregs
= aarch64_hard_regno_nregs (R0_REGNUM
, GET_MODE (op0
));
8539 *cost
= COSTS_N_INSNS (nregs
);
8542 /* Cost is just the cost of the RHS of the set. */
8543 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
8548 /* Bit-field insertion. Strip any redundant widening of
8549 the RHS to meet the width of the target. */
8550 if (GET_CODE (op1
) == SUBREG
)
8551 op1
= SUBREG_REG (op1
);
8552 if ((GET_CODE (op1
) == ZERO_EXTEND
8553 || GET_CODE (op1
) == SIGN_EXTEND
)
8554 && CONST_INT_P (XEXP (op0
, 1))
8555 && is_a
<scalar_int_mode
> (GET_MODE (XEXP (op1
, 0)), &int_mode
)
8556 && GET_MODE_BITSIZE (int_mode
) >= INTVAL (XEXP (op0
, 1)))
8557 op1
= XEXP (op1
, 0);
8559 if (CONST_INT_P (op1
))
8561 /* MOV immediate is assumed to always be cheap. */
8562 *cost
= COSTS_N_INSNS (1);
8568 *cost
+= extra_cost
->alu
.bfi
;
8569 *cost
+= rtx_cost (op1
, VOIDmode
, (enum rtx_code
) code
, 1, speed
);
8575 /* We can't make sense of this, assume default cost. */
8576 *cost
= COSTS_N_INSNS (1);
8582 /* If an instruction can incorporate a constant within the
8583 instruction, the instruction's expression avoids calling
8584 rtx_cost() on the constant. If rtx_cost() is called on a
8585 constant, then it is usually because the constant must be
8586 moved into a register by one or more instructions.
8588 The exception is constant 0, which can be expressed
8589 as XZR/WZR and is therefore free. The exception to this is
8590 if we have (set (reg) (const0_rtx)) in which case we must cost
8591 the move. However, we can catch that when we cost the SET, so
8592 we don't need to consider that here. */
8593 if (x
== const0_rtx
)
8597 /* To an approximation, building any other constant is
8598 proportionally expensive to the number of instructions
8599 required to build that constant. This is true whether we
8600 are compiling for SPEED or otherwise. */
8601 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
8602 int_mode
= word_mode
;
8603 *cost
= COSTS_N_INSNS (aarch64_internal_mov_immediate
8604 (NULL_RTX
, x
, false, int_mode
));
8610 /* First determine number of instructions to do the move
8611 as an integer constant. */
8612 if (!aarch64_float_const_representable_p (x
)
8613 && !aarch64_can_const_movi_rtx_p (x
, mode
)
8614 && aarch64_float_const_rtx_p (x
))
8616 unsigned HOST_WIDE_INT ival
;
8617 bool succeed
= aarch64_reinterpret_float_as_int (x
, &ival
);
8618 gcc_assert (succeed
);
8620 scalar_int_mode imode
= (mode
== HFmode
8622 : int_mode_for_mode (mode
).require ());
8623 int ncost
= aarch64_internal_mov_immediate
8624 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
8625 *cost
+= COSTS_N_INSNS (ncost
);
8631 /* mov[df,sf]_aarch64. */
8632 if (aarch64_float_const_representable_p (x
))
8633 /* FMOV (scalar immediate). */
8634 *cost
+= extra_cost
->fp
[mode
== DFmode
].fpconst
;
8635 else if (!aarch64_float_const_zero_rtx_p (x
))
8637 /* This will be a load from memory. */
8639 *cost
+= extra_cost
->ldst
.loadd
;
8641 *cost
+= extra_cost
->ldst
.loadf
;
8644 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8645 or MOV v0.s[0], wzr - neither of which are modeled by the
8646 cost tables. Just use the default cost. */
8656 /* For loads we want the base cost of a load, plus an
8657 approximation for the additional cost of the addressing
8659 rtx address
= XEXP (x
, 0);
8660 if (VECTOR_MODE_P (mode
))
8661 *cost
+= extra_cost
->ldst
.loadv
;
8662 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
8663 *cost
+= extra_cost
->ldst
.load
;
8664 else if (mode
== SFmode
)
8665 *cost
+= extra_cost
->ldst
.loadf
;
8666 else if (mode
== DFmode
)
8667 *cost
+= extra_cost
->ldst
.loadd
;
8670 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
8679 if (VECTOR_MODE_P (mode
))
8684 *cost
+= extra_cost
->vect
.alu
;
8689 if (GET_MODE_CLASS (mode
) == MODE_INT
)
8691 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
8692 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
8695 *cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, NEG
, 0, speed
);
8699 /* Cost this as SUB wzr, X. */
8700 op0
= CONST0_RTX (mode
);
8705 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
8707 /* Support (neg(fma...)) as a single instruction only if
8708 sign of zeros is unimportant. This matches the decision
8709 making in aarch64.md. */
8710 if (GET_CODE (op0
) == FMA
&& !HONOR_SIGNED_ZEROS (GET_MODE (op0
)))
8713 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
8716 if (GET_CODE (op0
) == MULT
)
8719 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
8724 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
8734 if (VECTOR_MODE_P (mode
))
8735 *cost
+= extra_cost
->vect
.alu
;
8737 *cost
+= extra_cost
->alu
.clz
;
8746 if (op1
== const0_rtx
8747 && GET_CODE (op0
) == AND
)
8750 mode
= GET_MODE (op0
);
8754 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
)
8756 /* TODO: A write to the CC flags possibly costs extra, this
8757 needs encoding in the cost tables. */
8759 mode
= GET_MODE (op0
);
8761 if (GET_CODE (op0
) == AND
)
8767 if (GET_CODE (op0
) == PLUS
)
8769 /* ADDS (and CMN alias). */
8774 if (GET_CODE (op0
) == MINUS
)
8781 if (GET_CODE (op0
) == ZERO_EXTRACT
&& op1
== const0_rtx
8782 && GET_MODE (x
) == CC_NZmode
&& CONST_INT_P (XEXP (op0
, 1))
8783 && CONST_INT_P (XEXP (op0
, 2)))
8785 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8786 Handle it here directly rather than going to cost_logic
8787 since we know the immediate generated for the TST is valid
8788 so we can avoid creating an intermediate rtx for it only
8789 for costing purposes. */
8791 *cost
+= extra_cost
->alu
.logical
;
8793 *cost
+= rtx_cost (XEXP (op0
, 0), GET_MODE (op0
),
8794 ZERO_EXTRACT
, 0, speed
);
8798 if (GET_CODE (op1
) == NEG
)
8802 *cost
+= extra_cost
->alu
.arith
;
8804 *cost
+= rtx_cost (op0
, mode
, COMPARE
, 0, speed
);
8805 *cost
+= rtx_cost (XEXP (op1
, 0), mode
, NEG
, 1, speed
);
8811 Compare can freely swap the order of operands, and
8812 canonicalization puts the more complex operation first.
8813 But the integer MINUS logic expects the shift/extend
8814 operation in op1. */
8816 || (GET_CODE (op0
) == SUBREG
&& REG_P (SUBREG_REG (op0
)))))
8824 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_FLOAT
)
8828 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
8830 if (CONST_DOUBLE_P (op1
) && aarch64_float_const_zero_rtx_p (op1
))
8832 *cost
+= rtx_cost (op0
, VOIDmode
, COMPARE
, 0, speed
);
8833 /* FCMP supports constant 0.0 for no extra cost. */
8839 if (VECTOR_MODE_P (mode
))
8841 /* Vector compare. */
8843 *cost
+= extra_cost
->vect
.alu
;
8845 if (aarch64_float_const_zero_rtx_p (op1
))
8847 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8861 *cost
+= rtx_cost (op0
, mode
, MINUS
, 0, speed
);
8863 /* Detect valid immediates. */
8864 if ((GET_MODE_CLASS (mode
) == MODE_INT
8865 || (GET_MODE_CLASS (mode
) == MODE_CC
8866 && GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
))
8867 && CONST_INT_P (op1
)
8868 && aarch64_uimm12_shift (INTVAL (op1
)))
8871 /* SUB(S) (immediate). */
8872 *cost
+= extra_cost
->alu
.arith
;
8876 /* Look for SUB (extended register). */
8877 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
8878 && aarch64_rtx_arith_op_extract_p (op1
, int_mode
))
8881 *cost
+= extra_cost
->alu
.extend_arith
;
8883 op1
= aarch64_strip_extend (op1
, true);
8884 *cost
+= rtx_cost (op1
, VOIDmode
,
8885 (enum rtx_code
) GET_CODE (op1
), 0, speed
);
8889 rtx new_op1
= aarch64_strip_extend (op1
, false);
8891 /* Cost this as an FMA-alike operation. */
8892 if ((GET_CODE (new_op1
) == MULT
8893 || aarch64_shift_p (GET_CODE (new_op1
)))
8896 *cost
+= aarch64_rtx_mult_cost (new_op1
, MULT
,
8897 (enum rtx_code
) code
,
8902 *cost
+= rtx_cost (new_op1
, VOIDmode
, MINUS
, 1, speed
);
8906 if (VECTOR_MODE_P (mode
))
8909 *cost
+= extra_cost
->vect
.alu
;
8911 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
8914 *cost
+= extra_cost
->alu
.arith
;
8916 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
8919 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
8933 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
8934 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
8937 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, PLUS
, 0, speed
);
8938 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
8942 if (GET_MODE_CLASS (mode
) == MODE_INT
8943 && ((CONST_INT_P (op1
) && aarch64_uimm12_shift (INTVAL (op1
)))
8944 || aarch64_sve_addvl_addpl_immediate (op1
, mode
)))
8946 *cost
+= rtx_cost (op0
, mode
, PLUS
, 0, speed
);
8949 /* ADD (immediate). */
8950 *cost
+= extra_cost
->alu
.arith
;
8954 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
8956 /* Look for ADD (extended register). */
8957 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
8958 && aarch64_rtx_arith_op_extract_p (op0
, int_mode
))
8961 *cost
+= extra_cost
->alu
.extend_arith
;
8963 op0
= aarch64_strip_extend (op0
, true);
8964 *cost
+= rtx_cost (op0
, VOIDmode
,
8965 (enum rtx_code
) GET_CODE (op0
), 0, speed
);
8969 /* Strip any extend, leave shifts behind as we will
8970 cost them through mult_cost. */
8971 new_op0
= aarch64_strip_extend (op0
, false);
8973 if (GET_CODE (new_op0
) == MULT
8974 || aarch64_shift_p (GET_CODE (new_op0
)))
8976 *cost
+= aarch64_rtx_mult_cost (new_op0
, MULT
, PLUS
,
8981 *cost
+= rtx_cost (new_op0
, VOIDmode
, PLUS
, 0, speed
);
8985 if (VECTOR_MODE_P (mode
))
8988 *cost
+= extra_cost
->vect
.alu
;
8990 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
8993 *cost
+= extra_cost
->alu
.arith
;
8995 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
8998 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
9005 *cost
= COSTS_N_INSNS (1);
9009 if (VECTOR_MODE_P (mode
))
9010 *cost
+= extra_cost
->vect
.alu
;
9012 *cost
+= extra_cost
->alu
.rev
;
9017 if (aarch_rev16_p (x
))
9019 *cost
= COSTS_N_INSNS (1);
9023 if (VECTOR_MODE_P (mode
))
9024 *cost
+= extra_cost
->vect
.alu
;
9026 *cost
+= extra_cost
->alu
.rev
;
9031 if (aarch64_extr_rtx_p (x
, &op0
, &op1
))
9033 *cost
+= rtx_cost (op0
, mode
, IOR
, 0, speed
);
9034 *cost
+= rtx_cost (op1
, mode
, IOR
, 1, speed
);
9036 *cost
+= extra_cost
->alu
.shift
;
9047 if (VECTOR_MODE_P (mode
))
9050 *cost
+= extra_cost
->vect
.alu
;
9055 && GET_CODE (op0
) == MULT
9056 && CONST_INT_P (XEXP (op0
, 1))
9057 && CONST_INT_P (op1
)
9058 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0
, 1))),
9061 /* This is a UBFM/SBFM. */
9062 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, ZERO_EXTRACT
, 0, speed
);
9064 *cost
+= extra_cost
->alu
.bfx
;
9068 if (is_int_mode (mode
, &int_mode
))
9070 if (CONST_INT_P (op1
))
9072 /* We have a mask + shift version of a UBFIZ
9073 i.e. the *andim_ashift<mode>_bfiz pattern. */
9074 if (GET_CODE (op0
) == ASHIFT
9075 && aarch64_mask_and_shift_for_ubfiz_p (int_mode
, op1
,
9078 *cost
+= rtx_cost (XEXP (op0
, 0), int_mode
,
9079 (enum rtx_code
) code
, 0, speed
);
9081 *cost
+= extra_cost
->alu
.bfx
;
9085 else if (aarch64_bitmask_imm (INTVAL (op1
), int_mode
))
9087 /* We possibly get the immediate for free, this is not
9089 *cost
+= rtx_cost (op0
, int_mode
,
9090 (enum rtx_code
) code
, 0, speed
);
9092 *cost
+= extra_cost
->alu
.logical
;
9101 /* Handle ORN, EON, or BIC. */
9102 if (GET_CODE (op0
) == NOT
)
9103 op0
= XEXP (op0
, 0);
9105 new_op0
= aarch64_strip_shift (op0
);
9107 /* If we had a shift on op0 then this is a logical-shift-
9108 by-register/immediate operation. Otherwise, this is just
9109 a logical operation. */
9114 /* Shift by immediate. */
9115 if (CONST_INT_P (XEXP (op0
, 1)))
9116 *cost
+= extra_cost
->alu
.log_shift
;
9118 *cost
+= extra_cost
->alu
.log_shift_reg
;
9121 *cost
+= extra_cost
->alu
.logical
;
9124 /* In both cases we want to cost both operands. */
9125 *cost
+= rtx_cost (new_op0
, int_mode
, (enum rtx_code
) code
,
9127 *cost
+= rtx_cost (op1
, int_mode
, (enum rtx_code
) code
,
9137 op0
= aarch64_strip_shift (x
);
9139 if (VECTOR_MODE_P (mode
))
9142 *cost
+= extra_cost
->vect
.alu
;
9146 /* MVN-shifted-reg. */
9149 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
9152 *cost
+= extra_cost
->alu
.log_shift
;
9156 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9157 Handle the second form here taking care that 'a' in the above can
9159 else if (GET_CODE (op0
) == XOR
)
9161 rtx newop0
= XEXP (op0
, 0);
9162 rtx newop1
= XEXP (op0
, 1);
9163 rtx op0_stripped
= aarch64_strip_shift (newop0
);
9165 *cost
+= rtx_cost (newop1
, mode
, (enum rtx_code
) code
, 1, speed
);
9166 *cost
+= rtx_cost (op0_stripped
, mode
, XOR
, 0, speed
);
9170 if (op0_stripped
!= newop0
)
9171 *cost
+= extra_cost
->alu
.log_shift
;
9173 *cost
+= extra_cost
->alu
.logical
;
9180 *cost
+= extra_cost
->alu
.logical
;
9187 /* If a value is written in SI mode, then zero extended to DI
9188 mode, the operation will in general be free as a write to
9189 a 'w' register implicitly zeroes the upper bits of an 'x'
9190 register. However, if this is
9192 (set (reg) (zero_extend (reg)))
9194 we must cost the explicit register move. */
9196 && GET_MODE (op0
) == SImode
9199 int op_cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, 0, speed
);
9201 /* If OP_COST is non-zero, then the cost of the zero extend
9202 is effectively the cost of the inner operation. Otherwise
9203 we have a MOV instruction and we take the cost from the MOV
9204 itself. This is true independently of whether we are
9205 optimizing for space or time. */
9211 else if (MEM_P (op0
))
9213 /* All loads can zero extend to any size for free. */
9214 *cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, param
, speed
);
9218 op0
= aarch64_extend_bitfield_pattern_p (x
);
9221 *cost
+= rtx_cost (op0
, mode
, ZERO_EXTEND
, 0, speed
);
9223 *cost
+= extra_cost
->alu
.bfx
;
9229 if (VECTOR_MODE_P (mode
))
9232 *cost
+= extra_cost
->vect
.alu
;
9236 /* We generate an AND instead of UXTB/UXTH. */
9237 *cost
+= extra_cost
->alu
.logical
;
9243 if (MEM_P (XEXP (x
, 0)))
9248 rtx address
= XEXP (XEXP (x
, 0), 0);
9249 *cost
+= extra_cost
->ldst
.load_sign_extend
;
9252 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
9258 op0
= aarch64_extend_bitfield_pattern_p (x
);
9261 *cost
+= rtx_cost (op0
, mode
, SIGN_EXTEND
, 0, speed
);
9263 *cost
+= extra_cost
->alu
.bfx
;
9269 if (VECTOR_MODE_P (mode
))
9270 *cost
+= extra_cost
->vect
.alu
;
9272 *cost
+= extra_cost
->alu
.extend
;
9280 if (CONST_INT_P (op1
))
9284 if (VECTOR_MODE_P (mode
))
9286 /* Vector shift (immediate). */
9287 *cost
+= extra_cost
->vect
.alu
;
9291 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
9293 *cost
+= extra_cost
->alu
.shift
;
9297 /* We can incorporate zero/sign extend for free. */
9298 if (GET_CODE (op0
) == ZERO_EXTEND
9299 || GET_CODE (op0
) == SIGN_EXTEND
)
9300 op0
= XEXP (op0
, 0);
9302 *cost
+= rtx_cost (op0
, VOIDmode
, ASHIFT
, 0, speed
);
9307 if (VECTOR_MODE_P (mode
))
9310 /* Vector shift (register). */
9311 *cost
+= extra_cost
->vect
.alu
;
9317 *cost
+= extra_cost
->alu
.shift_reg
;
9319 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
9320 && CONST_INT_P (XEXP (op1
, 1))
9321 && known_eq (INTVAL (XEXP (op1
, 1)),
9322 GET_MODE_BITSIZE (mode
) - 1))
9324 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
9325 /* We already demanded XEXP (op1, 0) to be REG_P, so
9326 don't recurse into it. */
9330 return false; /* All arguments need to be in registers. */
9340 if (CONST_INT_P (op1
))
9342 /* ASR (immediate) and friends. */
9345 if (VECTOR_MODE_P (mode
))
9346 *cost
+= extra_cost
->vect
.alu
;
9348 *cost
+= extra_cost
->alu
.shift
;
9351 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
9356 if (VECTOR_MODE_P (mode
))
9359 /* Vector shift (register). */
9360 *cost
+= extra_cost
->vect
.alu
;
9365 /* ASR (register) and friends. */
9366 *cost
+= extra_cost
->alu
.shift_reg
;
9368 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
9369 && CONST_INT_P (XEXP (op1
, 1))
9370 && known_eq (INTVAL (XEXP (op1
, 1)),
9371 GET_MODE_BITSIZE (mode
) - 1))
9373 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
9374 /* We already demanded XEXP (op1, 0) to be REG_P, so
9375 don't recurse into it. */
9379 return false; /* All arguments need to be in registers. */
9384 if (aarch64_cmodel
== AARCH64_CMODEL_LARGE
9385 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
)
9389 *cost
+= extra_cost
->ldst
.load
;
9391 else if (aarch64_cmodel
== AARCH64_CMODEL_SMALL
9392 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_PIC
)
9394 /* ADRP, followed by ADD. */
9395 *cost
+= COSTS_N_INSNS (1);
9397 *cost
+= 2 * extra_cost
->alu
.arith
;
9399 else if (aarch64_cmodel
== AARCH64_CMODEL_TINY
9400 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
9404 *cost
+= extra_cost
->alu
.arith
;
9409 /* One extra load instruction, after accessing the GOT. */
9410 *cost
+= COSTS_N_INSNS (1);
9412 *cost
+= extra_cost
->ldst
.load
;
9418 /* ADRP/ADD (immediate). */
9420 *cost
+= extra_cost
->alu
.arith
;
9428 if (VECTOR_MODE_P (mode
))
9429 *cost
+= extra_cost
->vect
.alu
;
9431 *cost
+= extra_cost
->alu
.bfx
;
9434 /* We can trust that the immediates used will be correct (there
9435 are no by-register forms), so we need only cost op0. */
9436 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (enum rtx_code
) code
, 0, speed
);
9440 *cost
+= aarch64_rtx_mult_cost (x
, MULT
, 0, speed
);
9441 /* aarch64_rtx_mult_cost always handles recursion to its
9446 /* We can expand signed mod by power of 2 using a NEGS, two parallel
9447 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
9448 an unconditional negate. This case should only ever be reached through
9449 the set_smod_pow2_cheap check in expmed.c. */
9450 if (CONST_INT_P (XEXP (x
, 1))
9451 && exact_log2 (INTVAL (XEXP (x
, 1))) > 0
9452 && (mode
== SImode
|| mode
== DImode
))
9454 /* We expand to 4 instructions. Reset the baseline. */
9455 *cost
= COSTS_N_INSNS (4);
9458 *cost
+= 2 * extra_cost
->alu
.logical
9459 + 2 * extra_cost
->alu
.arith
;
      /* Slightly prefer UMOD over SMOD.  */
9469 if (VECTOR_MODE_P (mode
))
9470 *cost
+= extra_cost
->vect
.alu
;
9471 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
9472 *cost
+= (extra_cost
->mult
[mode
== DImode
].add
9473 + extra_cost
->mult
[mode
== DImode
].idiv
9474 + (code
== MOD
? 1 : 0));
9476 return false; /* All arguments need to be in registers. */
9483 if (VECTOR_MODE_P (mode
))
9484 *cost
+= extra_cost
->vect
.alu
;
9485 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
9486 /* There is no integer SQRT, so only DIV and UDIV can get
9488 *cost
+= (extra_cost
->mult
[mode
== DImode
].idiv
		 /* Slightly prefer UDIV over SDIV.  */
9490 + (code
== DIV
? 1 : 0));
9492 *cost
+= extra_cost
->fp
[mode
== DFmode
].div
;
9494 return false; /* All arguments need to be in registers. */
9497 return aarch64_if_then_else_costs (XEXP (x
, 0), XEXP (x
, 1),
9498 XEXP (x
, 2), cost
, speed
);
9511 return false; /* All arguments must be in registers. */
9520 if (VECTOR_MODE_P (mode
))
9521 *cost
+= extra_cost
->vect
.alu
;
9523 *cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
9526 /* FMSUB, FNMADD, and FNMSUB are free. */
9527 if (GET_CODE (op0
) == NEG
)
9528 op0
= XEXP (op0
, 0);
9530 if (GET_CODE (op2
) == NEG
)
9531 op2
= XEXP (op2
, 0);
9533 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9534 and the by-element operand as operand 0. */
9535 if (GET_CODE (op1
) == NEG
)
9536 op1
= XEXP (op1
, 0);
9538 /* Catch vector-by-element operations. The by-element operand can
9539 either be (vec_duplicate (vec_select (x))) or just
9540 (vec_select (x)), depending on whether we are multiplying by
9541 a vector or a scalar.
9543 Canonicalization is not very good in these cases, FMA4 will put the
9544 by-element operand as operand 0, FNMA4 will have it as operand 1. */
9545 if (GET_CODE (op0
) == VEC_DUPLICATE
)
9546 op0
= XEXP (op0
, 0);
9547 else if (GET_CODE (op1
) == VEC_DUPLICATE
)
9548 op1
= XEXP (op1
, 0);
9550 if (GET_CODE (op0
) == VEC_SELECT
)
9551 op0
= XEXP (op0
, 0);
9552 else if (GET_CODE (op1
) == VEC_SELECT
)
9553 op1
= XEXP (op1
, 0);
9555 /* If the remaining parameters are not registers,
9556 get the cost to put them into registers. */
9557 *cost
+= rtx_cost (op0
, mode
, FMA
, 0, speed
);
9558 *cost
+= rtx_cost (op1
, mode
, FMA
, 1, speed
);
9559 *cost
+= rtx_cost (op2
, mode
, FMA
, 2, speed
);
9563 case UNSIGNED_FLOAT
:
9565 *cost
+= extra_cost
->fp
[mode
== DFmode
].fromint
;
9571 if (VECTOR_MODE_P (mode
))
	  /* Vector truncate.  */
9574 *cost
+= extra_cost
->vect
.alu
;
9577 *cost
+= extra_cost
->fp
[mode
== DFmode
].widen
;
9581 case FLOAT_TRUNCATE
:
9584 if (VECTOR_MODE_P (mode
))
	  /* Vector conversion.  */
9587 *cost
+= extra_cost
->vect
.alu
;
9590 *cost
+= extra_cost
->fp
[mode
== DFmode
].narrow
;
9597 /* Strip the rounding part. They will all be implemented
9598 by the fcvt* family of instructions anyway. */
9599 if (GET_CODE (x
) == UNSPEC
)
9601 unsigned int uns_code
= XINT (x
, 1);
9603 if (uns_code
== UNSPEC_FRINTA
9604 || uns_code
== UNSPEC_FRINTM
9605 || uns_code
== UNSPEC_FRINTN
9606 || uns_code
== UNSPEC_FRINTP
9607 || uns_code
== UNSPEC_FRINTZ
)
9608 x
= XVECEXP (x
, 0, 0);
9613 if (VECTOR_MODE_P (mode
))
9614 *cost
+= extra_cost
->vect
.alu
;
9616 *cost
+= extra_cost
->fp
[GET_MODE (x
) == DFmode
].toint
;
9619 /* We can combine fmul by a power of 2 followed by a fcvt into a single
9620 fixed-point fcvt. */
9621 if (GET_CODE (x
) == MULT
9622 && ((VECTOR_MODE_P (mode
)
9623 && aarch64_vec_fpconst_pow_of_2 (XEXP (x
, 1)) > 0)
9624 || aarch64_fpconst_pow_of_2 (XEXP (x
, 1)) > 0))
9626 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (rtx_code
) code
,
9631 *cost
+= rtx_cost (x
, VOIDmode
, (enum rtx_code
) code
, 0, speed
);
9635 if (VECTOR_MODE_P (mode
))
9639 *cost
+= extra_cost
->vect
.alu
;
9641 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
9645 /* FABD, which is analogous to FADD. */
9646 if (GET_CODE (op0
) == MINUS
)
9648 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, MINUS
, 0, speed
);
9649 *cost
+= rtx_cost (XEXP (op0
, 1), mode
, MINUS
, 1, speed
);
9651 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
9655 /* Simple FABS is analogous to FNEG. */
9657 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
9661 /* Integer ABS will either be split to
9662 two arithmetic instructions, or will be an ABS
9663 (scalar), which we don't model. */
9664 *cost
= COSTS_N_INSNS (2);
9666 *cost
+= 2 * extra_cost
->alu
.arith
;
9674 if (VECTOR_MODE_P (mode
))
9675 *cost
+= extra_cost
->vect
.alu
;
9678 /* FMAXNM/FMINNM/FMAX/FMIN.
9679 TODO: This may not be accurate for all implementations, but
9680 we do not model this in the cost tables. */
9681 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
9687 /* The floating point round to integer frint* instructions. */
9688 if (aarch64_frint_unspec_p (XINT (x
, 1)))
9691 *cost
+= extra_cost
->fp
[mode
== DFmode
].roundint
;
9696 if (XINT (x
, 1) == UNSPEC_RBIT
)
9699 *cost
+= extra_cost
->alu
.rev
;
9707 /* Decompose <su>muldi3_highpart. */
9708 if (/* (truncate:DI */
9711 && GET_MODE (XEXP (x
, 0)) == TImode
9712 && GET_CODE (XEXP (x
, 0)) == LSHIFTRT
9714 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == MULT
9715 /* (ANY_EXTEND:TI (reg:DI))
9716 (ANY_EXTEND:TI (reg:DI))) */
9717 && ((GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == ZERO_EXTEND
9718 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == ZERO_EXTEND
)
9719 || (GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == SIGN_EXTEND
9720 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == SIGN_EXTEND
))
9721 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0)) == DImode
9722 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0)) == DImode
9723 /* (const_int 64) */
9724 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
9725 && UINTVAL (XEXP (XEXP (x
, 0), 1)) == 64)
9729 *cost
+= extra_cost
->mult
[mode
== DImode
].extend
;
9730 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0),
9731 mode
, MULT
, 0, speed
);
9732 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0),
9733 mode
, MULT
, 1, speed
);
9743 && flag_aarch64_verbose_cost
)
9745 "\nFailed to cost RTX. Assuming default cost.\n");
9750 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
9751 calculated for X. This cost is stored in *COST. Returns true
9752 if the total cost of X was calculated. */
9754 aarch64_rtx_costs_wrapper (rtx x
, machine_mode mode
, int outer
,
9755 int param
, int *cost
, bool speed
)
9757 bool result
= aarch64_rtx_costs (x
, mode
, outer
, param
, cost
, speed
);
9760 && flag_aarch64_verbose_cost
)
9762 print_rtl_single (dump_file
, x
);
9763 fprintf (dump_file
, "\n%s cost: %d (%s)\n",
9764 speed
? "Hot" : "Cold",
9765 *cost
, result
? "final" : "partial");
static int
aarch64_register_move_cost (machine_mode mode,
			    reg_class_t from_i, reg_class_t to_i)
{
  enum reg_class from = (enum reg_class) from_i;
  enum reg_class to = (enum reg_class) to_i;
  const struct cpu_regmove_cost *regmove_cost
    = aarch64_tune_params.regmove_cost;

  /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
  if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
    to = GENERAL_REGS;

  if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
    from = GENERAL_REGS;

  /* Moving between GPR and stack cost is the same as GP2GP.  */
  if ((from == GENERAL_REGS && to == STACK_REG)
      || (to == GENERAL_REGS && from == STACK_REG))
    return regmove_cost->GP2GP;

  /* To/From the stack register, we move via the gprs.  */
  if (to == STACK_REG || from == STACK_REG)
    return aarch64_register_move_cost (mode, from, GENERAL_REGS)
	   + aarch64_register_move_cost (mode, GENERAL_REGS, to);

  if (known_eq (GET_MODE_SIZE (mode), 16))
    {
      /* 128-bit operations on general registers require 2 instructions.  */
      if (from == GENERAL_REGS && to == GENERAL_REGS)
	return regmove_cost->GP2GP * 2;
      else if (from == GENERAL_REGS)
	return regmove_cost->GP2FP * 2;
      else if (to == GENERAL_REGS)
	return regmove_cost->FP2GP * 2;

      /* When AdvSIMD instructions are disabled it is not possible to move
	 a 128-bit value directly between Q registers.  This is handled in
	 secondary reload.  A general register is used as a scratch to move
	 the upper DI value and the lower DI value is moved directly,
	 hence the cost is the sum of three moves.  */
      if (!TARGET_SIMD)
	return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;

      return regmove_cost->FP2FP;
    }

  if (from == GENERAL_REGS && to == GENERAL_REGS)
    return regmove_cost->GP2GP;
  else if (from == GENERAL_REGS)
    return regmove_cost->GP2FP;
  else if (to == GENERAL_REGS)
    return regmove_cost->FP2GP;

  return regmove_cost->FP2FP;
}
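
/* Example (added for illustration; actual numbers come from the active
   tuning's regmove_cost table): a 128-bit move from FP_REGS to GENERAL_REGS
   costs FP2GP * 2 because it needs two 64-bit transfers, while a move
   between GENERAL_REGS and STACK_REG is priced the same as GP2GP.  */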

static int
aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
			  reg_class_t rclass ATTRIBUTE_UNUSED,
			  bool in ATTRIBUTE_UNUSED)
{
  return aarch64_tune_params.memmov_cost;
}

/* Return true if it is safe and beneficial to use the approximate rsqrt optabs
   to optimize 1.0/sqrt.  */
static bool
use_rsqrt_p (machine_mode mode)
{
  return (!flag_trapping_math
	  && flag_unsafe_math_optimizations
	  && ((aarch64_tune_params.approx_modes->recip_sqrt
	       & AARCH64_APPROX_MODE (mode))
	      || flag_mrecip_low_precision_sqrt));
}

/* Function to decide when to use the approximate reciprocal square root
   builtin.  */
static tree
aarch64_builtin_reciprocal (tree fndecl)
{
  machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));

  if (!use_rsqrt_p (mode))
    return NULL_TREE;
  return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
}

typedef rtx (*rsqrte_type) (rtx, rtx);

/* Select reciprocal square root initial estimate insn depending on machine
   mode.  */
static rsqrte_type
get_rsqrte_type (machine_mode mode)
{
  switch (mode)
    {
    case E_DFmode:   return gen_aarch64_rsqrtedf;
    case E_SFmode:   return gen_aarch64_rsqrtesf;
    case E_V2DFmode: return gen_aarch64_rsqrtev2df;
    case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
    case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
    default: gcc_unreachable ();
    }
}

typedef rtx (*rsqrts_type) (rtx, rtx, rtx);

/* Select reciprocal square root series step insn depending on machine mode.  */
static rsqrts_type
get_rsqrts_type (machine_mode mode)
{
  switch (mode)
    {
    case E_DFmode:   return gen_aarch64_rsqrtsdf;
    case E_SFmode:   return gen_aarch64_rsqrtssf;
    case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
    case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
    case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
    default: gcc_unreachable ();
    }
}

/* Emit instruction sequence to compute either the approximate square root
   or its approximate reciprocal, depending on the flag RECP, and return
   whether the sequence was emitted or not.  */
bool
aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
{
  machine_mode mode = GET_MODE (dst);

  if (GET_MODE_INNER (mode) == HFmode)
    {
      gcc_assert (!recp);
      return false;
    }

  if (!recp)
    {
      if (!(flag_mlow_precision_sqrt
	    || (aarch64_tune_params.approx_modes->sqrt
		& AARCH64_APPROX_MODE (mode))))
	return false;

      if (flag_finite_math_only
	  || flag_trapping_math
	  || !flag_unsafe_math_optimizations
	  || optimize_function_for_size_p (cfun))
	return false;
    }
  else
    /* Caller assumes we cannot fail.  */
    gcc_assert (use_rsqrt_p (mode));

  machine_mode mmsk = mode_for_int_vector (mode).require ();
  rtx xmsk = gen_reg_rtx (mmsk);
  if (!recp)
    /* When calculating the approximate square root, compare the
       argument with 0.0 and create a mask.  */
    emit_insn (gen_rtx_SET (xmsk,
			    gen_rtx_NEG (mmsk,
					 gen_rtx_EQ (mmsk, src,
						     CONST0_RTX (mode)))));

  /* Estimate the approximate reciprocal square root.  */
  rtx xdst = gen_reg_rtx (mode);
  emit_insn ((*get_rsqrte_type (mode)) (xdst, src));

  /* Iterate over the series twice for SF and thrice for DF.  */
  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;

  /* Optionally iterate over the series once less for faster performance
     while sacrificing the accuracy.  */
  if ((recp && flag_mrecip_low_precision_sqrt)
      || (!recp && flag_mlow_precision_sqrt))
    iterations--;

  /* Iterate over the series to calculate the approximate reciprocal square
     root.  */
  rtx x1 = gen_reg_rtx (mode);
  while (iterations--)
    {
      rtx x2 = gen_reg_rtx (mode);
      emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));

      emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));

      if (iterations > 0)
	emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
    }

  if (!recp)
    {
      /* Qualify the approximate reciprocal square root when the argument is
	 0.0 by squashing the intermediary result to 0.0.  */
      rtx xtmp = gen_reg_rtx (mmsk);
      emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
					gen_rtx_SUBREG (mmsk, xdst, 0)));
      emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));

      /* Calculate the approximate square root.  */
      emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
    }

  /* Finalize the approximation.  */
  emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));

  return true;
}
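
/* Sketch of the maths behind the loop above (illustrative only): starting
   from an FRSQRTE estimate e0 ~ 1/sqrt(x), each FRSQRTS step refines
     e(n+1) = e(n) * (3 - x * e(n)^2) / 2,
   which is the Newton-Raphson iteration for rsqrt; two steps are used for
   SFmode and three for DFmode.  When a square root rather than its
   reciprocal is wanted, the refined estimate is additionally multiplied by
   the argument before the final step.  */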

typedef rtx (*recpe_type) (rtx, rtx);

/* Select reciprocal initial estimate insn depending on machine mode.  */
static recpe_type
get_recpe_type (machine_mode mode)
{
  switch (mode)
    {
    case E_SFmode:   return (gen_aarch64_frecpesf);
    case E_V2SFmode: return (gen_aarch64_frecpev2sf);
    case E_V4SFmode: return (gen_aarch64_frecpev4sf);
    case E_DFmode:   return (gen_aarch64_frecpedf);
    case E_V2DFmode: return (gen_aarch64_frecpev2df);
    default: gcc_unreachable ();
    }
}

typedef rtx (*recps_type) (rtx, rtx, rtx);

/* Select reciprocal series step insn depending on machine mode.  */
static recps_type
get_recps_type (machine_mode mode)
{
  switch (mode)
    {
    case E_SFmode:   return (gen_aarch64_frecpssf);
    case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
    case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
    case E_DFmode:   return (gen_aarch64_frecpsdf);
    case E_V2DFmode: return (gen_aarch64_frecpsv2df);
    default: gcc_unreachable ();
    }
}

/* Emit the instruction sequence to compute the approximation for the division
   of NUM by DEN in QUO and return whether the sequence was emitted or not.  */
bool
aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
{
  machine_mode mode = GET_MODE (quo);

  if (GET_MODE_INNER (mode) == HFmode)
    return false;

  bool use_approx_division_p = (flag_mlow_precision_div
				|| (aarch64_tune_params.approx_modes->division
				    & AARCH64_APPROX_MODE (mode)));

  if (!flag_finite_math_only
      || flag_trapping_math
      || !flag_unsafe_math_optimizations
      || optimize_function_for_size_p (cfun)
      || !use_approx_division_p)
    return false;

  if (!TARGET_SIMD && VECTOR_MODE_P (mode))
    return false;

  /* Estimate the approximate reciprocal.  */
  rtx xrcp = gen_reg_rtx (mode);
  emit_insn ((*get_recpe_type (mode)) (xrcp, den));

  /* Iterate over the series twice for SF and thrice for DF.  */
  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;

  /* Optionally iterate over the series once less for faster performance,
     while sacrificing the accuracy.  */
  if (flag_mlow_precision_div)
    iterations--;

  /* Iterate over the series to calculate the approximate reciprocal.  */
  rtx xtmp = gen_reg_rtx (mode);
  while (iterations--)
    {
      emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));

      if (iterations > 0)
	emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
    }

  if (num != CONST1_RTX (mode))
    {
      /* As the approximate reciprocal of DEN is already calculated, only
	 calculate the approximate division when NUM is not 1.0.  */
      rtx xnum = force_reg (mode, num);
      emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
    }

  /* Finalize the approximation.  */
  emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
  return true;
}
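
/* Sketch of the refinement performed above (illustrative only): from an
   FRECPE estimate r0 ~ 1/den, each FRECPS step computes
     r(n+1) = r(n) * (2 - den * r(n)),
   the Newton-Raphson iteration for the reciprocal; the quotient is then
   formed by multiplying the refined reciprocal by NUM, with that multiply
   skipped when NUM is 1.0.  */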

/* Return the number of instructions that can be issued per cycle.  */
static int
aarch64_sched_issue_rate (void)
{
  return aarch64_tune_params.issue_rate;
}

static int
aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
{
  int issue_rate = aarch64_sched_issue_rate ();

  return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
}

/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
   autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
   has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */
static int
aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
						   int ready_index)
{
  return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
}
10111 /* Vectorizer cost model target hooks. */
10113 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10115 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost
,
10117 int misalign ATTRIBUTE_UNUSED
)
10120 const cpu_vector_cost
*costs
= aarch64_tune_params
.vec_costs
;
10123 if (vectype
!= NULL
)
10124 fp
= FLOAT_TYPE_P (vectype
);
10126 switch (type_of_cost
)
10129 return fp
? costs
->scalar_fp_stmt_cost
: costs
->scalar_int_stmt_cost
;
10132 return costs
->scalar_load_cost
;
10135 return costs
->scalar_store_cost
;
10138 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
10141 return costs
->vec_align_load_cost
;
10144 return costs
->vec_store_cost
;
10146 case vec_to_scalar
:
10147 return costs
->vec_to_scalar_cost
;
10149 case scalar_to_vec
:
10150 return costs
->scalar_to_vec_cost
;
10152 case unaligned_load
:
10153 case vector_gather_load
:
10154 return costs
->vec_unalign_load_cost
;
10156 case unaligned_store
:
10157 case vector_scatter_store
:
10158 return costs
->vec_unalign_store_cost
;
10160 case cond_branch_taken
:
10161 return costs
->cond_taken_branch_cost
;
10163 case cond_branch_not_taken
:
10164 return costs
->cond_not_taken_branch_cost
;
10167 return costs
->vec_permute_cost
;
10169 case vec_promote_demote
:
10170 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
10172 case vec_construct
:
10173 elements
= estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype
));
10174 return elements
/ 2 + 1;
10177 gcc_unreachable ();
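
/* Example (added for illustration): for vec_construct with a V4SImode
   vector type, estimated_poly_value (TYPE_VECTOR_SUBPARTS) is 4, so the
   hook returns 4 / 2 + 1 == 3, i.e. roughly one statement per pair of
   elements plus one.  */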
10181 /* Implement targetm.vectorize.add_stmt_cost. */
10183 aarch64_add_stmt_cost (void *data
, int count
, enum vect_cost_for_stmt kind
,
10184 struct _stmt_vec_info
*stmt_info
, int misalign
,
10185 enum vect_cost_model_location where
)
10187 unsigned *cost
= (unsigned *) data
;
10188 unsigned retval
= 0;
10190 if (flag_vect_cost_model
)
10192 tree vectype
= stmt_info
? stmt_vectype (stmt_info
) : NULL_TREE
;
10194 aarch64_builtin_vectorization_cost (kind
, vectype
, misalign
);
10196 /* Statements in an inner loop relative to the loop being
10197 vectorized are weighted more heavily. The value here is
10198 arbitrary and could potentially be improved with analysis. */
10199 if (where
== vect_body
&& stmt_info
&& stmt_in_inner_loop_p (stmt_info
))
10200 count
*= 50; /* FIXME */
10202 retval
= (unsigned) (count
* stmt_cost
);
10203 cost
[where
] += retval
;
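
/* Example (added for illustration): costing 2 copies of a vector_load
   statement that sits in an inner loop relative to the loop being
   vectorized gives retval = 2 * 50 * vec_align_load_cost, because of the
   count *= 50 weighting above.  */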
10209 static void initialize_aarch64_code_model (struct gcc_options
*);
10211 /* Parse the TO_PARSE string and put the architecture struct that it
10212 selects into RES and the architectural features into ISA_FLAGS.
10213 Return an aarch64_parse_opt_result describing the parse result.
10214 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
10216 static enum aarch64_parse_opt_result
10217 aarch64_parse_arch (const char *to_parse
, const struct processor
**res
,
10218 unsigned long *isa_flags
)
10221 const struct processor
*arch
;
10222 char *str
= (char *) alloca (strlen (to_parse
) + 1);
10225 strcpy (str
, to_parse
);
10227 ext
= strchr (str
, '+');
10232 len
= strlen (str
);
10235 return AARCH64_PARSE_MISSING_ARG
;
10238 /* Loop through the list of supported ARCHes to find a match. */
10239 for (arch
= all_architectures
; arch
->name
!= NULL
; arch
++)
10241 if (strlen (arch
->name
) == len
&& strncmp (arch
->name
, str
, len
) == 0)
10243 unsigned long isa_temp
= arch
->flags
;
10247 /* TO_PARSE string contains at least one extension. */
10248 enum aarch64_parse_opt_result ext_res
10249 = aarch64_parse_extension (ext
, &isa_temp
);
10251 if (ext_res
!= AARCH64_PARSE_OK
)
10254 /* Extension parsing was successful. Confirm the result
10255 arch and ISA flags. */
10257 *isa_flags
= isa_temp
;
10258 return AARCH64_PARSE_OK
;
10262 /* ARCH name not found in list. */
10263 return AARCH64_PARSE_INVALID_ARG
;
10266 /* Parse the TO_PARSE string and put the result tuning in RES and the
10267 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
10268 describing the parse result. If there is an error parsing, RES and
10269 ISA_FLAGS are left unchanged. */
10271 static enum aarch64_parse_opt_result
10272 aarch64_parse_cpu (const char *to_parse
, const struct processor
**res
,
10273 unsigned long *isa_flags
)
10276 const struct processor
*cpu
;
10277 char *str
= (char *) alloca (strlen (to_parse
) + 1);
10280 strcpy (str
, to_parse
);
10282 ext
= strchr (str
, '+');
10287 len
= strlen (str
);
10290 return AARCH64_PARSE_MISSING_ARG
;
10293 /* Loop through the list of supported CPUs to find a match. */
10294 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
10296 if (strlen (cpu
->name
) == len
&& strncmp (cpu
->name
, str
, len
) == 0)
10298 unsigned long isa_temp
= cpu
->flags
;
10303 /* TO_PARSE string contains at least one extension. */
10304 enum aarch64_parse_opt_result ext_res
10305 = aarch64_parse_extension (ext
, &isa_temp
);
10307 if (ext_res
!= AARCH64_PARSE_OK
)
	  /* Extension parsing was successful.  Confirm the result
	     cpu and ISA flags.  */
10313 *isa_flags
= isa_temp
;
10314 return AARCH64_PARSE_OK
;
10318 /* CPU name not found in list. */
10319 return AARCH64_PARSE_INVALID_ARG
;
10322 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10323 Return an aarch64_parse_opt_result describing the parse result.
10324 If the parsing fails the RES does not change. */
10326 static enum aarch64_parse_opt_result
10327 aarch64_parse_tune (const char *to_parse
, const struct processor
**res
)
10329 const struct processor
*cpu
;
10330 char *str
= (char *) alloca (strlen (to_parse
) + 1);
10332 strcpy (str
, to_parse
);
10334 /* Loop through the list of supported CPUs to find a match. */
10335 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
10337 if (strcmp (cpu
->name
, str
) == 0)
10340 return AARCH64_PARSE_OK
;
10344 /* CPU name not found in list. */
10345 return AARCH64_PARSE_INVALID_ARG
;
10348 /* Parse TOKEN, which has length LENGTH to see if it is an option
10349 described in FLAG. If it is, return the index bit for that fusion type.
10350 If not, error (printing OPTION_NAME) and return zero. */
10352 static unsigned int
10353 aarch64_parse_one_option_token (const char *token
,
10355 const struct aarch64_flag_desc
*flag
,
10356 const char *option_name
)
10358 for (; flag
->name
!= NULL
; flag
++)
10360 if (length
== strlen (flag
->name
)
10361 && !strncmp (flag
->name
, token
, length
))
10365 error ("unknown flag passed in -moverride=%s (%s)", option_name
, token
);
10369 /* Parse OPTION which is a comma-separated list of flags to enable.
10370 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10371 default state we inherit from the CPU tuning structures. OPTION_NAME
10372 gives the top-level option we are parsing in the -moverride string,
10373 for use in error messages. */
10375 static unsigned int
10376 aarch64_parse_boolean_options (const char *option
,
10377 const struct aarch64_flag_desc
*flags
,
10378 unsigned int initial_state
,
10379 const char *option_name
)
10381 const char separator
= '.';
10382 const char* specs
= option
;
10383 const char* ntoken
= option
;
10384 unsigned int found_flags
= initial_state
;
10386 while ((ntoken
= strchr (specs
, separator
)))
10388 size_t token_length
= ntoken
- specs
;
10389 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
10393 /* If we find "none" (or, for simplicity's sake, an error) anywhere
10394 in the token stream, reset the supported operations. So:
10396 adrp+add.cmp+branch.none.adrp+add
10398 would have the result of turning on only adrp+add fusion. */
10402 found_flags
|= token_ops
;
10406 /* We ended with a comma, print something. */
10409 error ("%s string ill-formed\n", option_name
);
10413 /* We still have one more token to parse. */
10414 size_t token_length
= strlen (specs
);
10415 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
10422 found_flags
|= token_ops
;
10423 return found_flags
;
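
/* Example (added for illustration): parsing the -moverride fragment
   "adrp+add.cmp+branch.none.adrp+add" against the fusion flag table ends up
   enabling only the adrp+add pair, because the "none" token resets
   found_flags before the final token is OR-ed back in.  */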
10426 /* Support for overriding instruction fusion. */
10429 aarch64_parse_fuse_string (const char *fuse_string
,
10430 struct tune_params
*tune
)
10432 tune
->fusible_ops
= aarch64_parse_boolean_options (fuse_string
,
10433 aarch64_fusible_pairs
,
10438 /* Support for overriding other tuning flags. */
10441 aarch64_parse_tune_string (const char *tune_string
,
10442 struct tune_params
*tune
)
10444 tune
->extra_tuning_flags
10445 = aarch64_parse_boolean_options (tune_string
,
10446 aarch64_tuning_flags
,
10447 tune
->extra_tuning_flags
,
10451 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
10452 we understand. If it is, extract the option string and handoff to
10453 the appropriate function. */
10456 aarch64_parse_one_override_token (const char* token
,
10458 struct tune_params
*tune
)
10460 const struct aarch64_tuning_override_function
*fn
10461 = aarch64_tuning_override_functions
;
10463 const char *option_part
= strchr (token
, '=');
10466 error ("tuning string missing in option (%s)", token
);
10470 /* Get the length of the option name. */
10471 length
= option_part
- token
;
10472 /* Skip the '=' to get to the option string. */
10475 for (; fn
->name
!= NULL
; fn
++)
10477 if (!strncmp (fn
->name
, token
, length
))
10479 fn
->parse_override (option_part
, tune
);
10484 error ("unknown tuning option (%s)",token
);
10488 /* A checking mechanism for the implementation of the tls size. */
10491 initialize_aarch64_tls_size (struct gcc_options
*opts
)
10493 if (aarch64_tls_size
== 0)
10494 aarch64_tls_size
= 24;
10496 switch (opts
->x_aarch64_cmodel_var
)
10498 case AARCH64_CMODEL_TINY
:
10499 /* Both the default and maximum TLS size allowed under tiny is 1M which
10500 needs two instructions to address, so we clamp the size to 24. */
10501 if (aarch64_tls_size
> 24)
10502 aarch64_tls_size
= 24;
10504 case AARCH64_CMODEL_SMALL
:
10505 /* The maximum TLS size allowed under small is 4G. */
10506 if (aarch64_tls_size
> 32)
10507 aarch64_tls_size
= 32;
10509 case AARCH64_CMODEL_LARGE
:
10510 /* The maximum TLS size allowed under large is 16E.
10511 FIXME: 16E should be 64bit, we only support 48bit offset now. */
10512 if (aarch64_tls_size
> 48)
10513 aarch64_tls_size
= 48;
10516 gcc_unreachable ();
10522 /* Parse STRING looking for options in the format:
10523 string :: option:string
10524 option :: name=substring
10526 substring :: defined by option. */
10529 aarch64_parse_override_string (const char* input_string
,
10530 struct tune_params
* tune
)
10532 const char separator
= ':';
10533 size_t string_length
= strlen (input_string
) + 1;
10534 char *string_root
= (char *) xmalloc (sizeof (*string_root
) * string_length
);
10535 char *string
= string_root
;
10536 strncpy (string
, input_string
, string_length
);
10537 string
[string_length
- 1] = '\0';
10539 char* ntoken
= string
;
10541 while ((ntoken
= strchr (string
, separator
)))
10543 size_t token_length
= ntoken
- string
;
10544 /* Make this substring look like a string. */
10546 aarch64_parse_one_override_token (string
, token_length
, tune
);
10550 /* One last option to parse. */
10551 aarch64_parse_one_override_token (string
, strlen (string
), tune
);
10552 free (string_root
);
10557 aarch64_override_options_after_change_1 (struct gcc_options
*opts
)
10559 /* PR 70044: We have to be careful about being called multiple times for the
10560 same function. This means all changes should be repeatable. */
10562 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
10563 Disable the frame pointer flag so the mid-end will not use a frame
10564 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
10565 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
10566 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
10567 aarch64_use_frame_pointer
= opts
->x_flag_omit_frame_pointer
!= 1;
10568 if (opts
->x_flag_omit_frame_pointer
== 0)
10569 opts
->x_flag_omit_frame_pointer
= 2;
10571 /* If not optimizing for size, set the default
10572 alignment to what the target wants. */
10573 if (!opts
->x_optimize_size
)
10575 if (opts
->x_flag_align_loops
&& !opts
->x_str_align_loops
)
10576 opts
->x_str_align_loops
= aarch64_tune_params
.loop_align
;
10577 if (opts
->x_flag_align_jumps
&& !opts
->x_str_align_jumps
)
10578 opts
->x_str_align_jumps
= aarch64_tune_params
.jump_align
;
10579 if (opts
->x_flag_align_functions
&& !opts
->x_str_align_functions
)
10580 opts
->x_str_align_functions
= aarch64_tune_params
.function_align
;
10583 /* We default to no pc-relative literal loads. */
10585 aarch64_pcrelative_literal_loads
= false;
10587 /* If -mpc-relative-literal-loads is set on the command line, this
10588 implies that the user asked for PC relative literal loads. */
10589 if (opts
->x_pcrelative_literal_loads
== 1)
10590 aarch64_pcrelative_literal_loads
= true;
10592 /* In the tiny memory model it makes no sense to disallow PC relative
10593 literal pool loads. */
10594 if (aarch64_cmodel
== AARCH64_CMODEL_TINY
10595 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
10596 aarch64_pcrelative_literal_loads
= true;
10598 /* When enabling the lower precision Newton series for the square root, also
10599 enable it for the reciprocal square root, since the latter is an
10600 intermediary step for the former. */
10601 if (flag_mlow_precision_sqrt
)
10602 flag_mrecip_low_precision_sqrt
= true;
10605 /* 'Unpack' up the internal tuning structs and update the options
10606 in OPTS. The caller must have set up selected_tune and selected_arch
10607 as all the other target-specific codegen decisions are
10608 derived from them. */
10611 aarch64_override_options_internal (struct gcc_options
*opts
)
10613 aarch64_tune_flags
= selected_tune
->flags
;
10614 aarch64_tune
= selected_tune
->sched_core
;
10615 /* Make a copy of the tuning parameters attached to the core, which
10616 we may later overwrite. */
10617 aarch64_tune_params
= *(selected_tune
->tune
);
10618 aarch64_architecture_version
= selected_arch
->architecture_version
;
10620 if (opts
->x_aarch64_override_tune_string
)
10621 aarch64_parse_override_string (opts
->x_aarch64_override_tune_string
,
10622 &aarch64_tune_params
);
10624 /* This target defaults to strict volatile bitfields. */
10625 if (opts
->x_flag_strict_volatile_bitfields
< 0 && abi_version_at_least (2))
10626 opts
->x_flag_strict_volatile_bitfields
= 1;
10628 initialize_aarch64_code_model (opts
);
10629 initialize_aarch64_tls_size (opts
);
10631 int queue_depth
= 0;
10632 switch (aarch64_tune_params
.autoprefetcher_model
)
10634 case tune_params::AUTOPREFETCHER_OFF
:
10637 case tune_params::AUTOPREFETCHER_WEAK
:
10640 case tune_params::AUTOPREFETCHER_STRONG
:
10641 queue_depth
= max_insn_queue_index
+ 1;
10644 gcc_unreachable ();
10647 /* We don't mind passing in global_options_set here as we don't use
10648 the *options_set structs anyway. */
10649 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH
,
10651 opts
->x_param_values
,
10652 global_options_set
.x_param_values
);
10654 /* Set up parameters to be used in prefetching algorithm. Do not
10655 override the defaults unless we are tuning for a core we have
10656 researched values for. */
10657 if (aarch64_tune_params
.prefetch
->num_slots
> 0)
10658 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES
,
10659 aarch64_tune_params
.prefetch
->num_slots
,
10660 opts
->x_param_values
,
10661 global_options_set
.x_param_values
);
10662 if (aarch64_tune_params
.prefetch
->l1_cache_size
>= 0)
10663 maybe_set_param_value (PARAM_L1_CACHE_SIZE
,
10664 aarch64_tune_params
.prefetch
->l1_cache_size
,
10665 opts
->x_param_values
,
10666 global_options_set
.x_param_values
);
10667 if (aarch64_tune_params
.prefetch
->l1_cache_line_size
>= 0)
10668 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE
,
10669 aarch64_tune_params
.prefetch
->l1_cache_line_size
,
10670 opts
->x_param_values
,
10671 global_options_set
.x_param_values
);
10672 if (aarch64_tune_params
.prefetch
->l2_cache_size
>= 0)
10673 maybe_set_param_value (PARAM_L2_CACHE_SIZE
,
10674 aarch64_tune_params
.prefetch
->l2_cache_size
,
10675 opts
->x_param_values
,
10676 global_options_set
.x_param_values
);
10677 if (!aarch64_tune_params
.prefetch
->prefetch_dynamic_strides
)
10678 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES
,
10680 opts
->x_param_values
,
10681 global_options_set
.x_param_values
);
10682 if (aarch64_tune_params
.prefetch
->minimum_stride
>= 0)
10683 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE
,
10684 aarch64_tune_params
.prefetch
->minimum_stride
,
10685 opts
->x_param_values
,
10686 global_options_set
.x_param_values
);
10688 /* Use the alternative scheduling-pressure algorithm by default. */
10689 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM
, SCHED_PRESSURE_MODEL
,
10690 opts
->x_param_values
,
10691 global_options_set
.x_param_values
);
10693 /* Enable sw prefetching at specified optimization level for
10694 CPUS that have prefetch. Lower optimization level threshold by 1
10695 when profiling is enabled. */
10696 if (opts
->x_flag_prefetch_loop_arrays
< 0
10697 && !opts
->x_optimize_size
10698 && aarch64_tune_params
.prefetch
->default_opt_level
>= 0
10699 && opts
->x_optimize
>= aarch64_tune_params
.prefetch
->default_opt_level
)
10700 opts
->x_flag_prefetch_loop_arrays
= 1;
10702 aarch64_override_options_after_change_1 (opts
);
10705 /* Print a hint with a suggestion for a core or architecture name that
10706 most closely resembles what the user passed in STR. ARCH is true if
10707 the user is asking for an architecture name. ARCH is false if the user
10708 is asking for a core name. */
10711 aarch64_print_hint_for_core_or_arch (const char *str
, bool arch
)
10713 auto_vec
<const char *> candidates
;
10714 const struct processor
*entry
= arch
? all_architectures
: all_cores
;
10715 for (; entry
->name
!= NULL
; entry
++)
10716 candidates
.safe_push (entry
->name
);
10718 #ifdef HAVE_LOCAL_CPU_DETECT
10719 /* Add also "native" as possible value. */
10721 candidates
.safe_push ("native");
10725 const char *hint
= candidates_list_and_hint (str
, s
, candidates
);
10727 inform (input_location
, "valid arguments are: %s;"
10728 " did you mean %qs?", s
, hint
);
10730 inform (input_location
, "valid arguments are: %s", s
);
10735 /* Print a hint with a suggestion for a core name that most closely resembles
10736 what the user passed in STR. */
10739 aarch64_print_hint_for_core (const char *str
)
10741 aarch64_print_hint_for_core_or_arch (str
, false);
10744 /* Print a hint with a suggestion for an architecture name that most closely
10745 resembles what the user passed in STR. */
10748 aarch64_print_hint_for_arch (const char *str
)
10750 aarch64_print_hint_for_core_or_arch (str
, true);
10753 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
10754 specified in STR and throw errors if appropriate. Put the results if
10755 they are valid in RES and ISA_FLAGS. Return whether the option is
10759 aarch64_validate_mcpu (const char *str
, const struct processor
**res
,
10760 unsigned long *isa_flags
)
10762 enum aarch64_parse_opt_result parse_res
10763 = aarch64_parse_cpu (str
, res
, isa_flags
);
10765 if (parse_res
== AARCH64_PARSE_OK
)
10770 case AARCH64_PARSE_MISSING_ARG
:
10771 error ("missing cpu name in %<-mcpu=%s%>", str
);
10773 case AARCH64_PARSE_INVALID_ARG
:
10774 error ("unknown value %qs for -mcpu", str
);
10775 aarch64_print_hint_for_core (str
);
10777 case AARCH64_PARSE_INVALID_FEATURE
:
10778 error ("invalid feature modifier in %<-mcpu=%s%>", str
);
10781 gcc_unreachable ();
/* Validate a command-line -march option.  Parse the arch and extensions
   (if any) specified in STR and throw errors if appropriate.  Put the
   results, if they are valid, in RES and ISA_FLAGS.  Return whether the
   option is valid.  */

static bool
aarch64_validate_march (const char *str, const struct processor **res,
			unsigned long *isa_flags)
{
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_arch (str, res, isa_flags);

  if (parse_res == AARCH64_PARSE_OK)
    return true;

  switch (parse_res)
    {
      case AARCH64_PARSE_MISSING_ARG:
	error ("missing arch name in %<-march=%s%>", str);
	break;
      case AARCH64_PARSE_INVALID_ARG:
	error ("unknown value %qs for -march", str);
	aarch64_print_hint_for_arch (str);
	break;
      case AARCH64_PARSE_INVALID_FEATURE:
	error ("invalid feature modifier in %<-march=%s%>", str);
	break;
      default:
	gcc_unreachable ();
    }

  return false;
}
/* Validate a command-line -mtune option.  Parse the cpu
   specified in STR and throw errors if appropriate.  Put the
   result, if it is valid, in RES.  Return whether the option is
   valid.  */

static bool
aarch64_validate_mtune (const char *str, const struct processor **res)
{
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_tune (str, res);

  if (parse_res == AARCH64_PARSE_OK)
    return true;

  switch (parse_res)
    {
      case AARCH64_PARSE_MISSING_ARG:
	error ("missing cpu name in %<-mtune=%s%>", str);
	break;
      case AARCH64_PARSE_INVALID_ARG:
	error ("unknown value %qs for -mtune", str);
	aarch64_print_hint_for_core (str);
	break;
      default:
	gcc_unreachable ();
    }

  return false;
}
/* Return the CPU corresponding to the enum CPU.
   If it doesn't specify a cpu, return the default.  */

static const struct processor *
aarch64_get_tune_cpu (enum aarch64_processor cpu)
{
  if (cpu != aarch64_none)
    return &all_cores[cpu];

  /* The & 0x3f is to extract the bottom 6 bits that encode the
     default cpu as selected by the --with-cpu GCC configure option
     in config.gcc.
     ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
     flags mechanism should be reworked to make it more sane.  */
  return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
}
/* Return the architecture corresponding to the enum ARCH.
   If it doesn't specify a valid architecture, return the default.  */

static const struct processor *
aarch64_get_arch (enum aarch64_arch arch)
{
  if (arch != aarch64_no_arch)
    return &all_architectures[arch];

  const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];

  return &all_architectures[cpu->arch];
}
/* Return the VG value associated with -msve-vector-bits= value VALUE.  */

static poly_uint16
aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
{
  /* For now generate vector-length agnostic code for -msve-vector-bits=128.
     This ensures we can clearly distinguish SVE and Advanced SIMD modes when
     deciding which .md file patterns to use and when deciding whether
     something is a legitimate address or constant.  */
  if (value == SVE_SCALABLE || value == SVE_128)
    return poly_uint16 (2, 2);
  else
    return (int) value / 64;
}
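/* Illustrative note on the mapping above (derived from the code, not part
   of the original sources): each SVE vector granule (VG) is 64 bits, so
   -msve-vector-bits=256 yields VG 4 and -msve-vector-bits=512 yields VG 8,
   while both "scalable" and 128 produce the length-agnostic poly_uint16
   value (2, 2), i.e. a minimum of two granules plus a runtime-scaled
   component.  */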
/* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
   and is used to parse the -m{cpu,tune,arch} strings and setup the initial
   tuning structs.  In particular it must set selected_tune and
   aarch64_isa_flags that define the available ISA features and tuning
   decisions.  It must also set selected_arch as this will be used to
   output the .arch asm tags for each function.  */

static void
aarch64_override_options (void)
{
  unsigned long cpu_isa = 0;
  unsigned long arch_isa = 0;
  aarch64_isa_flags = 0;

  bool valid_cpu = true;
  bool valid_tune = true;
  bool valid_arch = true;

  selected_cpu = NULL;
  selected_arch = NULL;
  selected_tune = NULL;

  /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
     If either of -march or -mtune is given, they override their
     respective component of -mcpu.  */
  if (aarch64_cpu_string)
    valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
				       &cpu_isa);

  if (aarch64_arch_string)
    valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
					 &arch_isa);

  if (aarch64_tune_string)
    valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);

  /* If the user did not specify a processor, choose the default
     one for them.  This will be the CPU set during configuration using
     --with-cpu, otherwise it is "generic".  */
  if (!selected_cpu)
    {
      if (selected_arch)
	{
	  selected_cpu = &all_cores[selected_arch->ident];
	  aarch64_isa_flags = arch_isa;
	  explicit_arch = selected_arch->arch;
	}
      else
	{
	  /* Get default configure-time CPU.  */
	  selected_cpu = aarch64_get_tune_cpu (aarch64_none);
	  aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
	}

      if (selected_tune)
	explicit_tune_core = selected_tune->ident;
    }
  /* If both -mcpu and -march are specified check that they are architecturally
     compatible, warn if they're not and prefer the -march ISA flags.  */
  else if (selected_arch)
    {
      if (selected_arch->arch != selected_cpu->arch)
	{
	  warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
		   all_architectures[selected_cpu->arch].name,
		   selected_arch->name);
	}
      aarch64_isa_flags = arch_isa;
      explicit_arch = selected_arch->arch;
      explicit_tune_core = selected_tune ? selected_tune->ident
					 : selected_cpu->ident;
    }
  else
    {
      /* -mcpu but no -march.  */
      aarch64_isa_flags = cpu_isa;
      explicit_tune_core = selected_tune ? selected_tune->ident
					 : selected_cpu->ident;
      gcc_assert (selected_cpu);
      selected_arch = &all_architectures[selected_cpu->arch];
      explicit_arch = selected_arch->arch;
    }

  /* Set the arch as well as we will need it when outputting
     the .arch directive in assembly.  */
  if (!selected_arch)
    {
      gcc_assert (selected_cpu);
      selected_arch = &all_architectures[selected_cpu->arch];
    }

  if (!selected_tune)
    selected_tune = selected_cpu;

#ifndef HAVE_AS_MABI_OPTION
  /* The compiler may have been configured with 2.23.* binutils, which does
     not have support for ILP32.  */
  if (TARGET_ILP32)
    error ("assembler does not support -mabi=ilp32");
#endif

  /* Convert -msve-vector-bits to a VG count.  */
  aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);

  if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
    sorry ("return address signing is only supported for -mabi=lp64");

  /* Make sure we properly set up the explicit options.  */
  if ((aarch64_cpu_string && valid_cpu)
      || (aarch64_tune_string && valid_tune))
    gcc_assert (explicit_tune_core != aarch64_none);

  if ((aarch64_cpu_string && valid_cpu)
      || (aarch64_arch_string && valid_arch))
    gcc_assert (explicit_arch != aarch64_no_arch);

  aarch64_override_options_internal (&global_options);

  /* Save these options as the default ones in case we push and pop them later
     while processing functions with potential target attributes.  */
  target_option_default_node = target_option_current_node
    = build_target_option_node (&global_options);
}
/* Implement targetm.override_options_after_change.  */

static void
aarch64_override_options_after_change (void)
{
  aarch64_override_options_after_change_1 (&global_options);
}
static struct machine_function *
aarch64_init_machine_status (void)
{
  struct machine_function *machine;
  machine = ggc_cleared_alloc<machine_function> ();
  return machine;
}

void
aarch64_init_expanders (void)
{
  init_machine_status = aarch64_init_machine_status;
}
/* A checking mechanism for the implementation of the various code models.  */
static void
initialize_aarch64_code_model (struct gcc_options *opts)
{
  if (opts->x_flag_pic)
    {
      switch (opts->x_aarch64_cmodel_var)
	{
	case AARCH64_CMODEL_TINY:
	  aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
	  break;
	case AARCH64_CMODEL_SMALL:
#ifdef HAVE_AS_SMALL_PIC_RELOCS
	  aarch64_cmodel = (flag_pic == 2
			    ? AARCH64_CMODEL_SMALL_PIC
			    : AARCH64_CMODEL_SMALL_SPIC);
#else
	  aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
#endif
	  break;
	case AARCH64_CMODEL_LARGE:
	  sorry ("code model %qs with -f%s", "large",
		 opts->x_flag_pic > 1 ? "PIC" : "pic");
	  break;
	default:
	  gcc_unreachable ();
	}
    }
  else
    aarch64_cmodel = opts->x_aarch64_cmodel_var;
}
/* Implement TARGET_OPTION_SAVE.  */

static void
aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
{
  ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
}
/* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
   using the information saved in PTR.  */

static void
aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
{
  opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
  selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
  opts->x_explicit_arch = ptr->x_explicit_arch;
  selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
  opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;

  aarch64_override_options_internal (opts);
}
/* Implement TARGET_OPTION_PRINT.  */

static void
aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
{
  const struct processor *cpu
    = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
  unsigned long isa_flags = ptr->x_aarch64_isa_flags;
  const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
  std::string extension
    = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);

  fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
  fprintf (file, "%*sselected arch = %s%s\n", indent, "",
	   arch->name, extension.c_str ());
}
static GTY(()) tree aarch64_previous_fndecl;

void
aarch64_reset_previous_fndecl (void)
{
  aarch64_previous_fndecl = NULL;
}
/* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
   Used by aarch64_set_current_function and aarch64_pragma_target_parse to
   make sure optab availability predicates are recomputed when necessary.  */

void
aarch64_save_restore_target_globals (tree new_tree)
{
  if (TREE_TARGET_GLOBALS (new_tree))
    restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
  else if (new_tree == target_option_default_node)
    restore_target_globals (&default_target_globals);
  else
    TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
}
/* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
   like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
   of the function, if such exists.  This function may be called multiple
   times on a single function so use aarch64_previous_fndecl to avoid
   setting up identical state.  */

static void
aarch64_set_current_function (tree fndecl)
{
  if (!fndecl || fndecl == aarch64_previous_fndecl)
    return;

  tree old_tree = (aarch64_previous_fndecl
		   ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
		   : NULL_TREE);

  tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);

  /* If current function has no attributes but the previous one did,
     use the default node.  */
  if (!new_tree && old_tree)
    new_tree = target_option_default_node;

  /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
     the default have been handled by aarch64_save_restore_target_globals from
     aarch64_pragma_target_parse.  */
  if (old_tree == new_tree)
    return;

  aarch64_previous_fndecl = fndecl;

  /* First set the target options.  */
  cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));

  aarch64_save_restore_target_globals (new_tree);
}
/* Enum describing the various ways we can handle attributes.
   In many cases we can reuse the generic option handling machinery.  */

enum aarch64_attr_opt_type
{
  aarch64_attr_mask,	/* Attribute should set a bit in target_flags.  */
  aarch64_attr_bool,	/* Attribute sets or unsets a boolean variable.  */
  aarch64_attr_enum,	/* Attribute sets an enum variable.  */
  aarch64_attr_custom	/* Attribute requires a custom handling function.  */
};

/* All the information needed to handle a target attribute.
   NAME is the name of the attribute.
   ATTR_TYPE specifies the type of behavior of the attribute as described
   in the definition of enum aarch64_attr_opt_type.
   ALLOW_NEG is true if the attribute supports a "no-" form.
   HANDLER is the function that takes the attribute string as an argument.
   It is needed only when the ATTR_TYPE is aarch64_attr_custom.
   OPT_NUM is the enum specifying the option that the attribute modifies.
   This is needed for attributes that mirror the behavior of a command-line
   option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
   aarch64_attr_enum.  */

struct aarch64_attribute_info
{
  const char *name;
  enum aarch64_attr_opt_type attr_type;
  bool allow_neg;
  bool (*handler) (const char *);
  enum opt_code opt_num;
};
/* Handle the ARCH_STR argument to the arch= target attribute.  */

static bool
aarch64_handle_attr_arch (const char *str)
{
  const struct processor *tmp_arch = NULL;
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);

  if (parse_res == AARCH64_PARSE_OK)
    {
      gcc_assert (tmp_arch);
      selected_arch = tmp_arch;
      explicit_arch = selected_arch->arch;
      return true;
    }

  switch (parse_res)
    {
      case AARCH64_PARSE_MISSING_ARG:
	error ("missing name in %<target(\"arch=\")%> pragma or attribute");
	break;
      case AARCH64_PARSE_INVALID_ARG:
	error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
	aarch64_print_hint_for_arch (str);
	break;
      case AARCH64_PARSE_INVALID_FEATURE:
	error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
	break;
      default:
	gcc_unreachable ();
    }

  return false;
}
/* Handle the argument CPU_STR to the cpu= target attribute.  */

static bool
aarch64_handle_attr_cpu (const char *str)
{
  const struct processor *tmp_cpu = NULL;
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);

  if (parse_res == AARCH64_PARSE_OK)
    {
      gcc_assert (tmp_cpu);
      selected_tune = tmp_cpu;
      explicit_tune_core = selected_tune->ident;

      selected_arch = &all_architectures[tmp_cpu->arch];
      explicit_arch = selected_arch->arch;
      return true;
    }

  switch (parse_res)
    {
      case AARCH64_PARSE_MISSING_ARG:
	error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
	break;
      case AARCH64_PARSE_INVALID_ARG:
	error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
	aarch64_print_hint_for_core (str);
	break;
      case AARCH64_PARSE_INVALID_FEATURE:
	error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
	break;
      default:
	gcc_unreachable ();
    }

  return false;
}
/* Handle the argument STR to the tune= target attribute.  */

static bool
aarch64_handle_attr_tune (const char *str)
{
  const struct processor *tmp_tune = NULL;
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_tune (str, &tmp_tune);

  if (parse_res == AARCH64_PARSE_OK)
    {
      gcc_assert (tmp_tune);
      selected_tune = tmp_tune;
      explicit_tune_core = selected_tune->ident;
      return true;
    }

  switch (parse_res)
    {
      case AARCH64_PARSE_INVALID_ARG:
	error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
	aarch64_print_hint_for_core (str);
	break;
      default:
	gcc_unreachable ();
    }

  return false;
}
/* Parse an architecture extensions target attribute string specified in STR.
   For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
   if successful.  Update aarch64_isa_flags to reflect the new ISA
   feature flags.  */

static bool
aarch64_handle_attr_isa_flags (char *str)
{
  enum aarch64_parse_opt_result parse_res;
  unsigned long isa_flags = aarch64_isa_flags;

  /* We allow "+nothing" in the beginning to clear out all architectural
     features if the user wants to handpick specific features.  */
  if (strncmp ("+nothing", str, 8) == 0)
    {
      isa_flags = 0;
      str += 8;
    }

  parse_res = aarch64_parse_extension (str, &isa_flags);

  if (parse_res == AARCH64_PARSE_OK)
    {
      aarch64_isa_flags = isa_flags;
      return true;
    }

  switch (parse_res)
    {
      case AARCH64_PARSE_MISSING_ARG:
	error ("missing value in %<target()%> pragma or attribute");
	break;

      case AARCH64_PARSE_INVALID_FEATURE:
	error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
	break;

      default:
	gcc_unreachable ();
    }

  return false;
}
/* The target attributes that we support.  On top of these we also support just
   ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
   handled explicitly in aarch64_process_one_target_attr.  */

static const struct aarch64_attribute_info aarch64_attributes[] =
{
  { "general-regs-only", aarch64_attr_mask, false, NULL,
     OPT_mgeneral_regs_only },
  { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
     OPT_mfix_cortex_a53_835769 },
  { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
     OPT_mfix_cortex_a53_843419 },
  { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
  { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
  { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
     OPT_momit_leaf_frame_pointer },
  { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
  { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
     OPT_march_ },
  { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
  { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
     OPT_mtune_ },
  { "sign-return-address", aarch64_attr_enum, false, NULL,
     OPT_msign_return_address_ },
  { NULL, aarch64_attr_custom, false, NULL, OPT____ }
};
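/* Illustrative example of how the table above is consumed (user-level code,
   not part of this file):

     __attribute__ ((target ("arch=armv8.1-a+crc")))
     int f (int x);

     __attribute__ ((target ("no-strict-align,cmodel=small")))
     int g (void);

   "arch=" is routed to the custom handler aarch64_handle_attr_arch,
   "strict-align" (here in its negated "no-" form, which the table allows)
   goes through the target_flags mask machinery, and "cmodel=" is handled by
   the generic enum option-setting code.  */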
/* Parse ARG_STR which contains the definition of one target attribute.
   Show appropriate errors if any or return true if the attribute is valid.  */

static bool
aarch64_process_one_target_attr (char *arg_str)
{
  bool invert = false;

  size_t len = strlen (arg_str);

  if (len == 0)
    {
      error ("malformed %<target()%> pragma or attribute");
      return false;
    }

  char *str_to_check = (char *) alloca (len + 1);
  strcpy (str_to_check, arg_str);

  /* Skip leading whitespace.  */
  while (*str_to_check == ' ' || *str_to_check == '\t')
    str_to_check++;

  /* We have something like __attribute__ ((target ("+fp+nosimd"))).
     It is easier to detect and handle it explicitly here rather than going
     through the machinery for the rest of the target attributes in this
     function.  */
  if (*str_to_check == '+')
    return aarch64_handle_attr_isa_flags (str_to_check);

  if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
    {
      invert = true;
      str_to_check += 3;
    }
  char *arg = strchr (str_to_check, '=');

  /* If we found opt=foo then terminate STR_TO_CHECK at the '='
     and point ARG to "foo".  */
  if (arg)
    {
      *arg = '\0';
      arg++;
    }
  const struct aarch64_attribute_info *p_attr;
  bool found = false;
  for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
    {
      /* If the names don't match up, or the user has given an argument
	 to an attribute that doesn't accept one, or didn't give an argument
	 to an attribute that expects one, fail to match.  */
      if (strcmp (str_to_check, p_attr->name) != 0)
	continue;

      found = true;
      bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
			     || p_attr->attr_type == aarch64_attr_enum;

      if (attr_need_arg_p ^ (arg != NULL))
	{
	  error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
	  return false;
	}

      /* If the name matches but the attribute does not allow "no-" versions
	 then we can't match.  */
      if (invert && !p_attr->allow_neg)
	{
	  error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
	  return false;
	}

      switch (p_attr->attr_type)
	{
	/* Has a custom handler registered.
	   For example, cpu=, arch=, tune=.  */
	  case aarch64_attr_custom:
	    gcc_assert (p_attr->handler);
	    if (!p_attr->handler (arg))
	      return false;
	    break;

	  /* Either set or unset a boolean option.  */
	  case aarch64_attr_bool:
	    {
	      struct cl_decoded_option decoded;

	      generate_option (p_attr->opt_num, NULL, !invert,
			       CL_TARGET, &decoded);
	      aarch64_handle_option (&global_options, &global_options_set,
				     &decoded, input_location);
	      break;
	    }
	  /* Set or unset a bit in the target_flags.  aarch64_handle_option
	     should know what mask to apply given the option number.  */
	  case aarch64_attr_mask:
	    {
	      struct cl_decoded_option decoded;
	      /* We only need to specify the option number.
		 aarch64_handle_option will know which mask to apply.  */
	      decoded.opt_index = p_attr->opt_num;
	      decoded.value = !invert;
	      aarch64_handle_option (&global_options, &global_options_set,
				     &decoded, input_location);
	      break;
	    }
	  /* Use the option setting machinery to set an option to an enum.  */
	  case aarch64_attr_enum:
	    {
	      gcc_assert (arg);
	      bool valid;
	      int value;
	      valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
					     &value, CL_TARGET);
	      if (valid)
		set_option (&global_options, NULL, p_attr->opt_num, value,
			    NULL, DK_UNSPECIFIED, input_location,
			    global_dc);
	      else
		error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
	      break;
	    }
	  default:
	    gcc_unreachable ();
	}
    }

  /* If we reached here we either have found an attribute and validated
     it or didn't match any.  If we matched an attribute but its arguments
     were malformed we will have returned false already.  */
  return found;
}
/* Count how many times the character C appears in
   NULL-terminated string STR.  */

static unsigned int
num_occurences_in_str (char c, char *str)
{
  unsigned int res = 0;
  while (*str != '\0')
    {
      if (*str == c)
	res++;

      str++;
    }

  return res;
}
/* Parse the tree in ARGS that contains the target attribute information
   and update the global target options space.  */

bool
aarch64_process_target_attr (tree args)
{
  if (TREE_CODE (args) == TREE_LIST)
    {
      do
	{
	  tree head = TREE_VALUE (args);
	  if (head)
	    if (!aarch64_process_target_attr (head))
	      return false;

	  args = TREE_CHAIN (args);
	} while (args);

      return true;
    }

  if (TREE_CODE (args) != STRING_CST)
    {
      error ("attribute %<target%> argument not a string");
      return false;
    }

  size_t len = strlen (TREE_STRING_POINTER (args));
  char *str_to_check = (char *) alloca (len + 1);
  strcpy (str_to_check, TREE_STRING_POINTER (args));

  if (len == 0)
    {
      error ("malformed %<target()%> pragma or attribute");
      return false;
    }

  /* Used to catch empty spaces between commas i.e.
     attribute ((target ("attr1,,attr2"))).  */
  unsigned int num_commas = num_occurences_in_str (',', str_to_check);

  /* Handle multiple target attributes separated by ','.  */
  char *token = strtok (str_to_check, ",");
  unsigned int num_attrs = 0;
  while (token)
    {
      num_attrs++;
      if (!aarch64_process_one_target_attr (token))
	{
	  error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
	  return false;
	}

      token = strtok (NULL, ",");
    }

  if (num_attrs != num_commas + 1)
    {
      error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
      return false;
    }

  return true;
}
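/* Illustrative note (derived from the code above, not from the original
   sources): target ("tune=cortex-a57,+crc") is split on ',' into two tokens,
   each fed to aarch64_process_one_target_attr.  A string such as
   "tune=cortex-a57,,+crc" still yields only two tokens from strtok but
   contains two commas, so the num_attrs != num_commas + 1 check rejects it
   as malformed.  */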
/* Implement TARGET_OPTION_VALID_ATTRIBUTE_P.  This is used to
   process attribute ((target ("..."))).  */

static bool
aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
{
  struct cl_target_option cur_target;
  bool ret;
  tree old_optimize;
  tree new_target, new_optimize;
  tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);

  /* If what we're processing is the current pragma string then the
     target option node is already stored in target_option_current_node
     by aarch64_pragma_target_parse in aarch64-c.c.  Use that to avoid
     having to re-parse the string.  This is especially useful to keep
     arm_neon.h compile times down since that header contains a lot
     of intrinsics enclosed in pragmas.  */
  if (!existing_target && args == current_target_pragma)
    {
      DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
      return true;
    }
  tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);

  old_optimize = build_optimization_node (&global_options);
  func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);

  /* If the function changed the optimization levels as well as setting
     target options, start with the optimizations specified.  */
  if (func_optimize && func_optimize != old_optimize)
    cl_optimization_restore (&global_options,
			     TREE_OPTIMIZATION (func_optimize));

  /* Save the current target options to restore at the end.  */
  cl_target_option_save (&cur_target, &global_options);

  /* If fndecl already has some target attributes applied to it, unpack
     them so that we add this attribute on top of them, rather than
     overwriting them.  */
  if (existing_target)
    {
      struct cl_target_option *existing_options
	= TREE_TARGET_OPTION (existing_target);

      if (existing_options)
	cl_target_option_restore (&global_options, existing_options);
    }
  else
    cl_target_option_restore (&global_options,
			      TREE_TARGET_OPTION (target_option_current_node));

  ret = aarch64_process_target_attr (args);

  /* Set up any additional state.  */
  if (ret)
    {
      aarch64_override_options_internal (&global_options);
      /* Initialize SIMD builtins if we haven't already.
	 Set current_target_pragma to NULL for the duration so that
	 the builtin initialization code doesn't try to tag the functions
	 being built with the attributes specified by any current pragma, thus
	 going into an infinite recursion.  */
      if (TARGET_SIMD)
	{
	  tree saved_current_target_pragma = current_target_pragma;
	  current_target_pragma = NULL;
	  aarch64_init_simd_builtins ();
	  current_target_pragma = saved_current_target_pragma;
	}
      new_target = build_target_option_node (&global_options);
    }
  else
    new_target = NULL;

  new_optimize = build_optimization_node (&global_options);

  if (fndecl && ret)
    {
      DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;

      if (old_optimize != new_optimize)
	DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
    }

  cl_target_option_restore (&global_options, &cur_target);

  if (old_optimize != new_optimize)
    cl_optimization_restore (&global_options,
			     TREE_OPTIMIZATION (old_optimize));
  return ret;
}
/* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
   tri-bool options (yes, no, don't care) and the default value is
   DEF, determine whether to reject inlining.  */

static bool
aarch64_tribools_ok_for_inlining_p (int caller, int callee,
				    int dont_care, int def)
{
  /* If the callee doesn't care, always allow inlining.  */
  if (callee == dont_care)
    return true;

  /* If the caller doesn't care, always allow inlining.  */
  if (caller == dont_care)
    return true;

  /* Otherwise, allow inlining if either the callee and caller values
     agree, or if the callee is using the default value.  */
  return (callee == caller || callee == def);
}
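/* Illustrative summary of the helper above, with DONT_CARE == 2 as passed by
   the callers below (2 meaning the option was not given explicitly):

     caller  callee  inlining allowed?
     ------  ------  -----------------
	2	*     yes
	*	2     yes
	0	1     only if DEF == 1
	1	0     only if DEF == 0
	0	0     yes
	1	1     yes  */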
/* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
   to inline CALLEE into CALLER based on target-specific info.
   Make sure that the caller and callee have compatible architectural
   features.  Then go through the other possible target attributes
   and see if they can block inlining.  Try not to reject always_inline
   callees unless they are incompatible architecturally.  */

static bool
aarch64_can_inline_p (tree caller, tree callee)
{
  tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
  tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);

  struct cl_target_option *caller_opts
    = TREE_TARGET_OPTION (caller_tree ? caller_tree
				      : target_option_default_node);

  struct cl_target_option *callee_opts
    = TREE_TARGET_OPTION (callee_tree ? callee_tree
				      : target_option_default_node);

  /* Callee's ISA flags should be a subset of the caller's.  */
  if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
      != callee_opts->x_aarch64_isa_flags)
    return false;

  /* Allow non-strict aligned functions inlining into strict
     aligned ones.  */
  if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
       != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
      && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
	   && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
    return false;

  bool always_inline = lookup_attribute ("always_inline",
					 DECL_ATTRIBUTES (callee));

  /* If the architectural features match up and the callee is always_inline
     then the other attributes don't matter.  */
  if (always_inline)
    return true;

  if (caller_opts->x_aarch64_cmodel_var
      != callee_opts->x_aarch64_cmodel_var)
    return false;

  if (caller_opts->x_aarch64_tls_dialect
      != callee_opts->x_aarch64_tls_dialect)
    return false;

  /* Honour explicit requests to workaround errata.  */
  if (!aarch64_tribools_ok_for_inlining_p (
	  caller_opts->x_aarch64_fix_a53_err835769,
	  callee_opts->x_aarch64_fix_a53_err835769,
	  2, TARGET_FIX_ERR_A53_835769_DEFAULT))
    return false;

  if (!aarch64_tribools_ok_for_inlining_p (
	  caller_opts->x_aarch64_fix_a53_err843419,
	  callee_opts->x_aarch64_fix_a53_err843419,
	  2, TARGET_FIX_ERR_A53_843419))
    return false;

  /* If the user explicitly specified -momit-leaf-frame-pointer for the
     caller and callee and they don't match up, reject inlining.  */
  if (!aarch64_tribools_ok_for_inlining_p (
	  caller_opts->x_flag_omit_leaf_frame_pointer,
	  callee_opts->x_flag_omit_leaf_frame_pointer,
	  2, 1))
    return false;

  /* If the callee has specific tuning overrides, respect them.  */
  if (callee_opts->x_aarch64_override_tune_string != NULL
      && caller_opts->x_aarch64_override_tune_string == NULL)
    return false;

  /* If the user specified tuning override strings for the
     caller and callee and they don't match up, reject inlining.
     We just do a string compare here, we don't analyze the meaning
     of the string, as it would be too costly for little gain.  */
  if (callee_opts->x_aarch64_override_tune_string
      && caller_opts->x_aarch64_override_tune_string
      && (strcmp (callee_opts->x_aarch64_override_tune_string,
		  caller_opts->x_aarch64_override_tune_string) != 0))
    return false;

  return true;
}
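/* Illustrative example of the ISA-subset rule above (user code, not part of
   this file):

     __attribute__ ((target ("+crc"))) static inline int callee (int x);
     int caller (int x) { return callee (x); }

   Here the callee's ISA flags include CRC while the caller's need not, so
   inlining is rejected; conversely, a callee without extra ISA flags can
   still be inlined into a "+crc" caller because its flags are a subset of
   the caller's.  */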
/* Return true if SYMBOL_REF X binds locally.  */

static bool
aarch64_symbol_binds_local_p (const_rtx x)
{
  return (SYMBOL_REF_DECL (x)
	  ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
	  : SYMBOL_REF_LOCAL_P (x));
}
/* Return true if SYMBOL_REF X is thread local.  */
static bool
aarch64_tls_symbol_p (rtx x)
{
  if (! TARGET_HAVE_TLS)
    return false;

  if (GET_CODE (x) != SYMBOL_REF)
    return false;

  return SYMBOL_REF_TLS_MODEL (x) != 0;
}
/* Classify a TLS symbol into one of the TLS kinds.  */
enum aarch64_symbol_type
aarch64_classify_tls_symbol (rtx x)
{
  enum tls_model tls_kind = tls_symbolic_operand_type (x);

  switch (tls_kind)
    {
    case TLS_MODEL_GLOBAL_DYNAMIC:
    case TLS_MODEL_LOCAL_DYNAMIC:
      return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;

    case TLS_MODEL_INITIAL_EXEC:
      switch (aarch64_cmodel)
	{
	case AARCH64_CMODEL_TINY:
	case AARCH64_CMODEL_TINY_PIC:
	  return SYMBOL_TINY_TLSIE;
	default:
	  return SYMBOL_SMALL_TLSIE;
	}

    case TLS_MODEL_LOCAL_EXEC:
      if (aarch64_tls_size == 12)
	return SYMBOL_TLSLE12;
      else if (aarch64_tls_size == 24)
	return SYMBOL_TLSLE24;
      else if (aarch64_tls_size == 32)
	return SYMBOL_TLSLE32;
      else if (aarch64_tls_size == 48)
	return SYMBOL_TLSLE48;
      else
	gcc_unreachable ();

    case TLS_MODEL_EMULATED:
    case TLS_MODEL_NONE:
      return SYMBOL_FORCE_TO_MEM;

    default:
      gcc_unreachable ();
    }
}
/* Return the correct method for accessing X + OFFSET, where X is either
   a SYMBOL_REF or LABEL_REF.  */

enum aarch64_symbol_type
aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
{
  if (GET_CODE (x) == LABEL_REF)
    {
      switch (aarch64_cmodel)
	{
	case AARCH64_CMODEL_LARGE:
	  return SYMBOL_FORCE_TO_MEM;

	case AARCH64_CMODEL_TINY_PIC:
	case AARCH64_CMODEL_TINY:
	  return SYMBOL_TINY_ABSOLUTE;

	case AARCH64_CMODEL_SMALL_SPIC:
	case AARCH64_CMODEL_SMALL_PIC:
	case AARCH64_CMODEL_SMALL:
	  return SYMBOL_SMALL_ABSOLUTE;

	default:
	  gcc_unreachable ();
	}
    }

  if (GET_CODE (x) == SYMBOL_REF)
    {
      if (aarch64_tls_symbol_p (x))
	return aarch64_classify_tls_symbol (x);

      switch (aarch64_cmodel)
	{
	case AARCH64_CMODEL_TINY:
	  /* When we retrieve symbol + offset address, we have to make sure
	     the offset does not cause overflow of the final address.  But
	     we have no way of knowing the address of symbol at compile time
	     so we can't accurately say if the distance between the PC and
	     symbol + offset is outside the addressable range of +/-1M in the
	     TINY code model.  So we rely on images not being greater than
	     1M and cap the offset at 1M and anything beyond 1M will have to
	     be loaded using an alternative mechanism.  Furthermore if the
	     symbol is a weak reference to something that isn't known to
	     resolve to a symbol in this module, then force to memory.  */
	  if ((SYMBOL_REF_WEAK (x)
	       && !aarch64_symbol_binds_local_p (x))
	      || !IN_RANGE (offset, -1048575, 1048575))
	    return SYMBOL_FORCE_TO_MEM;
	  return SYMBOL_TINY_ABSOLUTE;

	case AARCH64_CMODEL_SMALL:
	  /* Same reasoning as the tiny code model, but the offset cap here is
	     4G.  */
	  if ((SYMBOL_REF_WEAK (x)
	       && !aarch64_symbol_binds_local_p (x))
	      || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
			    HOST_WIDE_INT_C (4294967264)))
	    return SYMBOL_FORCE_TO_MEM;
	  return SYMBOL_SMALL_ABSOLUTE;

	case AARCH64_CMODEL_TINY_PIC:
	  if (!aarch64_symbol_binds_local_p (x))
	    return SYMBOL_TINY_GOT;
	  return SYMBOL_TINY_ABSOLUTE;

	case AARCH64_CMODEL_SMALL_SPIC:
	case AARCH64_CMODEL_SMALL_PIC:
	  if (!aarch64_symbol_binds_local_p (x))
	    return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
		    ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
	  return SYMBOL_SMALL_ABSOLUTE;

	case AARCH64_CMODEL_LARGE:
	  /* This is alright even in PIC code as the constant
	     pool reference is always PC relative and within
	     the same translation unit.  */
	  if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
	    return SYMBOL_SMALL_ABSOLUTE;
	  else
	    return SYMBOL_FORCE_TO_MEM;

	default:
	  gcc_unreachable ();
	}
    }

  /* By default push everything into the constant pool.  */
  return SYMBOL_FORCE_TO_MEM;
}
bool
aarch64_constant_address_p (rtx x)
{
  return (CONSTANT_P (x) && memory_address_p (DImode, x));
}
bool
aarch64_legitimate_pic_operand_p (rtx x)
{
  if (GET_CODE (x) == SYMBOL_REF
      || (GET_CODE (x) == CONST
	  && GET_CODE (XEXP (x, 0)) == PLUS
	  && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
    return false;

  return true;
}
/* Implement TARGET_LEGITIMATE_CONSTANT_P hook.  Return true for constants
   that should be rematerialized rather than spilled.  */

static bool
aarch64_legitimate_constant_p (machine_mode mode, rtx x)
{
  /* Support CSE and rematerialization of common constants.  */
  if (CONST_INT_P (x)
      || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
      || GET_CODE (x) == CONST_VECTOR)
    return true;

  /* Do not allow vector struct mode constants for Advanced SIMD.
     We could support 0 and -1 easily, but they need support in
     aarch64-simd.md.  */
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
    return false;

  /* Only accept variable-length vector constants if they can be
     handled directly.

     ??? It would be possible to handle rematerialization of other
     constants via secondary reloads.  */
  if (vec_flags & VEC_ANY_SVE)
    return aarch64_simd_valid_immediate (x, NULL);

  if (GET_CODE (x) == HIGH)
    x = XEXP (x, 0);

  /* Accept polynomial constants that can be calculated by using the
     destination of a move as the sole temporary.  Constants that
     require a second temporary cannot be rematerialized (they can't be
     forced to memory and also aren't legitimate constants).  */
  poly_int64 offset;
  if (poly_int_rtx_p (x, &offset))
    return aarch64_offset_temporaries (false, offset) <= 1;

  /* If an offset is being added to something else, we need to allow the
     base to be moved into the destination register, meaning that there
     are no free temporaries for the offset.  */
  x = strip_offset (x, &offset);
  if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
    return false;

  /* Do not allow const (plus (anchor_symbol, const_int)).  */
  if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
    return false;

  /* Treat symbols as constants.  Avoid TLS symbols as they are complex,
     so spilling them is better than rematerialization.  */
  if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
    return true;

  /* Label references are always constant.  */
  if (GET_CODE (x) == LABEL_REF)
    return true;

  return false;
}
rtx
aarch64_load_tp (rtx target)
{
  if (!target
      || GET_MODE (target) != Pmode
      || !register_operand (target, Pmode))
    target = gen_reg_rtx (Pmode);

  /* Can return in any reg.  */
  emit_insn (gen_aarch64_load_tp_hard (target));
  return target;
}
/* On AAPCS systems, this is the "struct __va_list".  */
static GTY(()) tree va_list_type;

/* Implement TARGET_BUILD_BUILTIN_VA_LIST.
   Return the type to use as __builtin_va_list.

   AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:

   struct __va_list
   {
     void *__stack;
     void *__gr_top;
     void *__vr_top;
     int   __gr_offs;
     int   __vr_offs;
   };  */

static tree
aarch64_build_builtin_va_list (void)
{
  tree va_list_name;
  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;

  /* Create the type.  */
  va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
  /* Give it the required name.  */
  va_list_name = build_decl (BUILTINS_LOCATION,
			     TYPE_DECL,
			     get_identifier ("__va_list"),
			     va_list_type);
  DECL_ARTIFICIAL (va_list_name) = 1;
  TYPE_NAME (va_list_type) = va_list_name;
  TYPE_STUB_DECL (va_list_type) = va_list_name;

  /* Create the fields.  */
  f_stack = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__stack"),
			ptr_type_node);
  f_grtop = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__gr_top"),
			ptr_type_node);
  f_vrtop = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__vr_top"),
			ptr_type_node);
  f_groff = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__gr_offs"),
			integer_type_node);
  f_vroff = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__vr_offs"),
			integer_type_node);

  /* Tell tree-stdarg pass about our internal offset fields.
     NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
     purposes, to identify whether the code is updating va_list internal
     offset fields through an irregular way.  */
  va_list_gpr_counter_field = f_groff;
  va_list_fpr_counter_field = f_vroff;

  DECL_ARTIFICIAL (f_stack) = 1;
  DECL_ARTIFICIAL (f_grtop) = 1;
  DECL_ARTIFICIAL (f_vrtop) = 1;
  DECL_ARTIFICIAL (f_groff) = 1;
  DECL_ARTIFICIAL (f_vroff) = 1;

  DECL_FIELD_CONTEXT (f_stack) = va_list_type;
  DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
  DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
  DECL_FIELD_CONTEXT (f_groff) = va_list_type;
  DECL_FIELD_CONTEXT (f_vroff) = va_list_type;

  TYPE_FIELDS (va_list_type) = f_stack;
  DECL_CHAIN (f_stack) = f_grtop;
  DECL_CHAIN (f_grtop) = f_vrtop;
  DECL_CHAIN (f_vrtop) = f_groff;
  DECL_CHAIN (f_groff) = f_vroff;

  /* Compute its layout.  */
  layout_type (va_list_type);

  return va_list_type;
}
/* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
static void
aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
{
  const CUMULATIVE_ARGS *cum;
  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
  tree stack, grtop, vrtop, groff, vroff;
  tree t;
  int gr_save_area_size = cfun->va_list_gpr_size;
  int vr_save_area_size = cfun->va_list_fpr_size;
  int vr_offset;

  cum = &crtl->args.info;
  if (cfun->va_list_gpr_size)
    gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
			     cfun->va_list_gpr_size);
  if (cfun->va_list_fpr_size)
    vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
			     * UNITS_PER_VREG, cfun->va_list_fpr_size);

  if (!TARGET_FLOAT)
    {
      gcc_assert (cum->aapcs_nvrn == 0);
      vr_save_area_size = 0;
    }

  f_stack = TYPE_FIELDS (va_list_type_node);
  f_grtop = DECL_CHAIN (f_stack);
  f_vrtop = DECL_CHAIN (f_grtop);
  f_groff = DECL_CHAIN (f_vrtop);
  f_vroff = DECL_CHAIN (f_groff);

  stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
		  NULL_TREE);
  grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
		  NULL_TREE);
  vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
		  NULL_TREE);
  groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
		  NULL_TREE);
  vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
		  NULL_TREE);

  /* Emit code to initialize STACK, which points to the next varargs stack
     argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
     by named arguments.  STACK is 8-byte aligned.  */
  t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
  if (cum->aapcs_stack_size > 0)
    t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
  t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Emit code to initialize GRTOP, the top of the GR save area.
     virtual_incoming_args_rtx should have been 16 byte aligned.  */
  t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
  t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Emit code to initialize VRTOP, the top of the VR save area.
     This address is gr_save_area_bytes below GRTOP, rounded
     down to the next 16-byte boundary.  */
  t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
  vr_offset = ROUND_UP (gr_save_area_size,
			STACK_BOUNDARY / BITS_PER_UNIT);

  if (vr_offset)
    t = fold_build_pointer_plus_hwi (t, -vr_offset);
  t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Emit code to initialize GROFF, the offset from GRTOP of the
     next GPR argument.  */
  t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
	      build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Likewise emit code to initialize VROFF, the offset from FTOP
     of the next VR argument.  */
  t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
	      build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
}
/* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */

static tree
aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
			      gimple_seq *post_p ATTRIBUTE_UNUSED)
{
  tree addr;
  bool indirect_p;
  bool is_ha;		/* is HFA or HVA.  */
  bool dw_align;	/* double-word align.  */
  machine_mode ag_mode = VOIDmode;
  int nregs;
  machine_mode mode;

  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
  tree stack, f_top, f_off, off, arg, roundup, on_stack;
  HOST_WIDE_INT size, rsize, adjust, align;
  tree t, u, cond1, cond2;

  indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
  if (indirect_p)
    type = build_pointer_type (type);

  mode = TYPE_MODE (type);

  f_stack = TYPE_FIELDS (va_list_type_node);
  f_grtop = DECL_CHAIN (f_stack);
  f_vrtop = DECL_CHAIN (f_grtop);
  f_groff = DECL_CHAIN (f_vrtop);
  f_vroff = DECL_CHAIN (f_groff);

  stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
		  f_stack, NULL_TREE);
  size = int_size_in_bytes (type);
  align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;

  dw_align = false;
  adjust = 0;
  if (aarch64_vfp_is_call_or_return_candidate (mode,
					       type,
					       &ag_mode,
					       &nregs,
					       &is_ha))
    {
      /* No frontends can create types with variable-sized modes, so we
	 shouldn't be asked to pass or return them.  */
      unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();

      /* TYPE passed in fp/simd registers.  */
      if (!TARGET_FLOAT)
	aarch64_err_no_fpadvsimd (mode);

      f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
		      unshare_expr (valist), f_vrtop, NULL_TREE);
      f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
		      unshare_expr (valist), f_vroff, NULL_TREE);

      rsize = nregs * UNITS_PER_VREG;

      if (is_ha)
	{
	  if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
	    adjust = UNITS_PER_VREG - ag_size;
	}
      else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
	       && size < UNITS_PER_VREG)
	{
	  adjust = UNITS_PER_VREG - size;
	}
    }
  else
    {
      /* TYPE passed in general registers.  */
      f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
		      unshare_expr (valist), f_grtop, NULL_TREE);
      f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
		      unshare_expr (valist), f_groff, NULL_TREE);
      rsize = ROUND_UP (size, UNITS_PER_WORD);
      nregs = rsize / UNITS_PER_WORD;

      if (align > 8)
	dw_align = true;

      if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
	  && size < UNITS_PER_WORD)
	{
	  adjust = UNITS_PER_WORD - size;
	}
    }

  /* Get a local temporary for the field value.  */
  off = get_initialized_tmp_var (f_off, pre_p, NULL);

  /* Emit code to branch if off >= 0.  */
  t = build2 (GE_EXPR, boolean_type_node, off,
	      build_int_cst (TREE_TYPE (off), 0));
  cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);

  if (dw_align)
    {
      /* Emit: offs = (offs + 15) & -16.  */
      t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
		  build_int_cst (TREE_TYPE (off), 15));
      t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
		  build_int_cst (TREE_TYPE (off), -16));
      roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
    }
  else
    roundup = NULL;

  /* Update ap.__[g|v]r_offs  */
  t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
	      build_int_cst (TREE_TYPE (off), rsize));
  t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);

  /* String up.  */
  if (roundup)
    t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);

  /* [cond2] if (ap.__[g|v]r_offs > 0)  */
  u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
	      build_int_cst (TREE_TYPE (f_off), 0));
  cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);

  /* String up: make sure the assignment happens before the use.  */
  t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
  COND_EXPR_ELSE (cond1) = t;

  /* Prepare the trees handling the argument that is passed on the stack;
     the top-level node will be stored in ON_STACK.  */
  arg = get_initialized_tmp_var (stack, pre_p, NULL);
  if (align > 8)
    {
      /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
      t = fold_build_pointer_plus_hwi (arg, 15);
      t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
		  build_int_cst (TREE_TYPE (t), -16));
      roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
    }
  else
    roundup = NULL;
  /* Advance ap.__stack  */
  t = fold_build_pointer_plus_hwi (arg, size + 7);
  t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
	      build_int_cst (TREE_TYPE (t), -8));
  t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
  /* String up roundup and advance.  */
  if (roundup)
    t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
  /* String up with arg  */
  on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
  /* Big-endianness related address adjustment.  */
  if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
      && size < UNITS_PER_WORD)
    {
      t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
		  size_int (UNITS_PER_WORD - size));
      on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
    }

  COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
  COND_EXPR_THEN (cond2) = unshare_expr (on_stack);

  /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
  t = off;
  if (adjust)
    t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
		build_int_cst (TREE_TYPE (off), adjust));

  t = fold_convert (sizetype, t);
  t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);

  if (is_ha)
    {
      /* type ha; // treat as "struct {ftype field[n];}"
	 ... [computing offs]
	 for (i = 0; i < nregs; ++i, offs += 16)
	   ha.field[i] = *((ftype *)(ap.__vr_top + offs));
	 return ha;  */
      int i;
      tree tmp_ha, field_t, field_ptr_t;

      /* Declare a local variable.  */
      tmp_ha = create_tmp_var_raw (type, "ha");
      gimple_add_tmp_var (tmp_ha);

      /* Establish the base type.  */
      switch (ag_mode)
	{
	case E_SFmode:
	  field_t = float_type_node;
	  field_ptr_t = float_ptr_type_node;
	  break;
	case E_DFmode:
	  field_t = double_type_node;
	  field_ptr_t = double_ptr_type_node;
	  break;
	case E_TFmode:
	  field_t = long_double_type_node;
	  field_ptr_t = long_double_ptr_type_node;
	  break;
	case E_HFmode:
	  field_t = aarch64_fp16_type_node;
	  field_ptr_t = aarch64_fp16_ptr_type_node;
	  break;
	case E_V2SImode:
	case E_V4SImode:
	  {
	    tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
	    field_t = build_vector_type_for_mode (innertype, ag_mode);
	    field_ptr_t = build_pointer_type (field_t);
	  }
	  break;
	default:
	  gcc_assert (0);
	}

      /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area  */
      tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
      addr = t;
      t = fold_convert (field_ptr_t, addr);
      t = build2 (MODIFY_EXPR, field_t,
		  build1 (INDIRECT_REF, field_t, tmp_ha),
		  build1 (INDIRECT_REF, field_t, t));

      /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
      for (i = 1; i < nregs; ++i)
	{
	  addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
	  u = fold_convert (field_ptr_t, addr);
	  u = build2 (MODIFY_EXPR, field_t,
		      build2 (MEM_REF, field_t, tmp_ha,
			      build_int_cst (field_ptr_t,
					     (i *
					      int_size_in_bytes (field_t)))),
		      build1 (INDIRECT_REF, field_t, u));
	  t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
	}

      u = fold_convert (TREE_TYPE (f_top), tmp_ha);
      t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
    }

  COND_EXPR_ELSE (cond2) = t;
  addr = fold_convert (build_pointer_type (type), cond1);
  addr = build_va_arg_indirect_ref (addr);

  if (indirect_p)
    addr = build_va_arg_indirect_ref (addr);

  return addr;
}
/* Implement TARGET_SETUP_INCOMING_VARARGS.  */

static void
aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
				tree type, int *pretend_size ATTRIBUTE_UNUSED,
				int no_rtl)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
  CUMULATIVE_ARGS local_cum;
  int gr_saved = cfun->va_list_gpr_size;
  int vr_saved = cfun->va_list_fpr_size;

  /* The caller has advanced CUM up to, but not beyond, the last named
     argument.  Advance a local copy of CUM past the last "real" named
     argument, to find out how many registers are left over.  */
  local_cum = *cum;
  aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);

  /* Find out how many registers we need to save.
     Honor tree-stdarg analysis results.  */
  if (cfun->va_list_gpr_size)
    gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
		    cfun->va_list_gpr_size / UNITS_PER_WORD);
  if (cfun->va_list_fpr_size)
    vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
		    cfun->va_list_fpr_size / UNITS_PER_VREG);

  if (!TARGET_FLOAT)
    {
      gcc_assert (local_cum.aapcs_nvrn == 0);
      vr_saved = 0;
    }

  if (!no_rtl)
    {
      if (gr_saved > 0)
	{
	  rtx ptr, mem;

	  /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
	  ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
			       - gr_saved * UNITS_PER_WORD);
	  mem = gen_frame_mem (BLKmode, ptr);
	  set_mem_alias_set (mem, get_varargs_alias_set ());

	  move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
			       mem, gr_saved);
	}
      if (vr_saved > 0)
	{
	  /* We can't use move_block_from_reg, because it will use
	     the wrong mode, storing D regs only.  */
	  machine_mode mode = TImode;
	  int off, i, vr_start;

	  /* Set OFF to the offset from virtual_incoming_args_rtx of
	     the first vector register.  The VR save area lies below
	     the GR one, and is aligned to 16 bytes.  */
	  off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
			   STACK_BOUNDARY / BITS_PER_UNIT);
	  off -= vr_saved * UNITS_PER_VREG;

	  vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
	  for (i = 0; i < vr_saved; ++i)
	    {
	      rtx ptr, mem;

	      ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
	      mem = gen_frame_mem (mode, ptr);
	      set_mem_alias_set (mem, get_varargs_alias_set ());
	      aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
	      off += UNITS_PER_VREG;
	    }
	}
    }

  /* We don't save the size into *PRETEND_SIZE because we want to avoid
     any complication of having crtl->args.pretend_args_size changed.  */
  cfun->machine->frame.saved_varargs_size
    = (ROUND_UP (gr_saved * UNITS_PER_WORD,
		 STACK_BOUNDARY / BITS_PER_UNIT)
       + vr_saved * UNITS_PER_VREG);
}
static void
aarch64_conditional_register_usage (void)
{
  int i;
  if (!TARGET_FLOAT)
    {
      for (i = V0_REGNUM; i <= V31_REGNUM; i++)
	{
	  fixed_regs[i] = 1;
	  call_used_regs[i] = 1;
	}
    }
  if (!TARGET_SVE)
    for (i = P0_REGNUM; i <= P15_REGNUM; i++)
      {
	fixed_regs[i] = 1;
	call_used_regs[i] = 1;
      }
}
/* Walk down the type tree of TYPE counting consecutive base elements.
   If *MODEP is VOIDmode, then set it to the first valid floating point
   type.  If a non-floating point type is found, or if a floating point
   type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
   otherwise return the count in the sub-tree.  */

static int
aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
{
  machine_mode mode;
  HOST_WIDE_INT size;

  switch (TREE_CODE (type))
    {
    case REAL_TYPE:
      mode = TYPE_MODE (type);
      if (mode != DFmode && mode != SFmode
	  && mode != TFmode && mode != HFmode)
	return -1;

      if (*modep == VOIDmode)
	*modep = mode;

      if (*modep == mode)
	return 1;

      break;

    case COMPLEX_TYPE:
      mode = TYPE_MODE (TREE_TYPE (type));
      if (mode != DFmode && mode != SFmode
	  && mode != TFmode && mode != HFmode)
	return -1;

      if (*modep == VOIDmode)
	*modep = mode;

      if (*modep == mode)
	return 2;

      break;

    case VECTOR_TYPE:
      /* Use V2SImode and V4SImode as representatives of all 64-bit
	 and 128-bit vector types.  */
      size = int_size_in_bytes (type);
      switch (size)
	{
	case 8:
	  mode = V2SImode;
	  break;
	case 16:
	  mode = V4SImode;
	  break;
	default:
	  return -1;
	}

      if (*modep == VOIDmode)
	*modep = mode;

      /* Vector modes are considered to be opaque: two vectors are
	 equivalent for the purposes of being homogeneous aggregates
	 if they are the same size.  */
      if (*modep == mode)
	return 1;

      break;

    case ARRAY_TYPE:
      {
	int count;
	tree index = TYPE_DOMAIN (type);

	/* Can't handle incomplete types nor sizes that are not
	   fixed.  */
	if (!COMPLETE_TYPE_P (type)
	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
	  return -1;

	count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
	if (count == -1
	    || !index
	    || !TYPE_MAX_VALUE (index)
	    || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
	    || !TYPE_MIN_VALUE (index)
	    || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
	    || count < 0)
	  return -1;

	count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
		      - tree_to_uhwi (TYPE_MIN_VALUE (index)));

	/* There must be no padding.  */
	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
		      count * GET_MODE_BITSIZE (*modep)))
	  return -1;

	return count;
      }

    case RECORD_TYPE:
      {
	int count = 0;
	int sub_count;
	tree field;

	/* Can't handle incomplete types nor sizes that are not
	   fixed.  */
	if (!COMPLETE_TYPE_P (type)
	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
	  return -1;

	for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
	  {
	    if (TREE_CODE (field) != FIELD_DECL)
	      continue;

	    sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
	    if (sub_count < 0)
	      return -1;
	    count += sub_count;
	  }

	/* There must be no padding.  */
	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
		      count * GET_MODE_BITSIZE (*modep)))
	  return -1;

	return count;
      }

    case UNION_TYPE:
    case QUAL_UNION_TYPE:
      {
	/* These aren't very interesting except in a degenerate case.  */
	int count = 0;
	int sub_count;
	tree field;

	/* Can't handle incomplete types nor sizes that are not
	   fixed.  */
	if (!COMPLETE_TYPE_P (type)
	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
	  return -1;

	for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
	  {
	    if (TREE_CODE (field) != FIELD_DECL)
	      continue;

	    sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
	    if (sub_count < 0)
	      return -1;
	    count = count > sub_count ? count : sub_count;
	  }

	/* There must be no padding.  */
	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
		      count * GET_MODE_BITSIZE (*modep)))
	  return -1;

	return count;
      }

    default:
      break;
    }

  return -1;
}
/* Return TRUE if the type, as described by TYPE and MODE, is a short vector
   type as described in AAPCS64 \S 4.1.2.

   See the comment above aarch64_composite_type_p for the notes on MODE.  */

static bool
aarch64_short_vector_p (const_tree type,
			machine_mode mode)
{
  poly_int64 size = -1;

  if (type && TREE_CODE (type) == VECTOR_TYPE)
    size = int_size_in_bytes (type);
  else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
	   || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
    size = GET_MODE_SIZE (mode);

  return known_eq (size, 8) || known_eq (size, 16);
}
/* Return TRUE if the type, as described by TYPE and MODE, is a composite
   type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
   array types.  The C99 floating-point complex types are also considered
   as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
   types, which are GCC extensions and out of the scope of AAPCS64, are
   treated as composite types here as well.

   Note that MODE itself is not sufficient in determining whether a type
   is such a composite type or not.  This is because
   stor-layout.c:compute_record_mode may have already changed the MODE
   (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
   structure with only one field may have its MODE set to the mode of the
   field.  Also an integer mode whose size matches the size of the
   RECORD_TYPE type may be used to substitute the original mode
   (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
   solely relied on.  */

static bool
aarch64_composite_type_p (const_tree type,
			  machine_mode mode)
{
  if (aarch64_short_vector_p (type, mode))
    return false;

  if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
    return true;

  if (mode == BLKmode
      || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
      || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
    return true;

  return false;
}
/* Return TRUE if an argument, whose type is described by TYPE and MODE,
   shall be passed or returned in simd/fp register(s) (providing these
   parameter passing registers are available).

   Upon successful return, *COUNT returns the number of needed registers,
   *BASE_MODE returns the mode of the individual register and when IS_HA
   is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
   floating-point aggregate or a homogeneous short-vector aggregate.  */

static bool
aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
					 const_tree type,
					 machine_mode *base_mode,
					 int *count,
					 bool *is_ha)
{
  machine_mode new_mode = VOIDmode;
  bool composite_p = aarch64_composite_type_p (type, mode);

  if (is_ha != NULL) *is_ha = false;

  if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
      || aarch64_short_vector_p (type, mode))
    {
      *count = 1;
      new_mode = mode;
    }
  else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
    {
      if (is_ha != NULL) *is_ha = true;
      *count = 2;
      new_mode = GET_MODE_INNER (mode);
    }
  else if (type && composite_p)
    {
      int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);

      if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
	{
	  if (is_ha != NULL) *is_ha = true;
	  *count = ag_count;
	}
      else
	return false;
    }
  else
    return false;

  *base_mode = new_mode;
  return true;
}
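
/* Illustrative example (not part of the original sources): for

     struct hfa { double x, y, z; };

   aapcs_vfp_sub_candidate returns 3 with *MODEP set to DFmode, so the
   struct is a homogeneous floating-point aggregate and, when enough
   SIMD/FP registers are free, is passed in d0-d2.  Mixing element modes
   (say a float next to the doubles) disqualifies it; a fourth double is
   still fine because HA_MAX_NUM_FLDS is 4.  */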
/* Implement TARGET_STRUCT_VALUE_RTX.  */

static rtx
aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
			  int incoming ATTRIBUTE_UNUSED)
{
  return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
}

/* Implements target hook vector_mode_supported_p.  */
static bool
aarch64_vector_mode_supported_p (machine_mode mode)
{
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
}
12873 /* Return appropriate SIMD container
12874 for MODE within a vector of WIDTH bits. */
12875 static machine_mode
12876 aarch64_simd_container_mode (scalar_mode mode
, poly_int64 width
)
12878 if (TARGET_SVE
&& known_eq (width
, BITS_PER_SVE_VECTOR
))
12894 return VNx16QImode
;
12899 gcc_assert (known_eq (width
, 64) || known_eq (width
, 128));
12902 if (known_eq (width
, 128))
/* Return 128-bit container as the preferred SIMD mode for MODE.  */
static machine_mode
aarch64_preferred_simd_mode (scalar_mode mode)
{
  poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
  return aarch64_simd_container_mode (mode, bits);
}

/* Return a list of possible vector sizes for the vectorizer
   to iterate over.  */
static void
aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
{
  if (TARGET_SVE)
    sizes->safe_push (BYTES_PER_SVE_VECTOR);
  sizes->safe_push (16);
  sizes->safe_push (8);
}
/* Implement TARGET_MANGLE_TYPE.  */

static const char *
aarch64_mangle_type (const_tree type)
{
  /* The AArch64 ABI documents say that "__va_list" has to be
     mangled as if it is in the "std" namespace.  */
  if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
    return "St9__va_list";

  /* Half-precision float.  */
  if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
    return "Dh";

  /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
     builtin types.  */
  if (TYPE_NAME (type) != NULL)
    return aarch64_mangle_builtin_type (type);

  /* Use the default mangling.  */
  return NULL;
}
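
/* Illustrative example: "void f (__fp16);" mangles as _Z1fDh, and a
   parameter of the AArch64 va_list type mangles as St9__va_list, exactly
   as if the type lived in namespace std.  */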
/* Find the first rtx_insn before insn that will generate an assembly
   instruction.  */

static rtx_insn *
aarch64_prev_real_insn (rtx_insn *insn)
{
  if (!insn)
    return NULL;

  do
    {
      insn = prev_real_insn (insn);
    }
  while (insn && recog_memoized (insn) < 0);

  return insn;
}

static bool
is_madd_op (enum attr_type t1)
{
  unsigned int i;
  /* A number of these may be AArch32 only.  */
  enum attr_type mlatypes[] = {
    TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
    TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
    TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
  };

  for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
    {
      if (t1 == mlatypes[i])
	return true;
    }

  return false;
}

/* Check if there is a register dependency between a load and the insn
   for which we hold recog_data.  */

static bool
dep_between_memop_and_curr (rtx memop)
{
  rtx load_reg;
  int opno;

  gcc_assert (GET_CODE (memop) == SET);

  if (!REG_P (SET_DEST (memop)))
    return false;

  load_reg = SET_DEST (memop);
  for (opno = 1; opno < recog_data.n_operands; opno++)
    {
      rtx operand = recog_data.operand[opno];
      if (REG_P (operand)
	  && reg_overlap_mentioned_p (load_reg, operand))
	return true;
    }

  return false;
}

/* When working around the Cortex-A53 erratum 835769,
   given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
   instruction and has a preceding memory instruction such that a NOP
   should be inserted between them.  */

bool
aarch64_madd_needs_nop (rtx_insn *insn)
{
  enum attr_type attr_type;
  rtx_insn *prev;
  rtx body;

  if (!TARGET_FIX_ERR_A53_835769)
    return false;

  if (!INSN_P (insn) || recog_memoized (insn) < 0)
    return false;

  attr_type = get_attr_type (insn);
  if (!is_madd_op (attr_type))
    return false;

  prev = aarch64_prev_real_insn (insn);
  /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
     Restore recog state to INSN to avoid state corruption.  */
  extract_constrain_insn_cached (insn);

  if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
    return false;

  body = single_set (prev);

  /* If the previous insn is a memory op and there is no dependency between
     it and the DImode madd, emit a NOP between them.  If body is NULL then we
     have a complex memory operation, probably a load/store pair.
     Be conservative for now and emit a NOP.  */
  if (GET_MODE (recog_data.operand[0]) == DImode
      && (!body || !dep_between_memop_and_curr (body)))
    return true;

  return false;
}

/* Implement FINAL_PRESCAN_INSN.  */

void
aarch64_final_prescan_insn (rtx_insn *insn)
{
  if (aarch64_madd_needs_nop (insn))
    fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
}
/* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
   instruction.  */

bool
aarch64_sve_index_immediate_p (rtx base_or_step)
{
  return (CONST_INT_P (base_or_step)
	  && IN_RANGE (INTVAL (base_or_step), -16, 15));
}

/* Return true if X is a valid immediate for the SVE ADD and SUB
   instructions.  Negate X first if NEGATE_P is true.  */

bool
aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
{
  rtx elt;

  if (!const_vec_duplicate_p (x, &elt)
      || !CONST_INT_P (elt))
    return false;

  HOST_WIDE_INT val = INTVAL (elt);
  if (negate_p)
    val = -val;
  val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));

  if (val & 0xff)
    return IN_RANGE (val, 0, 0xff);
  return IN_RANGE (val, 0, 0xff00);
}

/* Return true if X is a valid immediate operand for an SVE logical
   instruction such as AND.  */

bool
aarch64_sve_bitmask_immediate_p (rtx x)
{
  rtx elt;

  return (const_vec_duplicate_p (x, &elt)
	  && CONST_INT_P (elt)
	  && aarch64_bitmask_imm (INTVAL (elt),
				  GET_MODE_INNER (GET_MODE (x))));
}

/* Return true if X is a valid immediate for the SVE DUP and CPY
   instructions.  */

bool
aarch64_sve_dup_immediate_p (rtx x)
{
  rtx elt;

  if (!const_vec_duplicate_p (x, &elt)
      || !CONST_INT_P (elt))
    return false;

  HOST_WIDE_INT val = INTVAL (elt);
  if (val & 0xff)
    return IN_RANGE (val, -0x80, 0x7f);
  return IN_RANGE (val, -0x8000, 0x7f00);
}

/* Return true if X is a valid immediate operand for an SVE CMP instruction.
   SIGNED_P says whether the operand is signed rather than unsigned.  */

bool
aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
{
  rtx elt;

  return (const_vec_duplicate_p (x, &elt)
	  && CONST_INT_P (elt)
	  && (signed_p
	      ? IN_RANGE (INTVAL (elt), -16, 15)
	      : IN_RANGE (INTVAL (elt), 0, 127)));
}

/* Return true if X is a valid immediate operand for an SVE FADD or FSUB
   instruction.  Negate X first if NEGATE_P is true.  */

bool
aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
{
  rtx elt;
  REAL_VALUE_TYPE r;

  if (!const_vec_duplicate_p (x, &elt)
      || GET_CODE (elt) != CONST_DOUBLE)
    return false;

  r = *CONST_DOUBLE_REAL_VALUE (elt);

  if (negate_p)
    r = real_value_negate (&r);

  if (real_equal (&r, &dconst1))
    return true;
  if (real_equal (&r, &dconsthalf))
    return true;
  return false;
}

/* Return true if X is a valid immediate operand for an SVE FMUL
   instruction.  */

bool
aarch64_sve_float_mul_immediate_p (rtx x)
{
  rtx elt;

  /* GCC will never generate a multiply with an immediate of 2, so there is no
     point testing for it (even though it is a valid constant).  */
  return (const_vec_duplicate_p (x, &elt)
	  && GET_CODE (elt) == CONST_DOUBLE
	  && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
}
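
/* A few concrete cases (illustrative): for an SVE ADD/SUB, a duplicated
   immediate of 255 (0xff) or 256 (0x100) passes
   aarch64_sve_arith_immediate_p, but 257 does not, because only an 8-bit
   value optionally shifted left by 8 can be encoded.  DUP/CPY similarly
   accept -128..127 directly, or multiples of 256 in -32768..32512.  */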
13223 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13224 for the Advanced SIMD operation described by WHICH and INSN. If INFO
13225 is nonnull, use it to describe valid immediates. */
13227 aarch64_advsimd_valid_immediate_hs (unsigned int val32
,
13228 simd_immediate_info
*info
,
13229 enum simd_immediate_check which
,
13230 simd_immediate_info::insn_type insn
)
13232 /* Try a 4-byte immediate with LSL. */
13233 for (unsigned int shift
= 0; shift
< 32; shift
+= 8)
13234 if ((val32
& (0xff << shift
)) == val32
)
13237 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
13238 simd_immediate_info::LSL
, shift
);
13242 /* Try a 2-byte immediate with LSL. */
13243 unsigned int imm16
= val32
& 0xffff;
13244 if (imm16
== (val32
>> 16))
13245 for (unsigned int shift
= 0; shift
< 16; shift
+= 8)
13246 if ((imm16
& (0xff << shift
)) == imm16
)
13249 *info
= simd_immediate_info (HImode
, imm16
>> shift
, insn
,
13250 simd_immediate_info::LSL
, shift
);
13254 /* Try a 4-byte immediate with MSL, except for cases that MVN
13256 if (which
== AARCH64_CHECK_MOV
)
13257 for (unsigned int shift
= 8; shift
< 24; shift
+= 8)
13259 unsigned int low
= (1 << shift
) - 1;
13260 if (((val32
& (0xff << shift
)) | low
) == val32
)
13263 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
13264 simd_immediate_info::MSL
, shift
);
13272 /* Return true if replicating VAL64 is a valid immediate for the
13273 Advanced SIMD operation described by WHICH. If INFO is nonnull,
13274 use it to describe valid immediates. */
13276 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64
,
13277 simd_immediate_info
*info
,
13278 enum simd_immediate_check which
)
13280 unsigned int val32
= val64
& 0xffffffff;
13281 unsigned int val16
= val64
& 0xffff;
13282 unsigned int val8
= val64
& 0xff;
13284 if (val32
== (val64
>> 32))
13286 if ((which
& AARCH64_CHECK_ORR
) != 0
13287 && aarch64_advsimd_valid_immediate_hs (val32
, info
, which
,
13288 simd_immediate_info::MOV
))
13291 if ((which
& AARCH64_CHECK_BIC
) != 0
13292 && aarch64_advsimd_valid_immediate_hs (~val32
, info
, which
,
13293 simd_immediate_info::MVN
))
13296 /* Try using a replicated byte. */
13297 if (which
== AARCH64_CHECK_MOV
13298 && val16
== (val32
>> 16)
13299 && val8
== (val16
>> 8))
13302 *info
= simd_immediate_info (QImode
, val8
);
13307 /* Try using a bit-to-bytemask. */
13308 if (which
== AARCH64_CHECK_MOV
)
13311 for (i
= 0; i
< 64; i
+= 8)
13313 unsigned char byte
= (val64
>> i
) & 0xff;
13314 if (byte
!= 0 && byte
!= 0xff)
13320 *info
= simd_immediate_info (DImode
, val64
);
13327 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13328 instruction. If INFO is nonnull, use it to describe valid immediates. */
13331 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64
,
13332 simd_immediate_info
*info
)
13334 scalar_int_mode mode
= DImode
;
13335 unsigned int val32
= val64
& 0xffffffff;
13336 if (val32
== (val64
>> 32))
13339 unsigned int val16
= val32
& 0xffff;
13340 if (val16
== (val32
>> 16))
13343 unsigned int val8
= val16
& 0xff;
13344 if (val8
== (val16
>> 8))
13348 HOST_WIDE_INT val
= trunc_int_for_mode (val64
, mode
);
13349 if (IN_RANGE (val
, -0x80, 0x7f))
13351 /* DUP with no shift. */
13353 *info
= simd_immediate_info (mode
, val
);
13356 if ((val
& 0xff) == 0 && IN_RANGE (val
, -0x8000, 0x7f00))
13358 /* DUP with LSL #8. */
13360 *info
= simd_immediate_info (mode
, val
);
13363 if (aarch64_bitmask_imm (val64
, mode
))
13367 *info
= simd_immediate_info (mode
, val
);
13373 /* Return true if OP is a valid SIMD immediate for the operation
13374 described by WHICH. If INFO is nonnull, use it to describe valid
13377 aarch64_simd_valid_immediate (rtx op
, simd_immediate_info
*info
,
13378 enum simd_immediate_check which
)
13380 machine_mode mode
= GET_MODE (op
);
13381 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
13382 if (vec_flags
== 0 || vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
13385 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
13387 unsigned int n_elts
;
13388 if (GET_CODE (op
) == CONST_VECTOR
13389 && CONST_VECTOR_DUPLICATE_P (op
))
13390 n_elts
= CONST_VECTOR_NPATTERNS (op
);
13391 else if ((vec_flags
& VEC_SVE_DATA
)
13392 && const_vec_series_p (op
, &base
, &step
))
13394 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
);
13395 if (!aarch64_sve_index_immediate_p (base
)
13396 || !aarch64_sve_index_immediate_p (step
))
13400 *info
= simd_immediate_info (elt_mode
, base
, step
);
13403 else if (GET_CODE (op
) == CONST_VECTOR
13404 && CONST_VECTOR_NUNITS (op
).is_constant (&n_elts
))
13405 /* N_ELTS set above. */;
13409 /* Handle PFALSE and PTRUE. */
13410 if (vec_flags
& VEC_SVE_PRED
)
13411 return (op
== CONST0_RTX (mode
)
13412 || op
== CONSTM1_RTX (mode
));
13414 scalar_float_mode elt_float_mode
;
13416 && is_a
<scalar_float_mode
> (elt_mode
, &elt_float_mode
))
13418 rtx elt
= CONST_VECTOR_ENCODED_ELT (op
, 0);
13419 if (aarch64_float_const_zero_rtx_p (elt
)
13420 || aarch64_float_const_representable_p (elt
))
13423 *info
= simd_immediate_info (elt_float_mode
, elt
);
13428 unsigned int elt_size
= GET_MODE_SIZE (elt_mode
);
13432 scalar_int_mode elt_int_mode
= int_mode_for_mode (elt_mode
).require ();
13434 /* Expand the vector constant out into a byte vector, with the least
13435 significant byte of the register first. */
13436 auto_vec
<unsigned char, 16> bytes
;
13437 bytes
.reserve (n_elts
* elt_size
);
13438 for (unsigned int i
= 0; i
< n_elts
; i
++)
13440 /* The vector is provided in gcc endian-neutral fashion.
13441 For aarch64_be Advanced SIMD, it must be laid out in the vector
13442 register in reverse order. */
13443 bool swap_p
= ((vec_flags
& VEC_ADVSIMD
) != 0 && BYTES_BIG_ENDIAN
);
13444 rtx elt
= CONST_VECTOR_ELT (op
, swap_p
? (n_elts
- 1 - i
) : i
);
13446 if (elt_mode
!= elt_int_mode
)
13447 elt
= gen_lowpart (elt_int_mode
, elt
);
13449 if (!CONST_INT_P (elt
))
13452 unsigned HOST_WIDE_INT elt_val
= INTVAL (elt
);
13453 for (unsigned int byte
= 0; byte
< elt_size
; byte
++)
13455 bytes
.quick_push (elt_val
& 0xff);
13456 elt_val
>>= BITS_PER_UNIT
;
13460 /* The immediate must repeat every eight bytes. */
13461 unsigned int nbytes
= bytes
.length ();
13462 for (unsigned i
= 8; i
< nbytes
; ++i
)
13463 if (bytes
[i
] != bytes
[i
- 8])
13466 /* Get the repeating 8-byte value as an integer. No endian correction
13467 is needed here because bytes is already in lsb-first order. */
13468 unsigned HOST_WIDE_INT val64
= 0;
13469 for (unsigned int i
= 0; i
< 8; i
++)
13470 val64
|= ((unsigned HOST_WIDE_INT
) bytes
[i
% nbytes
]
13471 << (i
* BITS_PER_UNIT
));
13473 if (vec_flags
& VEC_SVE_DATA
)
13474 return aarch64_sve_valid_immediate (val64
, info
);
13476 return aarch64_advsimd_valid_immediate (val64
, info
, which
);
/* Check whether X is a VEC_SERIES-like constant that starts at 0 and
   has a step in the range of INDEX.  Return the index expression if so,
   otherwise return null.  */
rtx
aarch64_check_zero_based_sve_index_immediate (rtx x)
{
  rtx base, step;
  if (const_vec_series_p (x, &base, &step)
      && base == const0_rtx
      && aarch64_sve_index_immediate_p (step))
    return step;
  return NULL_RTX;
}

/* Check if immediate shift constants are within range.  */
bool
aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
{
  int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
  if (left)
    return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
  else
    return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
}

/* Return the bitmask CONST_INT to select the bits required by a zero extract
   operation of width WIDTH at bit position POS.  */

rtx
aarch64_mask_from_zextract_ops (rtx width, rtx pos)
{
  gcc_assert (CONST_INT_P (width));
  gcc_assert (CONST_INT_P (pos));

  unsigned HOST_WIDE_INT mask
    = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
  return GEN_INT (mask << UINTVAL (pos));
}

bool
aarch64_mov_operand_p (rtx x, machine_mode mode)
{
  if (GET_CODE (x) == HIGH
      && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
    return true;

  if (CONST_INT_P (x))
    return true;

  if (VECTOR_MODE_P (GET_MODE (x)))
    return aarch64_simd_valid_immediate (x, NULL);

  if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
    return true;

  if (aarch64_sve_cnt_immediate_p (x))
    return true;

  return aarch64_classify_symbolic_expression (x)
    == SYMBOL_TINY_ABSOLUTE;
}

/* Return a const_int vector of VAL.  */
rtx
aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
{
  rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
  return gen_const_vec_duplicate (mode, c);
}

/* Check OP is a legal scalar immediate for the MOVI instruction.  */

bool
aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
{
  machine_mode vmode;

  vmode = aarch64_simd_container_mode (mode, 64);
  rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
  return aarch64_simd_valid_immediate (op_v, NULL);
}
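
/* Worked example (illustrative): a zero_extract of WIDTH = 8 bits at
   POS = 16 gives the mask ((1 << 8) - 1) << 16 == 0xff0000, i.e. exactly
   the byte selected by the extraction.  */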
/* Construct and return a PARALLEL RTX vector with elements numbering the
   lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
   the vector - from the perspective of the architecture.  This does not
   line up with GCC's perspective on lane numbers, so we end up with
   different masks depending on our target endian-ness.  The diagram
   below may help.  We must draw the distinction when building masks
   which select one half of the vector.  An instruction selecting
   architectural low-lanes for a big-endian target, must be described using
   a mask selecting GCC high-lanes.

                 Big-Endian             Little-Endian

GCC             0   1   2   3           3   2   1   0
              | x | x | x | x |       | x | x | x | x |
Architecture    3   2   1   0           3   2   1   0

Low Mask:         { 2, 3 }                { 0, 1 }
High Mask:        { 0, 1 }                { 2, 3 }

   MODE is the mode of the vector and NUNITS is the number of units in it.  */

rtx
aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
{
  rtvec v = rtvec_alloc (nunits / 2);
  int high_base = nunits / 2;
  int low_base = 0;
  int base;
  rtx t1;
  int i;

  if (BYTES_BIG_ENDIAN)
    base = high ? low_base : high_base;
  else
    base = high ? high_base : low_base;

  for (i = 0; i < nunits / 2; i++)
    RTVEC_ELT (v, i) = GEN_INT (base + i);

  t1 = gen_rtx_PARALLEL (mode, v);
  return t1;
}
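
/* Example (illustrative): for V4SImode with HIGH == true this returns
   (parallel [2 3]) on little-endian and (parallel [0 1]) on big-endian,
   matching the table above.  */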
13604 /* Check OP for validity as a PARALLEL RTX vector with elements
13605 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13606 from the perspective of the architecture. See the diagram above
13607 aarch64_simd_vect_par_cnst_half for more details. */
13610 aarch64_simd_check_vect_par_cnst_half (rtx op
, machine_mode mode
,
13614 if (!VECTOR_MODE_P (mode
) || !GET_MODE_NUNITS (mode
).is_constant (&nelts
))
13617 rtx ideal
= aarch64_simd_vect_par_cnst_half (mode
, nelts
, high
);
13618 HOST_WIDE_INT count_op
= XVECLEN (op
, 0);
13619 HOST_WIDE_INT count_ideal
= XVECLEN (ideal
, 0);
13622 if (count_op
!= count_ideal
)
13625 for (i
= 0; i
< count_ideal
; i
++)
13627 rtx elt_op
= XVECEXP (op
, 0, i
);
13628 rtx elt_ideal
= XVECEXP (ideal
, 0, i
);
13630 if (!CONST_INT_P (elt_op
)
13631 || INTVAL (elt_ideal
) != INTVAL (elt_op
))
13637 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
13638 HIGH (exclusive). */
13640 aarch64_simd_lane_bounds (rtx operand
, HOST_WIDE_INT low
, HOST_WIDE_INT high
,
13643 HOST_WIDE_INT lane
;
13644 gcc_assert (CONST_INT_P (operand
));
13645 lane
= INTVAL (operand
);
13647 if (lane
< low
|| lane
>= high
)
13650 error ("%Klane %wd out of range %wd - %wd", exp
, lane
, low
, high
- 1);
13652 error ("lane %wd out of range %wd - %wd", lane
, low
, high
- 1);
/* Perform endian correction on lane number N, which indexes a vector
   of mode MODE, and return the result as an SImode rtx.  */

rtx
aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
{
  return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
}

/* Return TRUE if OP is a valid vector addressing mode.  */
bool
aarch64_simd_mem_operand_p (rtx op)
{
  return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
			|| REG_P (XEXP (op, 0)));
}
13674 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
13677 aarch64_sve_ld1r_operand_p (rtx op
)
13679 struct aarch64_address_info addr
;
13683 && is_a
<scalar_mode
> (GET_MODE (op
), &mode
)
13684 && aarch64_classify_address (&addr
, XEXP (op
, 0), mode
, false)
13685 && addr
.type
== ADDRESS_REG_IMM
13686 && offset_6bit_unsigned_scaled_p (mode
, addr
.const_offset
));
13689 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13690 The conditions for STR are the same. */
13692 aarch64_sve_ldr_operand_p (rtx op
)
13694 struct aarch64_address_info addr
;
13697 && aarch64_classify_address (&addr
, XEXP (op
, 0), GET_MODE (op
),
13698 false, ADDR_QUERY_ANY
)
13699 && addr
.type
== ADDRESS_REG_IMM
);
13702 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13703 We need to be able to access the individual pieces, so the range
13704 is different from LD[234] and ST[234]. */
13706 aarch64_sve_struct_memory_operand_p (rtx op
)
13711 machine_mode mode
= GET_MODE (op
);
13712 struct aarch64_address_info addr
;
13713 if (!aarch64_classify_address (&addr
, XEXP (op
, 0), SVE_BYTE_MODE
, false,
13715 || addr
.type
!= ADDRESS_REG_IMM
)
13718 poly_int64 first
= addr
.const_offset
;
13719 poly_int64 last
= first
+ GET_MODE_SIZE (mode
) - BYTES_PER_SVE_VECTOR
;
13720 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, first
)
13721 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, last
));
13724 /* Emit a register copy from operand to operand, taking care not to
13725 early-clobber source registers in the process.
13727 COUNT is the number of components into which the copy needs to be
13730 aarch64_simd_emit_reg_reg_move (rtx
*operands
, machine_mode mode
,
13731 unsigned int count
)
13734 int rdest
= REGNO (operands
[0]);
13735 int rsrc
= REGNO (operands
[1]);
13737 if (!reg_overlap_mentioned_p (operands
[0], operands
[1])
13739 for (i
= 0; i
< count
; i
++)
13740 emit_move_insn (gen_rtx_REG (mode
, rdest
+ i
),
13741 gen_rtx_REG (mode
, rsrc
+ i
));
13743 for (i
= 0; i
< count
; i
++)
13744 emit_move_insn (gen_rtx_REG (mode
, rdest
+ count
- i
- 1),
13745 gen_rtx_REG (mode
, rsrc
+ count
- i
- 1));
13748 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13749 one of VSTRUCT modes: OI, CI, or XI. */
13751 aarch64_simd_attr_length_rglist (machine_mode mode
)
13753 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
13754 return (GET_MODE_SIZE (mode
).to_constant () / UNITS_PER_VREG
) * 4;
/* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
   alignment of a vector to 128 bits.  SVE predicates have an alignment of
   16 bits.  */
static HOST_WIDE_INT
aarch64_simd_vector_alignment (const_tree type)
{
  if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
    /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
       be set for non-predicate vectors of booleans.  Modes are the most
       direct way we have of identifying real SVE predicate types.  */
    return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
  HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
  return MIN (align, 128);
}

/* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.  */
static HOST_WIDE_INT
aarch64_vectorize_preferred_vector_alignment (const_tree type)
{
  if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
    {
      /* If the length of the vector is fixed, try to align to that length,
	 otherwise don't try to align at all.  */
      HOST_WIDE_INT result;
      if (!BITS_PER_SVE_VECTOR.is_constant (&result))
	result = TYPE_ALIGN (TREE_TYPE (type));
      return result;
    }
  return TYPE_ALIGN (type);
}
13788 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13790 aarch64_simd_vector_alignment_reachable (const_tree type
, bool is_packed
)
13795 /* For fixed-length vectors, check that the vectorizer will aim for
13796 full-vector alignment. This isn't true for generic GCC vectors
13797 that are wider than the ABI maximum of 128 bits. */
13798 if (TREE_CODE (TYPE_SIZE (type
)) == INTEGER_CST
13799 && (wi::to_widest (TYPE_SIZE (type
))
13800 != aarch64_vectorize_preferred_vector_alignment (type
)))
13803 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13807 /* Return true if the vector misalignment factor is supported by the
13810 aarch64_builtin_support_vector_misalignment (machine_mode mode
,
13811 const_tree type
, int misalignment
,
13814 if (TARGET_SIMD
&& STRICT_ALIGNMENT
)
13816 /* Return if movmisalign pattern is not supported for this mode. */
13817 if (optab_handler (movmisalign_optab
, mode
) == CODE_FOR_nothing
)
13820 /* Misalignment factor is unknown at compile time. */
13821 if (misalignment
== -1)
13824 return default_builtin_support_vector_misalignment (mode
, type
, misalignment
,
13828 /* If VALS is a vector constant that can be loaded into a register
13829 using DUP, generate instructions to do so and return an RTX to
13830 assign to the register. Otherwise return NULL_RTX. */
13832 aarch64_simd_dup_constant (rtx vals
)
13834 machine_mode mode
= GET_MODE (vals
);
13835 machine_mode inner_mode
= GET_MODE_INNER (mode
);
13838 if (!const_vec_duplicate_p (vals
, &x
))
13841 /* We can load this constant by using DUP and a constant in a
13842 single ARM register. This will be cheaper than a vector
13844 x
= copy_to_mode_reg (inner_mode
, x
);
13845 return gen_vec_duplicate (mode
, x
);
13849 /* Generate code to load VALS, which is a PARALLEL containing only
13850 constants (for vec_init) or CONST_VECTOR, efficiently into a
13851 register. Returns an RTX to copy into the register, or NULL_RTX
13852 for a PARALLEL that can not be converted into a CONST_VECTOR. */
13854 aarch64_simd_make_constant (rtx vals
)
13856 machine_mode mode
= GET_MODE (vals
);
13858 rtx const_vec
= NULL_RTX
;
13862 if (GET_CODE (vals
) == CONST_VECTOR
)
13864 else if (GET_CODE (vals
) == PARALLEL
)
13866 /* A CONST_VECTOR must contain only CONST_INTs and
13867 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13868 Only store valid constants in a CONST_VECTOR. */
13869 int n_elts
= XVECLEN (vals
, 0);
13870 for (i
= 0; i
< n_elts
; ++i
)
13872 rtx x
= XVECEXP (vals
, 0, i
);
13873 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
13876 if (n_const
== n_elts
)
13877 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0));
13880 gcc_unreachable ();
13882 if (const_vec
!= NULL_RTX
13883 && aarch64_simd_valid_immediate (const_vec
, NULL
))
13884 /* Load using MOVI/MVNI. */
13886 else if ((const_dup
= aarch64_simd_dup_constant (vals
)) != NULL_RTX
)
13887 /* Loaded using DUP. */
13889 else if (const_vec
!= NULL_RTX
)
13890 /* Load from constant pool. We can not take advantage of single-cycle
13891 LD1 because we need a PC-relative addressing mode. */
13894 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13895 We can not construct an initializer. */
13899 /* Expand a vector initialisation sequence, such that TARGET is
13900 initialised to contain VALS. */
13903 aarch64_expand_vector_init (rtx target
, rtx vals
)
13905 machine_mode mode
= GET_MODE (target
);
13906 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
13907 /* The number of vector elements. */
13908 int n_elts
= XVECLEN (vals
, 0);
13909 /* The number of vector elements which are not constant. */
13911 rtx any_const
= NULL_RTX
;
13912 /* The first element of vals. */
13913 rtx v0
= XVECEXP (vals
, 0, 0);
13914 bool all_same
= true;
13916 /* Count the number of variable elements to initialise. */
13917 for (int i
= 0; i
< n_elts
; ++i
)
13919 rtx x
= XVECEXP (vals
, 0, i
);
13920 if (!(CONST_INT_P (x
) || CONST_DOUBLE_P (x
)))
13925 all_same
&= rtx_equal_p (x
, v0
);
13928 /* No variable elements, hand off to aarch64_simd_make_constant which knows
13929 how best to handle this. */
13932 rtx constant
= aarch64_simd_make_constant (vals
);
13933 if (constant
!= NULL_RTX
)
13935 emit_move_insn (target
, constant
);
13940 /* Splat a single non-constant element if we can. */
13943 rtx x
= copy_to_mode_reg (inner_mode
, v0
);
13944 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
13948 enum insn_code icode
= optab_handler (vec_set_optab
, mode
);
13949 gcc_assert (icode
!= CODE_FOR_nothing
);
13951 /* If there are only variable elements, try to optimize
13952 the insertion using dup for the most common element
13953 followed by insertions. */
13955 /* The algorithm will fill matches[*][0] with the earliest matching element,
13956 and matches[X][1] with the count of duplicate elements (if X is the
13957 earliest element which has duplicates). */
13959 if (n_var
== n_elts
&& n_elts
<= 16)
13961 int matches
[16][2] = {0};
13962 for (int i
= 0; i
< n_elts
; i
++)
13964 for (int j
= 0; j
<= i
; j
++)
13966 if (rtx_equal_p (XVECEXP (vals
, 0, i
), XVECEXP (vals
, 0, j
)))
13974 int maxelement
= 0;
13976 for (int i
= 0; i
< n_elts
; i
++)
13977 if (matches
[i
][1] > maxv
)
13980 maxv
= matches
[i
][1];
13983 /* Create a duplicate of the most common element, unless all elements
13984 are equally useless to us, in which case just immediately set the
13985 vector register using the first element. */
13989 /* For vectors of two 64-bit elements, we can do even better. */
13991 && (inner_mode
== E_DImode
13992 || inner_mode
== E_DFmode
))
13995 rtx x0
= XVECEXP (vals
, 0, 0);
13996 rtx x1
= XVECEXP (vals
, 0, 1);
13997 /* Combine can pick up this case, but handling it directly
13998 here leaves clearer RTL.
14000 This is load_pair_lanes<mode>, and also gives us a clean-up
14001 for store_pair_lanes<mode>. */
14002 if (memory_operand (x0
, inner_mode
)
14003 && memory_operand (x1
, inner_mode
)
14004 && !STRICT_ALIGNMENT
14005 && rtx_equal_p (XEXP (x1
, 0),
14006 plus_constant (Pmode
,
14008 GET_MODE_SIZE (inner_mode
))))
14011 if (inner_mode
== DFmode
)
14012 t
= gen_load_pair_lanesdf (target
, x0
, x1
);
14014 t
= gen_load_pair_lanesdi (target
, x0
, x1
);
14019 /* The subreg-move sequence below will move into lane zero of the
14020 vector register. For big-endian we want that position to hold
14021 the last element of VALS. */
14022 maxelement
= BYTES_BIG_ENDIAN
? n_elts
- 1 : 0;
14023 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
14024 aarch64_emit_move (target
, lowpart_subreg (mode
, x
, inner_mode
));
14028 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
14029 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
14032 /* Insert the rest. */
14033 for (int i
= 0; i
< n_elts
; i
++)
14035 rtx x
= XVECEXP (vals
, 0, i
);
14036 if (matches
[i
][0] == maxelement
)
14038 x
= copy_to_mode_reg (inner_mode
, x
);
14039 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
14044 /* Initialise a vector which is part-variable. We want to first try
14045 to build those lanes which are constant in the most efficient way we
14047 if (n_var
!= n_elts
)
14049 rtx copy
= copy_rtx (vals
);
14051 /* Load constant part of vector. We really don't care what goes into the
14052 parts we will overwrite, but we're more likely to be able to load the
14053 constant efficiently if it has fewer, larger, repeating parts
14054 (see aarch64_simd_valid_immediate). */
14055 for (int i
= 0; i
< n_elts
; i
++)
14057 rtx x
= XVECEXP (vals
, 0, i
);
14058 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
14060 rtx subst
= any_const
;
14061 for (int bit
= n_elts
/ 2; bit
> 0; bit
/= 2)
14063 /* Look in the copied vector, as more elements are const. */
14064 rtx test
= XVECEXP (copy
, 0, i
^ bit
);
14065 if (CONST_INT_P (test
) || CONST_DOUBLE_P (test
))
14071 XVECEXP (copy
, 0, i
) = subst
;
14073 aarch64_expand_vector_init (target
, copy
);
14076 /* Insert the variable lanes directly. */
14077 for (int i
= 0; i
< n_elts
; i
++)
14079 rtx x
= XVECEXP (vals
, 0, i
);
14080 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
14082 x
= copy_to_mode_reg (inner_mode
, x
);
14083 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
static unsigned HOST_WIDE_INT
aarch64_shift_truncation_mask (machine_mode mode)
{
  if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
    return 0;
  return GET_MODE_UNIT_BITSIZE (mode) - 1;
}
14095 /* Select a format to encode pointers in exception handling data. */
14097 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED
, int global
)
14100 switch (aarch64_cmodel
)
14102 case AARCH64_CMODEL_TINY
:
14103 case AARCH64_CMODEL_TINY_PIC
:
14104 case AARCH64_CMODEL_SMALL
:
14105 case AARCH64_CMODEL_SMALL_PIC
:
14106 case AARCH64_CMODEL_SMALL_SPIC
:
14107 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
14109 type
= DW_EH_PE_sdata4
;
14112 /* No assumptions here. 8-byte relocs required. */
14113 type
= DW_EH_PE_sdata8
;
14116 return (global
? DW_EH_PE_indirect
: 0) | DW_EH_PE_pcrel
| type
;
14119 /* The last .arch and .tune assembly strings that we printed. */
14120 static std::string aarch64_last_printed_arch_string
;
14121 static std::string aarch64_last_printed_tune_string
;
14123 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
14124 by the function fndecl. */
14127 aarch64_declare_function_name (FILE *stream
, const char* name
,
14130 tree target_parts
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
14132 struct cl_target_option
*targ_options
;
14134 targ_options
= TREE_TARGET_OPTION (target_parts
);
14136 targ_options
= TREE_TARGET_OPTION (target_option_current_node
);
14137 gcc_assert (targ_options
);
14139 const struct processor
*this_arch
14140 = aarch64_get_arch (targ_options
->x_explicit_arch
);
14142 unsigned long isa_flags
= targ_options
->x_aarch64_isa_flags
;
14143 std::string extension
14144 = aarch64_get_extension_string_for_isa_flags (isa_flags
,
14146 /* Only update the assembler .arch string if it is distinct from the last
14147 such string we printed. */
14148 std::string to_print
= this_arch
->name
+ extension
;
14149 if (to_print
!= aarch64_last_printed_arch_string
)
14151 asm_fprintf (asm_out_file
, "\t.arch %s\n", to_print
.c_str ());
14152 aarch64_last_printed_arch_string
= to_print
;
14155 /* Print the cpu name we're tuning for in the comments, might be
14156 useful to readers of the generated asm. Do it only when it changes
14157 from function to function and verbose assembly is requested. */
14158 const struct processor
*this_tune
14159 = aarch64_get_tune_cpu (targ_options
->x_explicit_tune_core
);
14161 if (flag_debug_asm
&& aarch64_last_printed_tune_string
!= this_tune
->name
)
14163 asm_fprintf (asm_out_file
, "\t" ASM_COMMENT_START
".tune %s\n",
14165 aarch64_last_printed_tune_string
= this_tune
->name
;
14168 /* Don't forget the type directive for ELF. */
14169 ASM_OUTPUT_TYPE_DIRECTIVE (stream
, name
, "function");
14170 ASM_OUTPUT_LABEL (stream
, name
);
14173 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
14176 aarch64_start_file (void)
14178 struct cl_target_option
*default_options
14179 = TREE_TARGET_OPTION (target_option_default_node
);
14181 const struct processor
*default_arch
14182 = aarch64_get_arch (default_options
->x_explicit_arch
);
14183 unsigned long default_isa_flags
= default_options
->x_aarch64_isa_flags
;
14184 std::string extension
14185 = aarch64_get_extension_string_for_isa_flags (default_isa_flags
,
14186 default_arch
->flags
);
14188 aarch64_last_printed_arch_string
= default_arch
->name
+ extension
;
14189 aarch64_last_printed_tune_string
= "";
14190 asm_fprintf (asm_out_file
, "\t.arch %s\n",
14191 aarch64_last_printed_arch_string
.c_str ());
14193 default_file_start ();
14196 /* Emit load exclusive. */
14199 aarch64_emit_load_exclusive (machine_mode mode
, rtx rval
,
14200 rtx mem
, rtx model_rtx
)
14202 rtx (*gen
) (rtx
, rtx
, rtx
);
14206 case E_QImode
: gen
= gen_aarch64_load_exclusiveqi
; break;
14207 case E_HImode
: gen
= gen_aarch64_load_exclusivehi
; break;
14208 case E_SImode
: gen
= gen_aarch64_load_exclusivesi
; break;
14209 case E_DImode
: gen
= gen_aarch64_load_exclusivedi
; break;
14211 gcc_unreachable ();
14214 emit_insn (gen (rval
, mem
, model_rtx
));
14217 /* Emit store exclusive. */
14220 aarch64_emit_store_exclusive (machine_mode mode
, rtx bval
,
14221 rtx rval
, rtx mem
, rtx model_rtx
)
14223 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
14227 case E_QImode
: gen
= gen_aarch64_store_exclusiveqi
; break;
14228 case E_HImode
: gen
= gen_aarch64_store_exclusivehi
; break;
14229 case E_SImode
: gen
= gen_aarch64_store_exclusivesi
; break;
14230 case E_DImode
: gen
= gen_aarch64_store_exclusivedi
; break;
14232 gcc_unreachable ();
14235 emit_insn (gen (bval
, rval
, mem
, model_rtx
));
14238 /* Mark the previous jump instruction as unlikely. */
14241 aarch64_emit_unlikely_jump (rtx insn
)
14243 rtx_insn
*jump
= emit_jump_insn (insn
);
14244 add_reg_br_prob_note (jump
, profile_probability::very_unlikely ());
14247 /* Expand a compare and swap pattern. */
14250 aarch64_expand_compare_and_swap (rtx operands
[])
14252 rtx bval
, rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
, x
;
14253 machine_mode mode
, cmp_mode
;
14254 typedef rtx (*gen_cas_fn
) (rtx
, rtx
, rtx
, rtx
, rtx
, rtx
, rtx
);
14257 const gen_cas_fn split_cas
[] =
14259 gen_aarch64_compare_and_swapqi
,
14260 gen_aarch64_compare_and_swaphi
,
14261 gen_aarch64_compare_and_swapsi
,
14262 gen_aarch64_compare_and_swapdi
14264 const gen_cas_fn atomic_cas
[] =
14266 gen_aarch64_compare_and_swapqi_lse
,
14267 gen_aarch64_compare_and_swaphi_lse
,
14268 gen_aarch64_compare_and_swapsi_lse
,
14269 gen_aarch64_compare_and_swapdi_lse
14272 bval
= operands
[0];
14273 rval
= operands
[1];
14275 oldval
= operands
[3];
14276 newval
= operands
[4];
14277 is_weak
= operands
[5];
14278 mod_s
= operands
[6];
14279 mod_f
= operands
[7];
14280 mode
= GET_MODE (mem
);
14283 /* Normally the succ memory model must be stronger than fail, but in the
14284 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14285 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
14287 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f
)))
14288 && is_mm_release (memmodel_from_int (INTVAL (mod_s
))))
14289 mod_s
= GEN_INT (MEMMODEL_ACQ_REL
);
14295 /* For short modes, we're going to perform the comparison in SImode,
14296 so do the zero-extension now. */
14298 rval
= gen_reg_rtx (SImode
);
14299 oldval
= convert_modes (SImode
, mode
, oldval
, true);
14300 /* Fall through. */
14304 /* Force the value into a register if needed. */
14305 if (!aarch64_plus_operand (oldval
, mode
))
14306 oldval
= force_reg (cmp_mode
, oldval
);
14310 gcc_unreachable ();
14315 case E_QImode
: idx
= 0; break;
14316 case E_HImode
: idx
= 1; break;
14317 case E_SImode
: idx
= 2; break;
14318 case E_DImode
: idx
= 3; break;
14320 gcc_unreachable ();
14323 gen
= atomic_cas
[idx
];
14325 gen
= split_cas
[idx
];
14327 emit_insn (gen (rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
));
14329 if (mode
== QImode
|| mode
== HImode
)
14330 emit_move_insn (operands
[1], gen_lowpart (mode
, rval
));
14332 x
= gen_rtx_REG (CCmode
, CC_REGNUM
);
14333 x
= gen_rtx_EQ (SImode
, x
, const0_rtx
);
14334 emit_insn (gen_rtx_SET (bval
, x
));
14337 /* Test whether the target supports using a atomic load-operate instruction.
14338 CODE is the operation and AFTER is TRUE if the data in memory after the
14339 operation should be returned and FALSE if the data before the operation
14340 should be returned. Returns FALSE if the operation isn't supported by the
14344 aarch64_atomic_ldop_supported_p (enum rtx_code code
)
14363 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
14364 sequence implementing an atomic operation. */
14367 aarch64_emit_post_barrier (enum memmodel model
)
14369 const enum memmodel base_model
= memmodel_base (model
);
14371 if (is_mm_sync (model
)
14372 && (base_model
== MEMMODEL_ACQUIRE
14373 || base_model
== MEMMODEL_ACQ_REL
14374 || base_model
== MEMMODEL_SEQ_CST
))
14376 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST
)));
14380 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
14381 for the data in memory. EXPECTED is the value expected to be in memory.
14382 DESIRED is the value to store to memory. MEM is the memory location. MODEL
14383 is the memory ordering to use. */
14386 aarch64_gen_atomic_cas (rtx rval
, rtx mem
,
14387 rtx expected
, rtx desired
,
14390 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
14393 mode
= GET_MODE (mem
);
14397 case E_QImode
: gen
= gen_aarch64_atomic_casqi
; break;
14398 case E_HImode
: gen
= gen_aarch64_atomic_cashi
; break;
14399 case E_SImode
: gen
= gen_aarch64_atomic_cassi
; break;
14400 case E_DImode
: gen
= gen_aarch64_atomic_casdi
; break;
14402 gcc_unreachable ();
14405 /* Move the expected value into the CAS destination register. */
14406 emit_insn (gen_rtx_SET (rval
, expected
));
14408 /* Emit the CAS. */
14409 emit_insn (gen (rval
, mem
, desired
, model
));
14411 /* Compare the expected value with the value loaded by the CAS, to establish
14412 whether the swap was made. */
14413 aarch64_gen_compare_reg (EQ
, rval
, expected
);
14416 /* Split a compare and swap pattern. */
14419 aarch64_split_compare_and_swap (rtx operands
[])
14421 rtx rval
, mem
, oldval
, newval
, scratch
;
14424 rtx_code_label
*label1
, *label2
;
14426 enum memmodel model
;
14429 rval
= operands
[0];
14431 oldval
= operands
[2];
14432 newval
= operands
[3];
14433 is_weak
= (operands
[4] != const0_rtx
);
14434 model_rtx
= operands
[5];
14435 scratch
= operands
[7];
14436 mode
= GET_MODE (mem
);
14437 model
= memmodel_from_int (INTVAL (model_rtx
));
14439 /* When OLDVAL is zero and we want the strong version we can emit a tighter
14442 LD[A]XR rval, [mem]
14444 ST[L]XR scratch, newval, [mem]
14445 CBNZ scratch, .label1
14448 bool strong_zero_p
= !is_weak
&& oldval
== const0_rtx
;
14453 label1
= gen_label_rtx ();
14454 emit_label (label1
);
14456 label2
= gen_label_rtx ();
14458 /* The initial load can be relaxed for a __sync operation since a final
14459 barrier will be emitted to stop code hoisting. */
14460 if (is_mm_sync (model
))
14461 aarch64_emit_load_exclusive (mode
, rval
, mem
,
14462 GEN_INT (MEMMODEL_RELAXED
));
14464 aarch64_emit_load_exclusive (mode
, rval
, mem
, model_rtx
);
14468 x
= gen_rtx_NE (VOIDmode
, rval
, const0_rtx
);
14469 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
14470 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
14471 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
14475 cond
= aarch64_gen_compare_reg (NE
, rval
, oldval
);
14476 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
14477 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
14478 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
14479 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
14482 aarch64_emit_store_exclusive (mode
, scratch
, mem
, newval
, model_rtx
);
14486 x
= gen_rtx_NE (VOIDmode
, scratch
, const0_rtx
);
14487 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
14488 gen_rtx_LABEL_REF (Pmode
, label1
), pc_rtx
);
14489 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
14493 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
14494 x
= gen_rtx_COMPARE (CCmode
, scratch
, const0_rtx
);
14495 emit_insn (gen_rtx_SET (cond
, x
));
14498 emit_label (label2
);
14499 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
14500 to set the condition flags. If this is not used it will be removed by
14504 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
14505 x
= gen_rtx_COMPARE (CCmode
, rval
, const0_rtx
);
14506 emit_insn (gen_rtx_SET (cond
, x
));
14508 /* Emit any final barrier needed for a __sync operation. */
14509 if (is_mm_sync (model
))
14510 aarch64_emit_post_barrier (model
);
14513 /* Emit a BIC instruction. */
14516 aarch64_emit_bic (machine_mode mode
, rtx dst
, rtx s1
, rtx s2
, int shift
)
14518 rtx shift_rtx
= GEN_INT (shift
);
14519 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
14523 case E_SImode
: gen
= gen_and_one_cmpl_lshrsi3
; break;
14524 case E_DImode
: gen
= gen_and_one_cmpl_lshrdi3
; break;
14526 gcc_unreachable ();
14529 emit_insn (gen (dst
, s2
, shift_rtx
, s1
));
14532 /* Emit an atomic swap. */
14535 aarch64_emit_atomic_swap (machine_mode mode
, rtx dst
, rtx value
,
14536 rtx mem
, rtx model
)
14538 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
14542 case E_QImode
: gen
= gen_aarch64_atomic_swpqi
; break;
14543 case E_HImode
: gen
= gen_aarch64_atomic_swphi
; break;
14544 case E_SImode
: gen
= gen_aarch64_atomic_swpsi
; break;
14545 case E_DImode
: gen
= gen_aarch64_atomic_swpdi
; break;
14547 gcc_unreachable ();
14550 emit_insn (gen (dst
, mem
, value
, model
));
14553 /* Operations supported by aarch64_emit_atomic_load_op. */
14555 enum aarch64_atomic_load_op_code
14557 AARCH64_LDOP_PLUS
, /* A + B */
14558 AARCH64_LDOP_XOR
, /* A ^ B */
14559 AARCH64_LDOP_OR
, /* A | B */
14560 AARCH64_LDOP_BIC
/* A & ~B */
14563 /* Emit an atomic load-operate. */
14566 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code
,
14567 machine_mode mode
, rtx dst
, rtx src
,
14568 rtx mem
, rtx model
)
14570 typedef rtx (*aarch64_atomic_load_op_fn
) (rtx
, rtx
, rtx
, rtx
);
14571 const aarch64_atomic_load_op_fn plus
[] =
14573 gen_aarch64_atomic_loadaddqi
,
14574 gen_aarch64_atomic_loadaddhi
,
14575 gen_aarch64_atomic_loadaddsi
,
14576 gen_aarch64_atomic_loadadddi
14578 const aarch64_atomic_load_op_fn eor
[] =
14580 gen_aarch64_atomic_loadeorqi
,
14581 gen_aarch64_atomic_loadeorhi
,
14582 gen_aarch64_atomic_loadeorsi
,
14583 gen_aarch64_atomic_loadeordi
14585 const aarch64_atomic_load_op_fn ior
[] =
14587 gen_aarch64_atomic_loadsetqi
,
14588 gen_aarch64_atomic_loadsethi
,
14589 gen_aarch64_atomic_loadsetsi
,
14590 gen_aarch64_atomic_loadsetdi
14592 const aarch64_atomic_load_op_fn bic
[] =
14594 gen_aarch64_atomic_loadclrqi
,
14595 gen_aarch64_atomic_loadclrhi
,
14596 gen_aarch64_atomic_loadclrsi
,
14597 gen_aarch64_atomic_loadclrdi
14599 aarch64_atomic_load_op_fn gen
;
14604 case E_QImode
: idx
= 0; break;
14605 case E_HImode
: idx
= 1; break;
14606 case E_SImode
: idx
= 2; break;
14607 case E_DImode
: idx
= 3; break;
14609 gcc_unreachable ();
14614 case AARCH64_LDOP_PLUS
: gen
= plus
[idx
]; break;
14615 case AARCH64_LDOP_XOR
: gen
= eor
[idx
]; break;
14616 case AARCH64_LDOP_OR
: gen
= ior
[idx
]; break;
14617 case AARCH64_LDOP_BIC
: gen
= bic
[idx
]; break;
14619 gcc_unreachable ();
14622 emit_insn (gen (dst
, mem
, src
, model
));
14625 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
14626 location to store the data read from memory. OUT_RESULT is the location to
14627 store the result of the operation. MEM is the memory location to read and
14628 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
14629 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
14633 aarch64_gen_atomic_ldop (enum rtx_code code
, rtx out_data
, rtx out_result
,
14634 rtx mem
, rtx value
, rtx model_rtx
)
14636 machine_mode mode
= GET_MODE (mem
);
14637 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
14638 const bool short_mode
= (mode
< SImode
);
14639 aarch64_atomic_load_op_code ldop_code
;
14644 out_data
= gen_lowpart (mode
, out_data
);
14647 out_result
= gen_lowpart (mode
, out_result
);
14649 /* Make sure the value is in a register, putting it into a destination
14650 register if it needs to be manipulated. */
14651 if (!register_operand (value
, mode
)
14652 || code
== AND
|| code
== MINUS
)
14654 src
= out_result
? out_result
: out_data
;
14655 emit_move_insn (src
, gen_lowpart (mode
, value
));
14659 gcc_assert (register_operand (src
, mode
));
14661 /* Preprocess the data for the operation as necessary. If the operation is
14662 a SET then emit a swap instruction and finish. */
14666 aarch64_emit_atomic_swap (mode
, out_data
, src
, mem
, model_rtx
);
14670 /* Negate the value and treat it as a PLUS. */
14674 /* Resize the value if necessary. */
14676 src
= gen_lowpart (wmode
, src
);
14678 neg_src
= gen_rtx_NEG (wmode
, src
);
14679 emit_insn (gen_rtx_SET (src
, neg_src
));
14682 src
= gen_lowpart (mode
, src
);
14684 /* Fall-through. */
14686 ldop_code
= AARCH64_LDOP_PLUS
;
14690 ldop_code
= AARCH64_LDOP_OR
;
14694 ldop_code
= AARCH64_LDOP_XOR
;
14701 /* Resize the value if necessary. */
14703 src
= gen_lowpart (wmode
, src
);
14705 not_src
= gen_rtx_NOT (wmode
, src
);
14706 emit_insn (gen_rtx_SET (src
, not_src
));
14709 src
= gen_lowpart (mode
, src
);
14711 ldop_code
= AARCH64_LDOP_BIC
;
14715 /* The operation can't be done with atomic instructions. */
14716 gcc_unreachable ();
14719 aarch64_emit_atomic_load_op (ldop_code
, mode
, out_data
, src
, mem
, model_rtx
);
14721 /* If necessary, calculate the data in memory after the update by redoing the
14722 operation from values in registers. */
14728 src
= gen_lowpart (wmode
, src
);
14729 out_data
= gen_lowpart (wmode
, out_data
);
14730 out_result
= gen_lowpart (wmode
, out_result
);
14739 x
= gen_rtx_PLUS (wmode
, out_data
, src
);
14742 x
= gen_rtx_IOR (wmode
, out_data
, src
);
14745 x
= gen_rtx_XOR (wmode
, out_data
, src
);
14748 aarch64_emit_bic (wmode
, out_result
, out_data
, src
, 0);
14751 gcc_unreachable ();
14754 emit_set_insn (out_result
, x
);
14759 /* Split an atomic operation. */
14762 aarch64_split_atomic_op (enum rtx_code code
, rtx old_out
, rtx new_out
, rtx mem
,
14763 rtx value
, rtx model_rtx
, rtx cond
)
14765 machine_mode mode
= GET_MODE (mem
);
14766 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
14767 const enum memmodel model
= memmodel_from_int (INTVAL (model_rtx
));
14768 const bool is_sync
= is_mm_sync (model
);
14769 rtx_code_label
*label
;
14772 /* Split the atomic operation into a sequence. */
14773 label
= gen_label_rtx ();
14774 emit_label (label
);
14777 new_out
= gen_lowpart (wmode
, new_out
);
14779 old_out
= gen_lowpart (wmode
, old_out
);
14782 value
= simplify_gen_subreg (wmode
, value
, mode
, 0);
14784 /* The initial load can be relaxed for a __sync operation since a final
14785 barrier will be emitted to stop code hoisting. */
14787 aarch64_emit_load_exclusive (mode
, old_out
, mem
,
14788 GEN_INT (MEMMODEL_RELAXED
));
14790 aarch64_emit_load_exclusive (mode
, old_out
, mem
, model_rtx
);
14799 x
= gen_rtx_AND (wmode
, old_out
, value
);
14800 emit_insn (gen_rtx_SET (new_out
, x
));
14801 x
= gen_rtx_NOT (wmode
, new_out
);
14802 emit_insn (gen_rtx_SET (new_out
, x
));
14806 if (CONST_INT_P (value
))
14808 value
= GEN_INT (-INTVAL (value
));
14811 /* Fall through. */
14814 x
= gen_rtx_fmt_ee (code
, wmode
, old_out
, value
);
14815 emit_insn (gen_rtx_SET (new_out
, x
));
14819 aarch64_emit_store_exclusive (mode
, cond
, mem
,
14820 gen_lowpart (mode
, new_out
), model_rtx
);
14822 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
14823 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
14824 gen_rtx_LABEL_REF (Pmode
, label
), pc_rtx
);
14825 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
14827 /* Emit any final barrier needed for a __sync operation. */
14829 aarch64_emit_post_barrier (model
);
14833 aarch64_init_libfuncs (void)
14835 /* Half-precision float operations. The compiler handles all operations
14836 with NULL libfuncs by converting to SFmode. */
14839 set_conv_libfunc (trunc_optab
, HFmode
, SFmode
, "__gnu_f2h_ieee");
14840 set_conv_libfunc (sext_optab
, SFmode
, HFmode
, "__gnu_h2f_ieee");
14843 set_optab_libfunc (add_optab
, HFmode
, NULL
);
14844 set_optab_libfunc (sdiv_optab
, HFmode
, NULL
);
14845 set_optab_libfunc (smul_optab
, HFmode
, NULL
);
14846 set_optab_libfunc (neg_optab
, HFmode
, NULL
);
14847 set_optab_libfunc (sub_optab
, HFmode
, NULL
);
14850 set_optab_libfunc (eq_optab
, HFmode
, NULL
);
14851 set_optab_libfunc (ne_optab
, HFmode
, NULL
);
14852 set_optab_libfunc (lt_optab
, HFmode
, NULL
);
14853 set_optab_libfunc (le_optab
, HFmode
, NULL
);
14854 set_optab_libfunc (ge_optab
, HFmode
, NULL
);
14855 set_optab_libfunc (gt_optab
, HFmode
, NULL
);
14856 set_optab_libfunc (unord_optab
, HFmode
, NULL
);
14859 /* Target hook for c_mode_for_suffix. */
14860 static machine_mode
14861 aarch64_c_mode_for_suffix (char suffix
)
/* We can only represent floating point constants which will fit in
   "quarter-precision" values.  These values are characterised by
   a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
   by:

     (-1)^s * (n/16) * 2^r

   where:
     's' is the sign bit.
     'n' is an integer in the range 16 <= n <= 31.
     'r' is an integer in the range -3 <= r <= 4.  */
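
/* Illustrative values: 1.0 = (16/16) * 2^0, 2.5 = (20/16) * 2^1,
   31.0 = (31/16) * 2^4 (the largest representable value) and
   0.125 = (16/16) * 2^-3 (the smallest positive value) all fit;
   0.1 does not, since it has no finite binary fraction of this form.  */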
14881 /* Return true iff X can be represented by a quarter-precision
14882 floating point immediate operand X. Note, we cannot represent 0.0. */
14884 aarch64_float_const_representable_p (rtx x
)
14886 /* This represents our current view of how many bits
14887 make up the mantissa. */
14888 int point_pos
= 2 * HOST_BITS_PER_WIDE_INT
- 1;
14890 unsigned HOST_WIDE_INT mantissa
, mask
;
14891 REAL_VALUE_TYPE r
, m
;
14894 if (!CONST_DOUBLE_P (x
))
14897 /* We don't support HFmode constants yet. */
14898 if (GET_MODE (x
) == VOIDmode
|| GET_MODE (x
) == HFmode
)
14901 r
= *CONST_DOUBLE_REAL_VALUE (x
);
14903 /* We cannot represent infinities, NaNs or +/-zero. We won't
14904 know if we have +zero until we analyse the mantissa, but we
14905 can reject the other invalid values. */
14906 if (REAL_VALUE_ISINF (r
) || REAL_VALUE_ISNAN (r
)
14907 || REAL_VALUE_MINUS_ZERO (r
))
14910 /* Extract exponent. */
14911 r
= real_value_abs (&r
);
14912 exponent
= REAL_EXP (&r
);
14914 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
14915 highest (sign) bit, with a fixed binary point at bit point_pos.
14916 m1 holds the low part of the mantissa, m2 the high part.
14917 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14918 bits for the mantissa, this can fail (low bits will be lost). */
14919 real_ldexp (&m
, &r
, point_pos
- exponent
);
14920 wide_int w
= real_to_integer (&m
, &fail
, HOST_BITS_PER_WIDE_INT
* 2);
14922 /* If the low part of the mantissa has bits set we cannot represent
14924 if (w
.ulow () != 0)
14926 /* We have rejected the lower HOST_WIDE_INT, so update our
14927 understanding of how many bits lie in the mantissa and
14928 look only at the high HOST_WIDE_INT. */
14929 mantissa
= w
.elt (1);
14930 point_pos
-= HOST_BITS_PER_WIDE_INT
;
14932 /* We can only represent values with a mantissa of the form 1.xxxx. */
14933 mask
= ((unsigned HOST_WIDE_INT
)1 << (point_pos
- 5)) - 1;
14934 if ((mantissa
& mask
) != 0)
14937 /* Having filtered unrepresentable values, we may now remove all
14938 but the highest 5 bits. */
14939 mantissa
>>= point_pos
- 5;
14941 /* We cannot represent the value 0.0, so reject it. This is handled
14946 /* Then, as bit 4 is always set, we can mask it off, leaving
14947 the mantissa in the range [0, 15]. */
14948 mantissa
&= ~(1 << 4);
14949 gcc_assert (mantissa
<= 15);
14951 /* GCC internally does not use IEEE754-like encoding (where normalized
14952 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
14953 Our mantissa values are shifted 4 places to the left relative to
14954 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14955 by 5 places to correct for GCC's representation. */
14956 exponent
= 5 - exponent
;
14958 return (exponent
>= 0 && exponent
<= 7);
14961 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14962 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
14963 output MOVI/MVNI, ORR or BIC immediate. */
14965 aarch64_output_simd_mov_immediate (rtx const_vector
, unsigned width
,
14966 enum simd_immediate_check which
)
14969 static char templ
[40];
14970 const char *mnemonic
;
14971 const char *shift_op
;
14972 unsigned int lane_count
= 0;
14975 struct simd_immediate_info info
;
14977 /* This will return true to show const_vector is legal for use as either
14978 a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14979 It will also update INFO to show how the immediate should be generated.
14980 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
14981 is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
, which
);
14982 gcc_assert (is_valid
);
14984 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
14985 lane_count
= width
/ GET_MODE_BITSIZE (info
.elt_mode
);
14987 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
14989 gcc_assert (info
.shift
== 0 && info
.insn
== simd_immediate_info::MOV
);
14990 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
14991 move immediate path. */
14992 if (aarch64_float_const_zero_rtx_p (info
.value
))
14993 info
.value
= GEN_INT (0);
14996 const unsigned int buf_size
= 20;
14997 char float_buf
[buf_size
] = {'\0'};
14998 real_to_decimal_for_mode (float_buf
,
14999 CONST_DOUBLE_REAL_VALUE (info
.value
),
15000 buf_size
, buf_size
, 1, info
.elt_mode
);
15002 if (lane_count
== 1)
15003 snprintf (templ
, sizeof (templ
), "fmov\t%%d0, %s", float_buf
);
15005 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%d%c, %s",
15006 lane_count
, element_char
, float_buf
);
15011 gcc_assert (CONST_INT_P (info
.value
));
15013 if (which
== AARCH64_CHECK_MOV
)
15015 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "mvni" : "movi";
15016 shift_op
= info
.modifier
== simd_immediate_info::MSL
? "msl" : "lsl";
15017 if (lane_count
== 1)
15018 snprintf (templ
, sizeof (templ
), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX
,
15019 mnemonic
, UINTVAL (info
.value
));
15020 else if (info
.shift
)
15021 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
15022 HOST_WIDE_INT_PRINT_HEX
", %s %d", mnemonic
, lane_count
,
15023 element_char
, UINTVAL (info
.value
), shift_op
, info
.shift
);
15025 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
15026 HOST_WIDE_INT_PRINT_HEX
, mnemonic
, lane_count
,
15027 element_char
, UINTVAL (info
.value
));
15031 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
15032 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "bic" : "orr";
15034 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
15035 HOST_WIDE_INT_PRINT_DEC
", %s #%d", mnemonic
, lane_count
,
15036 element_char
, UINTVAL (info
.value
), "lsl", info
.shift
);
15038 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
15039 HOST_WIDE_INT_PRINT_DEC
, mnemonic
, lane_count
,
15040 element_char
, UINTVAL (info
.value
));
15046 aarch64_output_scalar_simd_mov_immediate (rtx immediate
, scalar_int_mode mode
)
15049 /* If a floating point number was passed and we desire to use it in an
15050 integer mode do the conversion to integer. */
15051 if (CONST_DOUBLE_P (immediate
) && GET_MODE_CLASS (mode
) == MODE_INT
)
15053 unsigned HOST_WIDE_INT ival
;
15054 if (!aarch64_reinterpret_float_as_int (immediate
, &ival
))
15055 gcc_unreachable ();
15056 immediate
= gen_int_mode (ival
, mode
);
15059 machine_mode vmode
;
15060 /* use a 64 bit mode for everything except for DI/DF mode, where we use
15061 a 128 bit vector mode. */
15062 int width
= GET_MODE_BITSIZE (mode
) == 64 ? 128 : 64;
15064 vmode
= aarch64_simd_container_mode (mode
, width
);
15065 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (immediate
));
15066 return aarch64_output_simd_mov_immediate (v_op
, width
);
15069 /* Return the output string to use for moving immediate CONST_VECTOR
15070 into an SVE register. */
15073 aarch64_output_sve_mov_immediate (rtx const_vector
)
15075 static char templ
[40];
15076 struct simd_immediate_info info
;
15079 bool is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
);
15080 gcc_assert (is_valid
);
15082 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
15086 snprintf (templ
, sizeof (templ
), "index\t%%0.%c, #"
15087 HOST_WIDE_INT_PRINT_DEC
", #" HOST_WIDE_INT_PRINT_DEC
,
15088 element_char
, INTVAL (info
.value
), INTVAL (info
.step
));
15092 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
15094 if (aarch64_float_const_zero_rtx_p (info
.value
))
15095 info
.value
= GEN_INT (0);
15098 const int buf_size
= 20;
15099 char float_buf
[buf_size
] = {};
15100 real_to_decimal_for_mode (float_buf
,
15101 CONST_DOUBLE_REAL_VALUE (info
.value
),
15102 buf_size
, buf_size
, 1, info
.elt_mode
);
15104 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%c, #%s",
15105 element_char
, float_buf
);
15110 snprintf (templ
, sizeof (templ
), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC
,
15111 element_char
, INTVAL (info
.value
));
15115 /* Return the asm format for a PTRUE instruction whose destination has
15116 mode MODE. SUFFIX is the element size suffix. */
15119 aarch64_output_ptrue (machine_mode mode
, char suffix
)
15121 unsigned int nunits
;
15122 static char buf
[sizeof ("ptrue\t%0.N, vlNNNNN")];
15123 if (GET_MODE_NUNITS (mode
).is_constant (&nunits
))
15124 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, vl%d", suffix
, nunits
);
15126 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, all", suffix
);
15130 /* Split operands into moves from op[1] + op[2] into op[0]. */
15133 aarch64_split_combinev16qi (rtx operands
[3])
15135 unsigned int dest
= REGNO (operands
[0]);
15136 unsigned int src1
= REGNO (operands
[1]);
15137 unsigned int src2
= REGNO (operands
[2]);
15138 machine_mode halfmode
= GET_MODE (operands
[1]);
15139 unsigned int halfregs
= REG_NREGS (operands
[1]);
15140 rtx destlo
, desthi
;
15142 gcc_assert (halfmode
== V16QImode
);
15144 if (src1
== dest
&& src2
== dest
+ halfregs
)
15146 /* No-op move. Can't split to nothing; emit something. */
15147 emit_note (NOTE_INSN_DELETED
);
15151 /* Preserve register attributes for variable tracking. */
15152 destlo
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
, 0);
15153 desthi
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
+ halfregs
,
15154 GET_MODE_SIZE (halfmode
));
15156 /* Special case of reversed high/low parts. */
15157 if (reg_overlap_mentioned_p (operands
[2], destlo
)
15158 && reg_overlap_mentioned_p (operands
[1], desthi
))
15160 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
15161 emit_insn (gen_xorv16qi3 (operands
[2], operands
[1], operands
[2]));
15162 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
15164 else if (!reg_overlap_mentioned_p (operands
[2], destlo
))
15166 /* Try to avoid unnecessary moves if part of the result
15167 is in the right place already. */
15169 emit_move_insn (destlo
, operands
[1]);
15170 if (src2
!= dest
+ halfregs
)
15171 emit_move_insn (desthi
, operands
[2]);
15175 if (src2
!= dest
+ halfregs
)
15176 emit_move_insn (desthi
, operands
[2]);
15178 emit_move_insn (destlo
, operands
[1]);
15182 /* vec_perm support. */
15184 struct expand_vec_perm_d
15186 rtx target
, op0
, op1
;
15187 vec_perm_indices perm
;
15188 machine_mode vmode
;
15189 unsigned int vec_flags
;
15194 /* Generate a variable permutation. */
15197 aarch64_expand_vec_perm_1 (rtx target
, rtx op0
, rtx op1
, rtx sel
)
15199 machine_mode vmode
= GET_MODE (target
);
15200 bool one_vector_p
= rtx_equal_p (op0
, op1
);
15202 gcc_checking_assert (vmode
== V8QImode
|| vmode
== V16QImode
);
15203 gcc_checking_assert (GET_MODE (op0
) == vmode
);
15204 gcc_checking_assert (GET_MODE (op1
) == vmode
);
15205 gcc_checking_assert (GET_MODE (sel
) == vmode
);
15206 gcc_checking_assert (TARGET_SIMD
);
15210 if (vmode
== V8QImode
)
15212 /* Expand the argument to a V16QI mode by duplicating it. */
15213 rtx pair
= gen_reg_rtx (V16QImode
);
15214 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op0
));
15215 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
15219 emit_insn (gen_aarch64_tbl1v16qi (target
, op0
, sel
));
15226 if (vmode
== V8QImode
)
15228 pair
= gen_reg_rtx (V16QImode
);
15229 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op1
));
15230 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
15234 pair
= gen_reg_rtx (OImode
);
15235 emit_insn (gen_aarch64_combinev16qi (pair
, op0
, op1
));
15236 emit_insn (gen_aarch64_tbl2v16qi (target
, pair
, sel
));
15241 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15242 NELT is the number of elements in the vector. */
15245 aarch64_expand_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
,
15248 machine_mode vmode
= GET_MODE (target
);
15249 bool one_vector_p
= rtx_equal_p (op0
, op1
);
15252 /* The TBL instruction does not use a modulo index, so we must take care
15253 of that ourselves. */
15254 mask
= aarch64_simd_gen_const_vector_dup (vmode
,
15255 one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
15256 sel
= expand_simple_binop (vmode
, AND
, sel
, mask
, NULL
, 0, OPTAB_LIB_WIDEN
);
15258 /* For big-endian, we also need to reverse the index within the vector
15259 (but not which vector). */
15260 if (BYTES_BIG_ENDIAN
)
15262 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
15264 mask
= aarch64_simd_gen_const_vector_dup (vmode
, nelt
- 1);
15265 sel
= expand_simple_binop (vmode
, XOR
, sel
, mask
,
15266 NULL
, 0, OPTAB_LIB_WIDEN
);
15268 aarch64_expand_vec_perm_1 (target
, op0
, op1
, sel
);
15271 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
15274 emit_unspec2 (rtx target
, int code
, rtx op0
, rtx op1
)
15276 emit_insn (gen_rtx_SET (target
,
15277 gen_rtx_UNSPEC (GET_MODE (target
),
15278 gen_rtvec (2, op0
, op1
), code
)));
15281 /* Expand an SVE vec_perm with the given operands. */
15284 aarch64_expand_sve_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
)
15286 machine_mode data_mode
= GET_MODE (target
);
15287 machine_mode sel_mode
= GET_MODE (sel
);
15288 /* Enforced by the pattern condition. */
15289 int nunits
= GET_MODE_NUNITS (sel_mode
).to_constant ();
15291 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15292 size of the two value vectors, i.e. the upper bits of the indices
15293 are effectively ignored. SVE TBL instead produces 0 for any
15294 out-of-range indices, so we need to modulo all the vec_perm indices
15295 to ensure they are all in range. */
15296 rtx sel_reg
= force_reg (sel_mode
, sel
);
15298 /* Check if the sel only references the first values vector. */
15299 if (GET_CODE (sel
) == CONST_VECTOR
15300 && aarch64_const_vec_all_in_range_p (sel
, 0, nunits
- 1))
15302 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_reg
);
15306 /* Check if the two values vectors are the same. */
15307 if (rtx_equal_p (op0
, op1
))
15309 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
, nunits
- 1);
15310 rtx sel_mod
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
15311 NULL
, 0, OPTAB_DIRECT
);
15312 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_mod
);
15316 /* Run TBL on for each value vector and combine the results. */
15318 rtx res0
= gen_reg_rtx (data_mode
);
15319 rtx res1
= gen_reg_rtx (data_mode
);
15320 rtx neg_num_elems
= aarch64_simd_gen_const_vector_dup (sel_mode
, -nunits
);
15321 if (GET_CODE (sel
) != CONST_VECTOR
15322 || !aarch64_const_vec_all_in_range_p (sel
, 0, 2 * nunits
- 1))
15324 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
,
15326 sel_reg
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
15327 NULL
, 0, OPTAB_DIRECT
);
15329 emit_unspec2 (res0
, UNSPEC_TBL
, op0
, sel_reg
);
15330 rtx sel_sub
= expand_simple_binop (sel_mode
, PLUS
, sel_reg
, neg_num_elems
,
15331 NULL
, 0, OPTAB_DIRECT
);
15332 emit_unspec2 (res1
, UNSPEC_TBL
, op1
, sel_sub
);
15333 if (GET_MODE_CLASS (data_mode
) == MODE_VECTOR_INT
)
15334 emit_insn (gen_rtx_SET (target
, gen_rtx_IOR (data_mode
, res0
, res1
)));
15336 emit_unspec2 (target
, UNSPEC_IORF
, res0
, res1
);
15339 /* Recognize patterns suitable for the TRN instructions. */
15341 aarch64_evpc_trn (struct expand_vec_perm_d
*d
)
15344 poly_uint64 nelt
= d
->perm
.length ();
15345 rtx out
, in0
, in1
, x
;
15346 machine_mode vmode
= d
->vmode
;
15348 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
15351 /* Note that these are little-endian tests.
15352 We correct for big-endian later. */
15353 if (!d
->perm
[0].is_constant (&odd
)
15354 || (odd
!= 0 && odd
!= 1)
15355 || !d
->perm
.series_p (0, 2, odd
, 2)
15356 || !d
->perm
.series_p (1, 2, nelt
+ odd
, 2))
15365 /* We don't need a big-endian lane correction for SVE; see the comment
15366 at the head of aarch64-sve.md for details. */
15367 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
15369 x
= in0
, in0
= in1
, in1
= x
;
15374 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
15375 odd
? UNSPEC_TRN2
: UNSPEC_TRN1
));
15379 /* Recognize patterns suitable for the UZP instructions. */
15381 aarch64_evpc_uzp (struct expand_vec_perm_d
*d
)
15384 rtx out
, in0
, in1
, x
;
15385 machine_mode vmode
= d
->vmode
;
15387 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
15390 /* Note that these are little-endian tests.
15391 We correct for big-endian later. */
15392 if (!d
->perm
[0].is_constant (&odd
)
15393 || (odd
!= 0 && odd
!= 1)
15394 || !d
->perm
.series_p (0, 1, odd
, 2))
15403 /* We don't need a big-endian lane correction for SVE; see the comment
15404 at the head of aarch64-sve.md for details. */
15405 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
15407 x
= in0
, in0
= in1
, in1
= x
;
15412 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
15413 odd
? UNSPEC_UZP2
: UNSPEC_UZP1
));
15417 /* Recognize patterns suitable for the ZIP instructions. */
15419 aarch64_evpc_zip (struct expand_vec_perm_d
*d
)
15422 poly_uint64 nelt
= d
->perm
.length ();
15423 rtx out
, in0
, in1
, x
;
15424 machine_mode vmode
= d
->vmode
;
15426 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
15429 /* Note that these are little-endian tests.
15430 We correct for big-endian later. */
15431 poly_uint64 first
= d
->perm
[0];
15432 if ((maybe_ne (first
, 0U) && maybe_ne (first
* 2, nelt
))
15433 || !d
->perm
.series_p (0, 2, first
, 1)
15434 || !d
->perm
.series_p (1, 2, first
+ nelt
, 1))
15436 high
= maybe_ne (first
, 0U);
15444 /* We don't need a big-endian lane correction for SVE; see the comment
15445 at the head of aarch64-sve.md for details. */
15446 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
15448 x
= in0
, in0
= in1
, in1
= x
;
15453 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
15454 high
? UNSPEC_ZIP2
: UNSPEC_ZIP1
));
15458 /* Recognize patterns for the EXT insn. */
15461 aarch64_evpc_ext (struct expand_vec_perm_d
*d
)
15463 HOST_WIDE_INT location
;
15466 /* The first element always refers to the first vector.
15467 Check if the extracted indices are increasing by one. */
15468 if (d
->vec_flags
== VEC_SVE_PRED
15469 || !d
->perm
[0].is_constant (&location
)
15470 || !d
->perm
.series_p (0, 1, location
, 1))
15477 /* The case where (location == 0) is a no-op for both big- and little-endian,
15478 and is removed by the mid-end at optimization levels -O1 and higher.
15480 We don't need a big-endian lane correction for SVE; see the comment
15481 at the head of aarch64-sve.md for details. */
15482 if (BYTES_BIG_ENDIAN
&& location
!= 0 && d
->vec_flags
== VEC_ADVSIMD
)
15484 /* After setup, we want the high elements of the first vector (stored
15485 at the LSB end of the register), and the low elements of the second
15486 vector (stored at the MSB end of the register). So swap. */
15487 std::swap (d
->op0
, d
->op1
);
15488 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15489 to_constant () is safe since this is restricted to Advanced SIMD
15491 location
= d
->perm
.length ().to_constant () - location
;
15494 offset
= GEN_INT (location
);
15495 emit_set_insn (d
->target
,
15496 gen_rtx_UNSPEC (d
->vmode
,
15497 gen_rtvec (3, d
->op0
, d
->op1
, offset
),
15502 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15503 within each 64-bit, 32-bit or 16-bit granule. */
15506 aarch64_evpc_rev_local (struct expand_vec_perm_d
*d
)
15508 HOST_WIDE_INT diff
;
15509 unsigned int i
, size
, unspec
;
15510 machine_mode pred_mode
;
15512 if (d
->vec_flags
== VEC_SVE_PRED
15513 || !d
->one_vector_p
15514 || !d
->perm
[0].is_constant (&diff
))
15517 size
= (diff
+ 1) * GET_MODE_UNIT_SIZE (d
->vmode
);
15520 unspec
= UNSPEC_REV64
;
15521 pred_mode
= VNx2BImode
;
15523 else if (size
== 4)
15525 unspec
= UNSPEC_REV32
;
15526 pred_mode
= VNx4BImode
;
15528 else if (size
== 2)
15530 unspec
= UNSPEC_REV16
;
15531 pred_mode
= VNx8BImode
;
15536 unsigned int step
= diff
+ 1;
15537 for (i
= 0; i
< step
; ++i
)
15538 if (!d
->perm
.series_p (i
, step
, diff
- i
, step
))
15545 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), unspec
);
15546 if (d
->vec_flags
== VEC_SVE_DATA
)
15548 rtx pred
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
15549 src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (2, pred
, src
),
15550 UNSPEC_MERGE_PTRUE
);
15552 emit_set_insn (d
->target
, src
);
15556 /* Recognize patterns for the REV insn, which reverses elements within
15560 aarch64_evpc_rev_global (struct expand_vec_perm_d
*d
)
15562 poly_uint64 nelt
= d
->perm
.length ();
15564 if (!d
->one_vector_p
|| d
->vec_flags
!= VEC_SVE_DATA
)
15567 if (!d
->perm
.series_p (0, 1, nelt
- 1, -1))
15574 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), UNSPEC_REV
);
15575 emit_set_insn (d
->target
, src
);
15580 aarch64_evpc_dup (struct expand_vec_perm_d
*d
)
15582 rtx out
= d
->target
;
15585 machine_mode vmode
= d
->vmode
;
15588 if (d
->vec_flags
== VEC_SVE_PRED
15589 || d
->perm
.encoding ().encoded_nelts () != 1
15590 || !d
->perm
[0].is_constant (&elt
))
15593 if (d
->vec_flags
== VEC_SVE_DATA
&& elt
>= 64 * GET_MODE_UNIT_SIZE (vmode
))
15600 /* The generic preparation in aarch64_expand_vec_perm_const_1
15601 swaps the operand order and the permute indices if it finds
15602 d->perm[0] to be in the second operand. Thus, we can always
15603 use d->op0 and need not do any extra arithmetic to get the
15604 correct lane number. */
15606 lane
= GEN_INT (elt
); /* The pattern corrects for big-endian. */
15608 rtx parallel
= gen_rtx_PARALLEL (vmode
, gen_rtvec (1, lane
));
15609 rtx select
= gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode
), in0
, parallel
);
15610 emit_set_insn (out
, gen_rtx_VEC_DUPLICATE (vmode
, select
));
15615 aarch64_evpc_tbl (struct expand_vec_perm_d
*d
)
15617 rtx rperm
[MAX_COMPILE_TIME_VEC_BYTES
], sel
;
15618 machine_mode vmode
= d
->vmode
;
15620 /* Make sure that the indices are constant. */
15621 unsigned int encoded_nelts
= d
->perm
.encoding ().encoded_nelts ();
15622 for (unsigned int i
= 0; i
< encoded_nelts
; ++i
)
15623 if (!d
->perm
[i
].is_constant ())
15629 /* Generic code will try constant permutation twice. Once with the
15630 original mode and again with the elements lowered to QImode.
15631 So wait and don't do the selector expansion ourselves. */
15632 if (vmode
!= V8QImode
&& vmode
!= V16QImode
)
15635 /* to_constant is safe since this routine is specific to Advanced SIMD
15637 unsigned int nelt
= d
->perm
.length ().to_constant ();
15638 for (unsigned int i
= 0; i
< nelt
; ++i
)
15639 /* If big-endian and two vectors we end up with a weird mixed-endian
15640 mode on NEON. Reverse the index within each word but not the word
15641 itself. to_constant is safe because we checked is_constant above. */
15642 rperm
[i
] = GEN_INT (BYTES_BIG_ENDIAN
15643 ? d
->perm
[i
].to_constant () ^ (nelt
- 1)
15644 : d
->perm
[i
].to_constant ());
15646 sel
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
15647 sel
= force_reg (vmode
, sel
);
15649 aarch64_expand_vec_perm_1 (d
->target
, d
->op0
, d
->op1
, sel
);
15653 /* Try to implement D using an SVE TBL instruction. */
15656 aarch64_evpc_sve_tbl (struct expand_vec_perm_d
*d
)
15658 unsigned HOST_WIDE_INT nelt
;
15660 /* Permuting two variable-length vectors could overflow the
15662 if (!d
->one_vector_p
&& !d
->perm
.length ().is_constant (&nelt
))
15668 machine_mode sel_mode
= mode_for_int_vector (d
->vmode
).require ();
15669 rtx sel
= vec_perm_indices_to_rtx (sel_mode
, d
->perm
);
15670 aarch64_expand_sve_vec_perm (d
->target
, d
->op0
, d
->op1
, sel
);
15675 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d
*d
)
15677 /* The pattern matching functions above are written to look for a small
15678 number to begin the sequence (0, 1, N/2). If we begin with an index
15679 from the second operand, we can swap the operands. */
15680 poly_int64 nelt
= d
->perm
.length ();
15681 if (known_ge (d
->perm
[0], nelt
))
15683 d
->perm
.rotate_inputs (1);
15684 std::swap (d
->op0
, d
->op1
);
15687 if ((d
->vec_flags
== VEC_ADVSIMD
15688 || d
->vec_flags
== VEC_SVE_DATA
15689 || d
->vec_flags
== VEC_SVE_PRED
)
15690 && known_gt (nelt
, 1))
15692 if (aarch64_evpc_rev_local (d
))
15694 else if (aarch64_evpc_rev_global (d
))
15696 else if (aarch64_evpc_ext (d
))
15698 else if (aarch64_evpc_dup (d
))
15700 else if (aarch64_evpc_zip (d
))
15702 else if (aarch64_evpc_uzp (d
))
15704 else if (aarch64_evpc_trn (d
))
15706 if (d
->vec_flags
== VEC_SVE_DATA
)
15707 return aarch64_evpc_sve_tbl (d
);
15708 else if (d
->vec_flags
== VEC_SVE_DATA
)
15709 return aarch64_evpc_tbl (d
);
15714 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
15717 aarch64_vectorize_vec_perm_const (machine_mode vmode
, rtx target
, rtx op0
,
15718 rtx op1
, const vec_perm_indices
&sel
)
15720 struct expand_vec_perm_d d
;
15722 /* Check whether the mask can be applied to a single vector. */
15723 if (op0
&& rtx_equal_p (op0
, op1
))
15724 d
.one_vector_p
= true;
15725 else if (sel
.all_from_input_p (0))
15727 d
.one_vector_p
= true;
15730 else if (sel
.all_from_input_p (1))
15732 d
.one_vector_p
= true;
15736 d
.one_vector_p
= false;
15738 d
.perm
.new_vector (sel
.encoding (), d
.one_vector_p
? 1 : 2,
15739 sel
.nelts_per_input ());
15741 d
.vec_flags
= aarch64_classify_vector_mode (d
.vmode
);
15745 d
.testing_p
= !target
;
15748 return aarch64_expand_vec_perm_const_1 (&d
);
15750 rtx_insn
*last
= get_last_insn ();
15751 bool ret
= aarch64_expand_vec_perm_const_1 (&d
);
15752 gcc_assert (last
== get_last_insn ());
15757 /* Generate a byte permute mask for a register of mode MODE,
15758 which has NUNITS units. */
15761 aarch64_reverse_mask (machine_mode mode
, unsigned int nunits
)
15763 /* We have to reverse each vector because we dont have
15764 a permuted load that can reverse-load according to ABI rules. */
15766 rtvec v
= rtvec_alloc (16);
15768 unsigned int usize
= GET_MODE_UNIT_SIZE (mode
);
15770 gcc_assert (BYTES_BIG_ENDIAN
);
15771 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode
));
15773 for (i
= 0; i
< nunits
; i
++)
15774 for (j
= 0; j
< usize
; j
++)
15775 RTVEC_ELT (v
, i
* usize
+ j
) = GEN_INT ((i
+ 1) * usize
- 1 - j
);
15776 mask
= gen_rtx_CONST_VECTOR (V16QImode
, v
);
15777 return force_reg (V16QImode
, mask
);
15780 /* Return true if X is a valid second operand for the SVE instruction
15781 that implements integer comparison OP_CODE. */
15784 aarch64_sve_cmp_operand_p (rtx_code op_code
, rtx x
)
15786 if (register_operand (x
, VOIDmode
))
15795 return aarch64_sve_cmp_immediate_p (x
, false);
15802 return aarch64_sve_cmp_immediate_p (x
, true);
15804 gcc_unreachable ();
15808 /* Use predicated SVE instructions to implement the equivalent of:
15812 given that PTRUE is an all-true predicate of the appropriate mode. */
15815 aarch64_emit_sve_ptrue_op (rtx target
, rtx ptrue
, rtx op
)
15817 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (target
),
15818 gen_rtvec (2, ptrue
, op
),
15819 UNSPEC_MERGE_PTRUE
);
15820 rtx_insn
*insn
= emit_set_insn (target
, unspec
);
15821 set_unique_reg_note (insn
, REG_EQUAL
, copy_rtx (op
));
15824 /* Likewise, but also clobber the condition codes. */
15827 aarch64_emit_sve_ptrue_op_cc (rtx target
, rtx ptrue
, rtx op
)
15829 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (target
),
15830 gen_rtvec (2, ptrue
, op
),
15831 UNSPEC_MERGE_PTRUE
);
15832 rtx_insn
*insn
= emit_insn (gen_set_clobber_cc (target
, unspec
));
15833 set_unique_reg_note (insn
, REG_EQUAL
, copy_rtx (op
));
15836 /* Return the UNSPEC_COND_* code for comparison CODE. */
15838 static unsigned int
15839 aarch64_unspec_cond_code (rtx_code code
)
15844 return UNSPEC_COND_NE
;
15846 return UNSPEC_COND_EQ
;
15848 return UNSPEC_COND_LT
;
15850 return UNSPEC_COND_GT
;
15852 return UNSPEC_COND_LE
;
15854 return UNSPEC_COND_GE
;
15856 gcc_unreachable ();
15862 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
15864 where <X> is the operation associated with comparison CODE. This form
15865 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
15866 semantics, such as when PRED might not be all-true and when comparing
15867 inactive lanes could have side effects. */
15870 aarch64_emit_sve_predicated_cond (rtx target
, rtx_code code
,
15871 rtx pred
, rtx op0
, rtx op1
)
15873 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (pred
),
15874 gen_rtvec (3, pred
, op0
, op1
),
15875 aarch64_unspec_cond_code (code
));
15876 emit_set_insn (target
, unspec
);
15879 /* Expand an SVE integer comparison using the SVE equivalent of:
15881 (set TARGET (CODE OP0 OP1)). */
15884 aarch64_expand_sve_vec_cmp_int (rtx target
, rtx_code code
, rtx op0
, rtx op1
)
15886 machine_mode pred_mode
= GET_MODE (target
);
15887 machine_mode data_mode
= GET_MODE (op0
);
15889 if (!aarch64_sve_cmp_operand_p (code
, op1
))
15890 op1
= force_reg (data_mode
, op1
);
15892 rtx ptrue
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
15893 rtx cond
= gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
);
15894 aarch64_emit_sve_ptrue_op_cc (target
, ptrue
, cond
);
15897 /* Emit the SVE equivalent of:
15899 (set TMP1 (CODE1 OP0 OP1))
15900 (set TMP2 (CODE2 OP0 OP1))
15901 (set TARGET (ior:PRED_MODE TMP1 TMP2))
15903 PTRUE is an all-true predicate with the same mode as TARGET. */
15906 aarch64_emit_sve_or_conds (rtx target
, rtx_code code1
, rtx_code code2
,
15907 rtx ptrue
, rtx op0
, rtx op1
)
15909 machine_mode pred_mode
= GET_MODE (ptrue
);
15910 rtx tmp1
= gen_reg_rtx (pred_mode
);
15911 aarch64_emit_sve_ptrue_op (tmp1
, ptrue
,
15912 gen_rtx_fmt_ee (code1
, pred_mode
, op0
, op1
));
15913 rtx tmp2
= gen_reg_rtx (pred_mode
);
15914 aarch64_emit_sve_ptrue_op (tmp2
, ptrue
,
15915 gen_rtx_fmt_ee (code2
, pred_mode
, op0
, op1
));
15916 aarch64_emit_binop (target
, ior_optab
, tmp1
, tmp2
);
15919 /* Emit the SVE equivalent of:
15921 (set TMP (CODE OP0 OP1))
15922 (set TARGET (not TMP))
15924 PTRUE is an all-true predicate with the same mode as TARGET. */
15927 aarch64_emit_sve_inverted_cond (rtx target
, rtx ptrue
, rtx_code code
,
15930 machine_mode pred_mode
= GET_MODE (ptrue
);
15931 rtx tmp
= gen_reg_rtx (pred_mode
);
15932 aarch64_emit_sve_ptrue_op (tmp
, ptrue
,
15933 gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
));
15934 aarch64_emit_unop (target
, one_cmpl_optab
, tmp
);
15937 /* Expand an SVE floating-point comparison using the SVE equivalent of:
15939 (set TARGET (CODE OP0 OP1))
15941 If CAN_INVERT_P is true, the caller can also handle inverted results;
15942 return true if the result is in fact inverted. */
15945 aarch64_expand_sve_vec_cmp_float (rtx target
, rtx_code code
,
15946 rtx op0
, rtx op1
, bool can_invert_p
)
15948 machine_mode pred_mode
= GET_MODE (target
);
15949 machine_mode data_mode
= GET_MODE (op0
);
15951 rtx ptrue
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
15955 /* UNORDERED has no immediate form. */
15956 op1
= force_reg (data_mode
, op1
);
15965 /* There is native support for the comparison. */
15966 rtx cond
= gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
);
15967 aarch64_emit_sve_ptrue_op (target
, ptrue
, cond
);
15972 /* This is a trapping operation (LT or GT). */
15973 aarch64_emit_sve_or_conds (target
, LT
, GT
, ptrue
, op0
, op1
);
15977 if (!flag_trapping_math
)
15979 /* This would trap for signaling NaNs. */
15980 op1
= force_reg (data_mode
, op1
);
15981 aarch64_emit_sve_or_conds (target
, UNORDERED
, EQ
, ptrue
, op0
, op1
);
15989 if (flag_trapping_math
)
15991 /* Work out which elements are ordered. */
15992 rtx ordered
= gen_reg_rtx (pred_mode
);
15993 op1
= force_reg (data_mode
, op1
);
15994 aarch64_emit_sve_inverted_cond (ordered
, ptrue
, UNORDERED
, op0
, op1
);
15996 /* Test the opposite condition for the ordered elements,
15997 then invert the result. */
16001 code
= reverse_condition_maybe_unordered (code
);
16004 aarch64_emit_sve_predicated_cond (target
, code
,
16005 ordered
, op0
, op1
);
16008 rtx tmp
= gen_reg_rtx (pred_mode
);
16009 aarch64_emit_sve_predicated_cond (tmp
, code
, ordered
, op0
, op1
);
16010 aarch64_emit_unop (target
, one_cmpl_optab
, tmp
);
16016 /* ORDERED has no immediate form. */
16017 op1
= force_reg (data_mode
, op1
);
16021 gcc_unreachable ();
16024 /* There is native support for the inverse comparison. */
16025 code
= reverse_condition_maybe_unordered (code
);
16028 rtx cond
= gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
);
16029 aarch64_emit_sve_ptrue_op (target
, ptrue
, cond
);
16032 aarch64_emit_sve_inverted_cond (target
, ptrue
, code
, op0
, op1
);
16036 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
16037 of the data being selected and CMP_MODE is the mode of the values being
16041 aarch64_expand_sve_vcond (machine_mode data_mode
, machine_mode cmp_mode
,
16044 machine_mode pred_mode
16045 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode
),
16046 GET_MODE_SIZE (cmp_mode
)).require ();
16047 rtx pred
= gen_reg_rtx (pred_mode
);
16048 if (FLOAT_MODE_P (cmp_mode
))
16050 if (aarch64_expand_sve_vec_cmp_float (pred
, GET_CODE (ops
[3]),
16051 ops
[4], ops
[5], true))
16052 std::swap (ops
[1], ops
[2]);
16055 aarch64_expand_sve_vec_cmp_int (pred
, GET_CODE (ops
[3]), ops
[4], ops
[5]);
16057 rtvec vec
= gen_rtvec (3, pred
, ops
[1], ops
[2]);
16058 emit_set_insn (ops
[0], gen_rtx_UNSPEC (data_mode
, vec
, UNSPEC_SEL
));
16061 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
16062 true. However due to issues with register allocation it is preferable
16063 to avoid tieing integer scalar and FP scalar modes. Executing integer
16064 operations in general registers is better than treating them as scalar
16065 vector operations. This reduces latency and avoids redundant int<->FP
16066 moves. So tie modes if they are either the same class, or vector modes
16067 with other vector modes, vector structs or any scalar mode. */
16070 aarch64_modes_tieable_p (machine_mode mode1
, machine_mode mode2
)
16072 if (GET_MODE_CLASS (mode1
) == GET_MODE_CLASS (mode2
))
16075 /* We specifically want to allow elements of "structure" modes to
16076 be tieable to the structure. This more general condition allows
16077 other rarer situations too. The reason we don't extend this to
16078 predicate modes is that there are no predicate structure modes
16079 nor any specific instructions for extracting part of a predicate
16081 if (aarch64_vector_data_mode_p (mode1
)
16082 && aarch64_vector_data_mode_p (mode2
))
16085 /* Also allow any scalar modes with vectors. */
16086 if (aarch64_vector_mode_supported_p (mode1
)
16087 || aarch64_vector_mode_supported_p (mode2
))
16093 /* Return a new RTX holding the result of moving POINTER forward by
16097 aarch64_move_pointer (rtx pointer
, poly_int64 amount
)
16099 rtx next
= plus_constant (Pmode
, XEXP (pointer
, 0), amount
);
16101 return adjust_automodify_address (pointer
, GET_MODE (pointer
),
16105 /* Return a new RTX holding the result of moving POINTER forward by the
16106 size of the mode it points to. */
16109 aarch64_progress_pointer (rtx pointer
)
16111 return aarch64_move_pointer (pointer
, GET_MODE_SIZE (GET_MODE (pointer
)));
16114 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
16118 aarch64_copy_one_block_and_progress_pointers (rtx
*src
, rtx
*dst
,
16121 rtx reg
= gen_reg_rtx (mode
);
16123 /* "Cast" the pointers to the correct mode. */
16124 *src
= adjust_address (*src
, mode
, 0);
16125 *dst
= adjust_address (*dst
, mode
, 0);
16126 /* Emit the memcpy. */
16127 emit_move_insn (reg
, *src
);
16128 emit_move_insn (*dst
, reg
);
16129 /* Move the pointers forward. */
16130 *src
= aarch64_progress_pointer (*src
);
16131 *dst
= aarch64_progress_pointer (*dst
);
16134 /* Expand movmem, as if from a __builtin_memcpy. Return true if
16135 we succeed, otherwise return false. */
16138 aarch64_expand_movmem (rtx
*operands
)
16141 rtx dst
= operands
[0];
16142 rtx src
= operands
[1];
16144 bool speed_p
= !optimize_function_for_size_p (cfun
);
16146 /* When optimizing for size, give a better estimate of the length of a
16147 memcpy call, but use the default otherwise. */
16148 unsigned int max_instructions
= (speed_p
? 15 : AARCH64_CALL_RATIO
) / 2;
16150 /* We can't do anything smart if the amount to copy is not constant. */
16151 if (!CONST_INT_P (operands
[2]))
16154 n
= UINTVAL (operands
[2]);
16156 /* Try to keep the number of instructions low. For cases below 16 bytes we
16157 need to make at most two moves. For cases above 16 bytes it will be one
16158 move for each 16 byte chunk, then at most two additional moves. */
16159 if (((n
/ 16) + (n
% 16 ? 2 : 0)) > max_instructions
)
16162 base
= copy_to_mode_reg (Pmode
, XEXP (dst
, 0));
16163 dst
= adjust_automodify_address (dst
, VOIDmode
, base
, 0);
16165 base
= copy_to_mode_reg (Pmode
, XEXP (src
, 0));
16166 src
= adjust_automodify_address (src
, VOIDmode
, base
, 0);
16168 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
16174 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, HImode
);
16179 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, QImode
);
16184 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
16185 4-byte chunk, partially overlapping with the previously copied chunk. */
16188 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
16194 src
= aarch64_move_pointer (src
, move
);
16195 dst
= aarch64_move_pointer (dst
, move
);
16196 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
16201 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
16202 them, then (if applicable) an 8-byte chunk. */
16207 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, TImode
);
16212 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, DImode
);
16217 /* Finish the final bytes of the copy. We can always do this in one
16218 instruction. We either copy the exact amount we need, or partially
16219 overlap with the previous chunk we copied and copy 8-bytes. */
16223 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, QImode
);
16225 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, HImode
);
16227 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
16232 src
= aarch64_move_pointer (src
, -1);
16233 dst
= aarch64_move_pointer (dst
, -1);
16234 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
16240 src
= aarch64_move_pointer (src
, move
);
16241 dst
= aarch64_move_pointer (dst
, move
);
16242 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, DImode
);
16249 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16250 SImode stores. Handle the case when the constant has identical
16251 bottom and top halves. This is beneficial when the two stores can be
16252 merged into an STP and we avoid synthesising potentially expensive
16253 immediates twice. Return true if such a split is possible. */
16256 aarch64_split_dimode_const_store (rtx dst
, rtx src
)
16258 rtx lo
= gen_lowpart (SImode
, src
);
16259 rtx hi
= gen_highpart_mode (SImode
, DImode
, src
);
16261 bool size_p
= optimize_function_for_size_p (cfun
);
16263 if (!rtx_equal_p (lo
, hi
))
16266 unsigned int orig_cost
16267 = aarch64_internal_mov_immediate (NULL_RTX
, src
, false, DImode
);
16268 unsigned int lo_cost
16269 = aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, SImode
);
16271 /* We want to transform:
16273 MOVK x1, 0x140, lsl 16
16274 MOVK x1, 0xc0da, lsl 32
16275 MOVK x1, 0x140, lsl 48
16279 MOVK w1, 0x140, lsl 16
16281 So we want to perform this only when we save two instructions
16282 or more. When optimizing for size, however, accept any code size
16284 if (size_p
&& orig_cost
<= lo_cost
)
16288 && (orig_cost
<= lo_cost
+ 1))
16291 rtx mem_lo
= adjust_address (dst
, SImode
, 0);
16292 if (!aarch64_mem_pair_operand (mem_lo
, SImode
))
16295 rtx tmp_reg
= gen_reg_rtx (SImode
);
16296 aarch64_expand_mov_immediate (tmp_reg
, lo
);
16297 rtx mem_hi
= aarch64_move_pointer (mem_lo
, GET_MODE_SIZE (SImode
));
16298 /* Don't emit an explicit store pair as this may not be always profitable.
16299 Let the sched-fusion logic decide whether to merge them. */
16300 emit_move_insn (mem_lo
, tmp_reg
);
16301 emit_move_insn (mem_hi
, tmp_reg
);
16306 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
16308 static unsigned HOST_WIDE_INT
16309 aarch64_asan_shadow_offset (void)
16311 return (HOST_WIDE_INT_1
<< 36);
16315 aarch64_gen_ccmp_first (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
,
16316 int code
, tree treeop0
, tree treeop1
)
16318 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
16320 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
16322 struct expand_operand ops
[4];
16325 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
16327 op_mode
= GET_MODE (op0
);
16328 if (op_mode
== VOIDmode
)
16329 op_mode
= GET_MODE (op1
);
16337 icode
= CODE_FOR_cmpsi
;
16342 icode
= CODE_FOR_cmpdi
;
16347 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
16348 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpesf
: CODE_FOR_fcmpsf
;
16353 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
16354 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpedf
: CODE_FOR_fcmpdf
;
16362 op0
= prepare_operand (icode
, op0
, 0, op_mode
, cmp_mode
, unsignedp
);
16363 op1
= prepare_operand (icode
, op1
, 1, op_mode
, cmp_mode
, unsignedp
);
16369 *prep_seq
= get_insns ();
16372 create_fixed_operand (&ops
[0], op0
);
16373 create_fixed_operand (&ops
[1], op1
);
16376 if (!maybe_expand_insn (icode
, 2, ops
))
16381 *gen_seq
= get_insns ();
16384 return gen_rtx_fmt_ee ((rtx_code
) code
, cc_mode
,
16385 gen_rtx_REG (cc_mode
, CC_REGNUM
), const0_rtx
);
16389 aarch64_gen_ccmp_next (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
, rtx prev
,
16390 int cmp_code
, tree treeop0
, tree treeop1
, int bit_code
)
16392 rtx op0
, op1
, target
;
16393 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
16394 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
16396 struct expand_operand ops
[6];
16399 push_to_sequence (*prep_seq
);
16400 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
16402 op_mode
= GET_MODE (op0
);
16403 if (op_mode
== VOIDmode
)
16404 op_mode
= GET_MODE (op1
);
16412 icode
= CODE_FOR_ccmpsi
;
16417 icode
= CODE_FOR_ccmpdi
;
16422 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
16423 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpesf
: CODE_FOR_fccmpsf
;
16428 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
16429 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpedf
: CODE_FOR_fccmpdf
;
16437 op0
= prepare_operand (icode
, op0
, 2, op_mode
, cmp_mode
, unsignedp
);
16438 op1
= prepare_operand (icode
, op1
, 3, op_mode
, cmp_mode
, unsignedp
);
16444 *prep_seq
= get_insns ();
16447 target
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
16448 aarch64_cond
= aarch64_get_condition_code_1 (cc_mode
, (rtx_code
) cmp_code
);
16450 if (bit_code
!= AND
)
16452 prev
= gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev
),
16453 GET_MODE (XEXP (prev
, 0))),
16454 VOIDmode
, XEXP (prev
, 0), const0_rtx
);
16455 aarch64_cond
= AARCH64_INVERSE_CONDITION_CODE (aarch64_cond
);
16458 create_fixed_operand (&ops
[0], XEXP (prev
, 0));
16459 create_fixed_operand (&ops
[1], target
);
16460 create_fixed_operand (&ops
[2], op0
);
16461 create_fixed_operand (&ops
[3], op1
);
16462 create_fixed_operand (&ops
[4], prev
);
16463 create_fixed_operand (&ops
[5], GEN_INT (aarch64_cond
));
16465 push_to_sequence (*gen_seq
);
16466 if (!maybe_expand_insn (icode
, 6, ops
))
16472 *gen_seq
= get_insns ();
16475 return gen_rtx_fmt_ee ((rtx_code
) cmp_code
, VOIDmode
, target
, const0_rtx
);
16478 #undef TARGET_GEN_CCMP_FIRST
16479 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16481 #undef TARGET_GEN_CCMP_NEXT
16482 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16484 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
16485 instruction fusion of some sort. */
16488 aarch64_macro_fusion_p (void)
16490 return aarch64_tune_params
.fusible_ops
!= AARCH64_FUSE_NOTHING
;
16494 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
16495 should be kept together during scheduling. */
16498 aarch_macro_fusion_pair_p (rtx_insn
*prev
, rtx_insn
*curr
)
16501 rtx prev_set
= single_set (prev
);
16502 rtx curr_set
= single_set (curr
);
16503 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
16504 bool simple_sets_p
= prev_set
&& curr_set
&& !any_condjump_p (curr
);
16506 if (!aarch64_macro_fusion_p ())
16509 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK
))
16511 /* We are trying to match:
16512 prev (mov) == (set (reg r0) (const_int imm16))
16513 curr (movk) == (set (zero_extract (reg r0)
16516 (const_int imm16_1)) */
16518 set_dest
= SET_DEST (curr_set
);
16520 if (GET_CODE (set_dest
) == ZERO_EXTRACT
16521 && CONST_INT_P (SET_SRC (curr_set
))
16522 && CONST_INT_P (SET_SRC (prev_set
))
16523 && CONST_INT_P (XEXP (set_dest
, 2))
16524 && INTVAL (XEXP (set_dest
, 2)) == 16
16525 && REG_P (XEXP (set_dest
, 0))
16526 && REG_P (SET_DEST (prev_set
))
16527 && REGNO (XEXP (set_dest
, 0)) == REGNO (SET_DEST (prev_set
)))
16533 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD
))
16536 /* We're trying to match:
16537 prev (adrp) == (set (reg r1)
16538 (high (symbol_ref ("SYM"))))
16539 curr (add) == (set (reg r0)
16541 (symbol_ref ("SYM"))))
16542 Note that r0 need not necessarily be the same as r1, especially
16543 during pre-regalloc scheduling. */
16545 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
16546 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
16548 if (GET_CODE (SET_SRC (curr_set
)) == LO_SUM
16549 && REG_P (XEXP (SET_SRC (curr_set
), 0))
16550 && REGNO (XEXP (SET_SRC (curr_set
), 0))
16551 == REGNO (SET_DEST (prev_set
))
16552 && rtx_equal_p (XEXP (SET_SRC (prev_set
), 0),
16553 XEXP (SET_SRC (curr_set
), 1)))
16558 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK
))
16561 /* We're trying to match:
16562 prev (movk) == (set (zero_extract (reg r0)
16565 (const_int imm16_1))
16566 curr (movk) == (set (zero_extract (reg r0)
16569 (const_int imm16_2)) */
16571 if (GET_CODE (SET_DEST (prev_set
)) == ZERO_EXTRACT
16572 && GET_CODE (SET_DEST (curr_set
)) == ZERO_EXTRACT
16573 && REG_P (XEXP (SET_DEST (prev_set
), 0))
16574 && REG_P (XEXP (SET_DEST (curr_set
), 0))
16575 && REGNO (XEXP (SET_DEST (prev_set
), 0))
16576 == REGNO (XEXP (SET_DEST (curr_set
), 0))
16577 && CONST_INT_P (XEXP (SET_DEST (prev_set
), 2))
16578 && CONST_INT_P (XEXP (SET_DEST (curr_set
), 2))
16579 && INTVAL (XEXP (SET_DEST (prev_set
), 2)) == 32
16580 && INTVAL (XEXP (SET_DEST (curr_set
), 2)) == 48
16581 && CONST_INT_P (SET_SRC (prev_set
))
16582 && CONST_INT_P (SET_SRC (curr_set
)))
16586 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR
))
16588 /* We're trying to match:
16589 prev (adrp) == (set (reg r0)
16590 (high (symbol_ref ("SYM"))))
16591 curr (ldr) == (set (reg r1)
16592 (mem (lo_sum (reg r0)
16593 (symbol_ref ("SYM")))))
16595 curr (ldr) == (set (reg r1)
16598 (symbol_ref ("SYM")))))) */
16599 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
16600 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
16602 rtx curr_src
= SET_SRC (curr_set
);
16604 if (GET_CODE (curr_src
) == ZERO_EXTEND
)
16605 curr_src
= XEXP (curr_src
, 0);
16607 if (MEM_P (curr_src
) && GET_CODE (XEXP (curr_src
, 0)) == LO_SUM
16608 && REG_P (XEXP (XEXP (curr_src
, 0), 0))
16609 && REGNO (XEXP (XEXP (curr_src
, 0), 0))
16610 == REGNO (SET_DEST (prev_set
))
16611 && rtx_equal_p (XEXP (XEXP (curr_src
, 0), 1),
16612 XEXP (SET_SRC (prev_set
), 0)))
16617 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC
)
16618 && aarch_crypto_can_dual_issue (prev
, curr
))
16621 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH
)
16622 && any_condjump_p (curr
))
16624 enum attr_type prev_type
= get_attr_type (prev
);
16626 unsigned int condreg1
, condreg2
;
16628 aarch64_fixed_condition_code_regs (&condreg1
, &condreg2
);
16629 cc_reg_1
= gen_rtx_REG (CCmode
, condreg1
);
16631 if (reg_referenced_p (cc_reg_1
, PATTERN (curr
))
16633 && modified_in_p (cc_reg_1
, prev
))
16635 /* FIXME: this misses some which is considered simple arthematic
16636 instructions for ThunderX. Simple shifts are missed here. */
16637 if (prev_type
== TYPE_ALUS_SREG
16638 || prev_type
== TYPE_ALUS_IMM
16639 || prev_type
== TYPE_LOGICS_REG
16640 || prev_type
== TYPE_LOGICS_IMM
)
16647 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH
)
16648 && any_condjump_p (curr
))
16650 /* We're trying to match:
16651 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16652 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
16654 (label_ref ("SYM"))
16656 if (SET_DEST (curr_set
) == (pc_rtx
)
16657 && GET_CODE (SET_SRC (curr_set
)) == IF_THEN_ELSE
16658 && REG_P (XEXP (XEXP (SET_SRC (curr_set
), 0), 0))
16659 && REG_P (SET_DEST (prev_set
))
16660 && REGNO (SET_DEST (prev_set
))
16661 == REGNO (XEXP (XEXP (SET_SRC (curr_set
), 0), 0)))
16663 /* Fuse ALU operations followed by conditional branch instruction. */
16664 switch (get_attr_type (prev
))
16667 case TYPE_ALU_SREG
:
16670 case TYPE_ADCS_REG
:
16671 case TYPE_ADCS_IMM
:
16672 case TYPE_LOGIC_REG
:
16673 case TYPE_LOGIC_IMM
:
16677 case TYPE_SHIFT_REG
:
16678 case TYPE_SHIFT_IMM
:
16693 /* Return true iff the instruction fusion described by OP is enabled. */
16696 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op
)
16698 return (aarch64_tune_params
.fusible_ops
& op
) != 0;
16701 /* If MEM is in the form of [base+offset], extract the two parts
16702 of address and set to BASE and OFFSET, otherwise return false
16703 after clearing BASE and OFFSET. */
16706 extract_base_offset_in_addr (rtx mem
, rtx
*base
, rtx
*offset
)
16710 gcc_assert (MEM_P (mem
));
16712 addr
= XEXP (mem
, 0);
16717 *offset
= const0_rtx
;
16721 if (GET_CODE (addr
) == PLUS
16722 && REG_P (XEXP (addr
, 0)) && CONST_INT_P (XEXP (addr
, 1)))
16724 *base
= XEXP (addr
, 0);
16725 *offset
= XEXP (addr
, 1);
16730 *offset
= NULL_RTX
;
16735 /* Types for scheduling fusion. */
16736 enum sched_fusion_type
16738 SCHED_FUSION_NONE
= 0,
16739 SCHED_FUSION_LD_SIGN_EXTEND
,
16740 SCHED_FUSION_LD_ZERO_EXTEND
,
16746 /* If INSN is a load or store of address in the form of [base+offset],
16747 extract the two parts and set to BASE and OFFSET. Return scheduling
16748 fusion type this INSN is. */
16750 static enum sched_fusion_type
16751 fusion_load_store (rtx_insn
*insn
, rtx
*base
, rtx
*offset
)
16754 enum sched_fusion_type fusion
= SCHED_FUSION_LD
;
16756 gcc_assert (INSN_P (insn
));
16757 x
= PATTERN (insn
);
16758 if (GET_CODE (x
) != SET
)
16759 return SCHED_FUSION_NONE
;
16762 dest
= SET_DEST (x
);
16764 machine_mode dest_mode
= GET_MODE (dest
);
16766 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode
))
16767 return SCHED_FUSION_NONE
;
16769 if (GET_CODE (src
) == SIGN_EXTEND
)
16771 fusion
= SCHED_FUSION_LD_SIGN_EXTEND
;
16772 src
= XEXP (src
, 0);
16773 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
16774 return SCHED_FUSION_NONE
;
16776 else if (GET_CODE (src
) == ZERO_EXTEND
)
16778 fusion
= SCHED_FUSION_LD_ZERO_EXTEND
;
16779 src
= XEXP (src
, 0);
16780 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
16781 return SCHED_FUSION_NONE
;
16784 if (GET_CODE (src
) == MEM
&& REG_P (dest
))
16785 extract_base_offset_in_addr (src
, base
, offset
);
16786 else if (GET_CODE (dest
) == MEM
&& (REG_P (src
) || src
== const0_rtx
))
16788 fusion
= SCHED_FUSION_ST
;
16789 extract_base_offset_in_addr (dest
, base
, offset
);
16792 return SCHED_FUSION_NONE
;
16794 if (*base
== NULL_RTX
|| *offset
== NULL_RTX
)
16795 fusion
= SCHED_FUSION_NONE
;
16800 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16802 Currently we only support to fuse ldr or str instructions, so FUSION_PRI
16803 and PRI are only calculated for these instructions. For other instruction,
16804 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
16805 type instruction fusion can be added by returning different priorities.
16807 It's important that irrelevant instructions get the largest FUSION_PRI. */
16810 aarch64_sched_fusion_priority (rtx_insn
*insn
, int max_pri
,
16811 int *fusion_pri
, int *pri
)
16815 enum sched_fusion_type fusion
;
16817 gcc_assert (INSN_P (insn
));
16820 fusion
= fusion_load_store (insn
, &base
, &offset
);
16821 if (fusion
== SCHED_FUSION_NONE
)
16828 /* Set FUSION_PRI according to fusion type and base register. */
16829 *fusion_pri
= tmp
- fusion
* FIRST_PSEUDO_REGISTER
- REGNO (base
);
16831 /* Calculate PRI. */
16834 /* INSN with smaller offset goes first. */
16835 off_val
= (int)(INTVAL (offset
));
16837 tmp
-= (off_val
& 0xfffff);
16839 tmp
+= ((- off_val
) & 0xfffff);
16845 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16846 Adjust priority of sha1h instructions so they are scheduled before
16847 other SHA1 instructions. */
16850 aarch64_sched_adjust_priority (rtx_insn
*insn
, int priority
)
16852 rtx x
= PATTERN (insn
);
16854 if (GET_CODE (x
) == SET
)
16858 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_SHA1H
)
16859 return priority
+ 10;
16865 /* Given OPERANDS of consecutive load/store, check if we can merge
16866 them into ldp/stp. LOAD is true if they are load instructions.
16867 MODE is the mode of memory operands. */
16870 aarch64_operands_ok_for_ldpstp (rtx
*operands
, bool load
,
16873 HOST_WIDE_INT offval_1
, offval_2
, msize
;
16874 enum reg_class rclass_1
, rclass_2
;
16875 rtx mem_1
, mem_2
, reg_1
, reg_2
, base_1
, base_2
, offset_1
, offset_2
;
16879 mem_1
= operands
[1];
16880 mem_2
= operands
[3];
16881 reg_1
= operands
[0];
16882 reg_2
= operands
[2];
16883 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
));
16884 if (REGNO (reg_1
) == REGNO (reg_2
))
16889 mem_1
= operands
[0];
16890 mem_2
= operands
[2];
16891 reg_1
= operands
[1];
16892 reg_2
= operands
[3];
16895 /* The mems cannot be volatile. */
16896 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
))
16899 /* If we have SImode and slow unaligned ldp,
16900 check the alignment to be at least 8 byte. */
16902 && (aarch64_tune_params
.extra_tuning_flags
16903 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
16905 && MEM_ALIGN (mem_1
) < 8 * BITS_PER_UNIT
)
16908 /* Check if the addresses are in the form of [base+offset]. */
16909 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
16910 if (base_1
== NULL_RTX
|| offset_1
== NULL_RTX
)
16912 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
16913 if (base_2
== NULL_RTX
|| offset_2
== NULL_RTX
)
16916 /* Check if the bases are same. */
16917 if (!rtx_equal_p (base_1
, base_2
))
16920 /* The operands must be of the same size. */
16921 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1
)),
16922 GET_MODE_SIZE (GET_MODE (mem_2
))));
16924 offval_1
= INTVAL (offset_1
);
16925 offval_2
= INTVAL (offset_2
);
16926 /* We should only be trying this for fixed-sized modes. There is no
16927 SVE LDP/STP instruction. */
16928 msize
= GET_MODE_SIZE (mode
).to_constant ();
16929 /* Check if the offsets are consecutive. */
16930 if (offval_1
!= (offval_2
+ msize
) && offval_2
!= (offval_1
+ msize
))
16933 /* Check if the addresses are clobbered by load. */
16936 if (reg_mentioned_p (reg_1
, mem_1
))
16939 /* In increasing order, the last load can clobber the address. */
16940 if (offval_1
> offval_2
&& reg_mentioned_p (reg_2
, mem_2
))
16944 /* One of the memory accesses must be a mempair operand.
16945 If it is not the first one, they need to be swapped by the
16947 if (!aarch64_mem_pair_operand (mem_1
, GET_MODE (mem_1
))
16948 && !aarch64_mem_pair_operand (mem_2
, GET_MODE (mem_2
)))
16951 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
16952 rclass_1
= FP_REGS
;
16954 rclass_1
= GENERAL_REGS
;
16956 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
16957 rclass_2
= FP_REGS
;
16959 rclass_2
= GENERAL_REGS
;
16961 /* Check if the registers are of same class. */
16962 if (rclass_1
!= rclass_2
)
16968 /* Given OPERANDS of consecutive load/store that can be merged,
16969 swap them if they are not in ascending order. */
16971 aarch64_swap_ldrstr_operands (rtx
* operands
, bool load
)
16973 rtx mem_1
, mem_2
, base_1
, base_2
, offset_1
, offset_2
;
16974 HOST_WIDE_INT offval_1
, offval_2
;
16978 mem_1
= operands
[1];
16979 mem_2
= operands
[3];
16983 mem_1
= operands
[0];
16984 mem_2
= operands
[2];
16987 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
16988 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
16990 offval_1
= INTVAL (offset_1
);
16991 offval_2
= INTVAL (offset_2
);
16993 if (offval_1
> offval_2
)
16995 /* Irrespective of whether this is a load or a store,
16996 we do the same swap. */
16997 std::swap (operands
[0], operands
[2]);
16998 std::swap (operands
[1], operands
[3]);
17002 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
17003 comparison between the two. */
17005 aarch64_host_wide_int_compare (const void *x
, const void *y
)
17007 return wi::cmps (* ((const HOST_WIDE_INT
*) x
),
17008 * ((const HOST_WIDE_INT
*) y
));
17011 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
17012 other pointing to a REG rtx containing an offset, compare the offsets
17017 1 iff offset (X) > offset (Y)
17018 0 iff offset (X) == offset (Y)
17019 -1 iff offset (X) < offset (Y) */
17021 aarch64_ldrstr_offset_compare (const void *x
, const void *y
)
17023 const rtx
* operands_1
= (const rtx
*) x
;
17024 const rtx
* operands_2
= (const rtx
*) y
;
17025 rtx mem_1
, mem_2
, base
, offset_1
, offset_2
;
17027 if (MEM_P (operands_1
[0]))
17028 mem_1
= operands_1
[0];
17030 mem_1
= operands_1
[1];
17032 if (MEM_P (operands_2
[0]))
17033 mem_2
= operands_2
[0];
17035 mem_2
= operands_2
[1];
17037 /* Extract the offsets. */
17038 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
17039 extract_base_offset_in_addr (mem_2
, &base
, &offset_2
);
17041 gcc_assert (offset_1
!= NULL_RTX
&& offset_2
!= NULL_RTX
);
17043 return wi::cmps (INTVAL (offset_1
), INTVAL (offset_2
));
/* Given OPERANDS of consecutive load/store, check if we can merge
   them into ldp/stp by adjusting the offset.  LOAD is true if they
   are load instructions.  MODE is the mode of memory operands.

   Given below consecutive stores:

     str  w1, [xb, 0x100]
     str  w1, [xb, 0x104]
     str  w1, [xb, 0x108]
     str  w1, [xb, 0x10c]

   Though the offsets are out of the range supported by stp, we can
   still pair them after adjusting the offset, like:

     add  scratch, xb, 0x100
     stp  w1, w1, [scratch]
     stp  w1, w1, [scratch, 0x8]

   The peephole patterns detecting this opportunity should guarantee
   the scratch register is available.  */

bool
aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
                                       scalar_mode mode)
{
  enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
  HOST_WIDE_INT offvals[4], msize;
  rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
  rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;

  if (load)
    {
      reg_1 = operands[0];
      mem_1 = operands[1];
      reg_2 = operands[2];
      mem_2 = operands[3];
      reg_3 = operands[4];
      mem_3 = operands[5];
      reg_4 = operands[6];
      mem_4 = operands[7];
      gcc_assert (REG_P (reg_1) && REG_P (reg_2)
                  && REG_P (reg_3) && REG_P (reg_4));

      /* Do not attempt to merge the loads if the loads clobber each other.  */
      for (int i = 0; i < 8; i += 2)
        for (int j = i + 2; j < 8; j += 2)
          if (reg_overlap_mentioned_p (operands[i], operands[j]))
            return false;
    }
  else
    {
      mem_1 = operands[0];
      reg_1 = operands[1];
      mem_2 = operands[2];
      reg_2 = operands[3];
      mem_3 = operands[4];
      reg_3 = operands[5];
      mem_4 = operands[6];
      reg_4 = operands[7];
    }

  /* Skip if memory operand is by itself valid for ldp/stp.  */
  if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
    return false;

  /* The mems cannot be volatile.  */
  if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
      || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
    return false;

  /* Check if the addresses are in the form of [base+offset].  */
  extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
  if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
    return false;
  extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
  if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
    return false;
  extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
  if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
    return false;
  extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
  if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
    return false;

  /* Check if the bases are the same.  */
  if (!rtx_equal_p (base_1, base_2)
      || !rtx_equal_p (base_2, base_3)
      || !rtx_equal_p (base_3, base_4))
    return false;

  offvals[0] = INTVAL (offset_1);
  offvals[1] = INTVAL (offset_2);
  offvals[2] = INTVAL (offset_3);
  offvals[3] = INTVAL (offset_4);
  msize = GET_MODE_SIZE (mode);

  /* Check if the offsets can be put in the right order to do a ldp/stp.  */
  qsort (offvals, 4, sizeof (HOST_WIDE_INT), aarch64_host_wide_int_compare);

  if (!(offvals[1] == offvals[0] + msize
        && offvals[3] == offvals[2] + msize))
    return false;

  /* Check that the offsets are within range of each other.  The ldp/stp
     instructions have 7 bit immediate offsets, so use 0x80.  */
  if (offvals[2] - offvals[0] >= msize * 0x80)
    return false;

  /* The offsets must be aligned with respect to each other.  */
  if (offvals[0] % msize != offvals[2] % msize)
    return false;

  /* Check if the addresses are clobbered by load.  */
  if (load && (reg_mentioned_p (reg_1, mem_1)
               || reg_mentioned_p (reg_2, mem_2)
               || reg_mentioned_p (reg_3, mem_3)
               || reg_mentioned_p (reg_4, mem_4)))
    return false;

  /* If we have SImode and slow unaligned ldp,
     check the alignment to be at least 8 bytes.  */
  if (mode == SImode
      && (aarch64_tune_params.extra_tuning_flags
          & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
      && !optimize_size
      && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
    return false;

  if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
    rclass_1 = FP_REGS;
  else
    rclass_1 = GENERAL_REGS;

  if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
    rclass_2 = FP_REGS;
  else
    rclass_2 = GENERAL_REGS;

  if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
    rclass_3 = FP_REGS;
  else
    rclass_3 = GENERAL_REGS;

  if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
    rclass_4 = FP_REGS;
  else
    rclass_4 = GENERAL_REGS;

  /* Check if the registers are of the same class.  */
  if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
    return false;

  return true;
}
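/* Worked example for the checks above, assuming SImode operands
   (msize == 4): offsets { 0x104, 0x100, 0x10c, 0x108 } sort to
   { 0x100, 0x104, 0x108, 0x10c }; 0x104 == 0x100 + 4 and
   0x10c == 0x108 + 4, so the operands form two adjacent pairs, and
   0x108 - 0x100 == 8 < 4 * 0x80 == 512, so both pairs can reach a
   common anchor within the scaled 7-bit ldp/stp offset range.  */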
/* Given OPERANDS of consecutive load/store, this function pairs them
   into LDP/STP after adjusting the offset.  It depends on the fact
   that the operands can be sorted so the offsets are correct for STP.
   MODE is the mode of memory operands.  CODE is the rtl operator
   which should be applied to all memory operands, it's SIGN_EXTEND,
   ZERO_EXTEND or UNKNOWN.  */

bool
aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
                             scalar_mode mode, RTX_CODE code)
{
  rtx base, offset_1, offset_3, t1, t2;
  rtx mem_1, mem_2, mem_3, mem_4;
  rtx temp_operands[8];
  HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
                stp_off_upper_limit, stp_off_lower_limit, msize;

  /* We make changes on a copy as we may still bail out.  */
  for (int i = 0; i < 8; i++)
    temp_operands[i] = operands[i];

  /* Sort the operands.  */
  qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);

  if (load)
    {
      mem_1 = temp_operands[1];
      mem_2 = temp_operands[3];
      mem_3 = temp_operands[5];
      mem_4 = temp_operands[7];
    }
  else
    {
      mem_1 = temp_operands[0];
      mem_2 = temp_operands[2];
      mem_3 = temp_operands[4];
      mem_4 = temp_operands[6];
      gcc_assert (code == UNKNOWN);
    }

  extract_base_offset_in_addr (mem_1, &base, &offset_1);
  extract_base_offset_in_addr (mem_3, &base, &offset_3);
  gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
              && offset_3 != NULL_RTX);

  /* Adjust offset so it can fit in LDP/STP instruction.  */
  msize = GET_MODE_SIZE (mode);
  stp_off_upper_limit = msize * (0x40 - 1);
  stp_off_lower_limit = - msize * 0x40;

  off_val_1 = INTVAL (offset_1);
  off_val_3 = INTVAL (offset_3);

  /* The base offset is optimally half way between the two STP/LDP offsets.  */
  if (msize <= 4)
    base_off = (off_val_1 + off_val_3) / 2;
  else
    /* However, due to issues with negative LDP/STP offset generation for
       larger modes, for DF, DI and vector modes, we must not use negative
       addresses smaller than 9 signed unadjusted bits can store.  This
       provides the most range in this case.  */
    base_off = off_val_1;

  /* Adjust the base so that it is aligned with the addresses but still
     optimal.  */
  if (base_off % msize != off_val_1 % msize)
    /* Fix the offset, bearing in mind we want to make it bigger not
       smaller.  */
    base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
  else if (msize <= 4)
    /* The negative range of LDP/STP is one larger than the positive range.  */
    base_off += msize;

  /* Check if base offset is too big or too small.  We can attempt to resolve
     this issue by setting it to the maximum value and seeing if the offsets
     still fit.  */
  if (base_off >= 0x1000)
    {
      base_off = 0x1000 - 1;
      /* We must still make sure that the base offset is aligned with respect
         to the address.  But it may not be made any bigger.  */
      base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
    }

  /* Likewise for the case where the base is too small.  */
  if (base_off <= -0x1000)
    {
      base_off = -0x1000 + 1;
      base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
    }

  /* Offset of the first STP/LDP.  */
  new_off_1 = off_val_1 - base_off;

  /* Offset of the second STP/LDP.  */
  new_off_3 = off_val_3 - base_off;

  /* The offsets must be within the range of the LDP/STP instructions.  */
  if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
      || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
    return false;

  replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
                                                  new_off_1), true);
  replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
                                                  new_off_1 + msize), true);
  replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
                                                  new_off_3), true);
  replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
                                                  new_off_3 + msize), true);

  if (!aarch64_mem_pair_operand (mem_1, mode)
      || !aarch64_mem_pair_operand (mem_3, mode))
    return false;

  if (code == ZERO_EXTEND)
    {
      mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
      mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
      mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
      mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
    }
  else if (code == SIGN_EXTEND)
    {
      mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
      mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
      mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
      mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
    }

  if (load)
    {
      operands[0] = temp_operands[0];
      operands[1] = mem_1;
      operands[2] = temp_operands[2];
      operands[3] = mem_2;
      operands[4] = temp_operands[4];
      operands[5] = mem_3;
      operands[6] = temp_operands[6];
      operands[7] = mem_4;
    }
  else
    {
      operands[0] = mem_1;
      operands[1] = temp_operands[1];
      operands[2] = mem_2;
      operands[3] = temp_operands[3];
      operands[4] = mem_3;
      operands[5] = temp_operands[5];
      operands[6] = mem_4;
      operands[7] = temp_operands[7];
    }

  /* Emit adjusting instruction.  */
  emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
  /* Emit ldp/stp instructions.  */
  t1 = gen_rtx_SET (operands[0], operands[1]);
  t2 = gen_rtx_SET (operands[2], operands[3]);
  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
  t1 = gen_rtx_SET (operands[4], operands[5]);
  t2 = gen_rtx_SET (operands[6], operands[7]);
  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
  return true;
}
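/* Worked example for the adjustment above, using the four str instructions
   from the comment before aarch64_operands_adjust_ok_for_ldpstp (SImode,
   msize == 4, off_val_1 == 0x100, off_val_3 == 0x108): base_off starts as
   (0x100 + 0x108) / 2 == 0x104; it is already aligned with off_val_1, so the
   msize <= 4 branch nudges it to 0x108 to use the larger negative range.
   That gives new_off_1 == -8 and new_off_3 == 0, both within
   [-0x100, 0xfc], so the emitted sequence is:

     add  scratch, xb, 0x108
     stp  w1, w1, [scratch, -8]
     stp  w1, w1, [scratch]  */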
/* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.  Assume for now that
   it isn't worth branching around empty masked ops (including masked
   stores).  */

static bool
aarch64_empty_mask_is_expensive (unsigned)
{
  return false;
}
/* Return 1 if pseudo register should be created and used to hold
   GOT address for PIC code.  */

static bool
aarch64_use_pseudo_pic_reg (void)
{
  return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
}
/* Implement TARGET_UNSPEC_MAY_TRAP_P.  */

static int
aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
{
  switch (XINT (x, 1))
    {
    case UNSPEC_GOTSMALLPIC:
    case UNSPEC_GOTSMALLPIC28K:
    case UNSPEC_GOTTINYPIC:
      return 0;
    default:
      break;
    }

  return default_unspec_may_trap_p (x, flags);
}
/* If X is a positive CONST_DOUBLE with a value that is a power of 2
   return the log2 of that value.  Otherwise return -1.  */

int
aarch64_fpconst_pow_of_2 (rtx x)
{
  const REAL_VALUE_TYPE *r;

  if (!CONST_DOUBLE_P (x))
    return -1;

  r = CONST_DOUBLE_REAL_VALUE (x);

  if (REAL_VALUE_NEGATIVE (*r)
      || REAL_VALUE_ISNAN (*r)
      || REAL_VALUE_ISINF (*r)
      || !real_isinteger (r, DFmode))
    return -1;

  return exact_log2 (real_to_integer (r));
}
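/* For example, the function above returns 2 for a CONST_DOUBLE of 4.0 and
   5 for 32.0, but -1 for 0.5 (not an integer), 3.0 (not a power of two)
   and -4.0 (negative).  */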
/* If X is a vector of equal CONST_DOUBLE values and that value is
   Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */

int
aarch64_vec_fpconst_pow_of_2 (rtx x)
{
  int nelts;
  if (GET_CODE (x) != CONST_VECTOR
      || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
    return -1;

  if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
    return -1;

  int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
  if (firstval <= 0)
    return -1;

  for (int i = 1; i < nelts; i++)
    if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
      return -1;

  return firstval;
}
/* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
   to float.

   __fp16 always promotes through this hook.
   _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
   through the generic excess precision logic rather than here.  */

static tree
aarch64_promoted_type (const_tree t)
{
  if (SCALAR_FLOAT_TYPE_P (t)
      && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
    return float_type_node;

  return NULL_TREE;
}
/* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */

static bool
aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
                           optimization_type opt_type)
{
  switch (op)
    {
    case rsqrt_optab:
      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);

    default:
      return true;
    }
}
/* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook.  */

static unsigned int
aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
                                        int *offset)
{
  /* Polynomial invariant 1 == (VG / 2) - 1.  */
  gcc_assert (i == 1);
  *factor = 2;
  *offset = 1;
  return AARCH64_DWARF_VG;
}
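/* For illustration: on a 256-bit SVE implementation VG (the number of 64-bit
   granules per vector) is 4, so polynomial indeterminate 1 evaluates to
   (4 / 2) - 1 == 1, and a VNx16QImode value spans 16 + 16 * 1 == 32 bytes.  */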
/* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
   if MODE is HFmode, and punt to the generic implementation otherwise.  */

static bool
aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
{
  return (mode == HFmode
          ? true
          : default_libgcc_floating_mode_supported_p (mode));
}
/* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
   if MODE is HFmode, and punt to the generic implementation otherwise.  */

static bool
aarch64_scalar_mode_supported_p (scalar_mode mode)
{
  return (mode == HFmode
          ? true
          : default_scalar_mode_supported_p (mode));
}
/* Set the value of FLT_EVAL_METHOD.
   ISO/IEC TS 18661-3 defines two values that we'd like to make use of:

    0: evaluate all operations and constants, whose semantic type has at
       most the range and precision of type float, to the range and
       precision of float; evaluate all other operations and constants to
       the range and precision of the semantic type;

    N, where _FloatN is a supported interchange floating type
       evaluate all operations and constants, whose semantic type has at
       most the range and precision of _FloatN type, to the range and
       precision of the _FloatN type; evaluate all other operations and
       constants to the range and precision of the semantic type;

   If we have the ARMv8.2-A extensions then we support _Float16 in native
   precision, so we should set this to 16.  Otherwise, we support the type,
   but want to evaluate expressions in float precision, so set this to
   0.  */

static enum flt_eval_method
aarch64_excess_precision (enum excess_precision_type type)
{
  switch (type)
    {
      case EXCESS_PRECISION_TYPE_FAST:
      case EXCESS_PRECISION_TYPE_STANDARD:
        /* We can calculate either in 16-bit range and precision or
           32-bit range and precision.  Make that decision based on whether
           we have native support for the ARMv8.2-A 16-bit floating-point
           instructions or not.  */
        return (TARGET_FP_F16INST
                ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
                : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
      case EXCESS_PRECISION_TYPE_IMPLICIT:
        return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
      default:
        gcc_unreachable ();
    }
  return FLT_EVAL_METHOD_UNPREDICTABLE;
}
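/* For illustration, consider the user-level fragment below (hypothetical
   user code, not part of this file):

     __fp16 a, b, c;
     __fp16 d = a * b + c;

   When TARGET_FP_F16INST holds (e.g. -march=armv8.2-a+fp16), the hook above
   selects FLT_EVAL_METHOD 16 and the intermediate a * b + c is evaluated in
   _Float16 range and precision; otherwise it is evaluated in float and only
   the final assignment converts back to __fp16.  */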
/* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
   scheduled for speculative execution.  Reject the long-running division
   and square-root instructions.  */

static bool
aarch64_sched_can_speculate_insn (rtx_insn *insn)
{
  switch (get_attr_type (insn))
    {
      case TYPE_SDIV:
      case TYPE_UDIV:
      case TYPE_FDIVS:
      case TYPE_FDIVD:
      case TYPE_FSQRTS:
      case TYPE_FSQRTD:
      case TYPE_NEON_FP_SQRT_S:
      case TYPE_NEON_FP_SQRT_D:
      case TYPE_NEON_FP_SQRT_S_Q:
      case TYPE_NEON_FP_SQRT_D_Q:
      case TYPE_NEON_FP_DIV_S:
      case TYPE_NEON_FP_DIV_D:
      case TYPE_NEON_FP_DIV_S_Q:
      case TYPE_NEON_FP_DIV_D_Q:
        return false;
      default:
        return true;
    }
}
/* Implement TARGET_COMPUTE_PRESSURE_CLASSES.  */

static int
aarch64_compute_pressure_classes (reg_class *classes)
{
  int i = 0;
  classes[i++] = GENERAL_REGS;
  classes[i++] = FP_REGS;
  /* PR_REGS isn't a useful pressure class because many predicate pseudo
     registers need to go in PR_LO_REGS at some point during their
     lifetime.  Splitting it into two halves has the effect of making
     all predicates count against PR_LO_REGS, so that we try whenever
     possible to restrict the number of live predicates to 8.  This
     greatly reduces the amount of spilling in certain loops.  */
  classes[i++] = PR_LO_REGS;
  classes[i++] = PR_HI_REGS;
  return i;
}
/* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */

static bool
aarch64_can_change_mode_class (machine_mode from,
                               machine_mode to, reg_class_t)
{
  if (BYTES_BIG_ENDIAN)
    {
      bool from_sve_p = aarch64_sve_data_mode_p (from);
      bool to_sve_p = aarch64_sve_data_mode_p (to);

      /* Don't allow changes between SVE data modes and non-SVE modes.
         See the comment at the head of aarch64-sve.md for details.  */
      if (from_sve_p != to_sve_p)
        return false;

      /* Don't allow changes in element size: lane 0 of the new vector
         would not then be lane 0 of the old vector.  See the comment
         above aarch64_maybe_expand_sve_subreg_move for a more detailed
         description.

         In the worst case, this forces a register to be spilled in
         one mode and reloaded in the other, which handles the
         endianness correctly.  */
      if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
        return false;
    }
  return true;
}
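/* For example, on big-endian a subreg that reinterprets a VNx4SImode
   register as VNx8HImode changes the element size from 4 bytes to 2, so
   the hook above rejects the mode change and, as the comment notes, the
   value goes through a spill and reload instead.  */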
/* Implement TARGET_EARLY_REMAT_MODES.  */

static void
aarch64_select_early_remat_modes (sbitmap modes)
{
  /* SVE values are not normally live across a call, so it should be
     worth doing early rematerialization even in VL-specific mode.  */
  for (int i = 0; i < NUM_MACHINE_MODES; ++i)
    {
      machine_mode mode = (machine_mode) i;
      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
      if (vec_flags & VEC_ANY_SVE)
        bitmap_set_bit (modes, i);
    }
}
/* Target-specific selftests.  */

#if CHECKING_P

namespace selftest {

/* Selftest for the RTL loader.
   Verify that the RTL loader copes with a dump from
   print_rtx_function.  This is essentially just a test that class
   function_reader can handle a real dump, but it also verifies
   that lookup_reg_by_dump_name correctly handles hard regs.
   The presence of hard reg names in the dump means that the test is
   target-specific, hence it is in this file.  */

static void
aarch64_test_loading_full_dump ()
{
  rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));

  ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));

  rtx_insn *insn_1 = get_insn_by_uid (1);
  ASSERT_EQ (NOTE, GET_CODE (insn_1));

  rtx_insn *insn_15 = get_insn_by_uid (15);
  ASSERT_EQ (INSN, GET_CODE (insn_15));
  ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));

  /* Verify crtl->return_rtx.  */
  ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
  ASSERT_EQ (0, REGNO (crtl->return_rtx));
  ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
}

/* Run all target-specific selftests.  */

static void
aarch64_run_selftests (void)
{
  aarch64_test_loading_full_dump ();
}

} // namespace selftest

#endif /* #if CHECKING_P */
17699 #undef TARGET_ADDRESS_COST
17700 #define TARGET_ADDRESS_COST aarch64_address_cost
17702 /* This hook determines whether unnamed bitfields affect the alignment
17703 of the containing structure. The hook returns true if the structure
17704 should inherit the alignment requirements of an unnamed bitfield's type. */
17706 #undef TARGET_ALIGN_ANON_BITFIELD
17707 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
17709 #undef TARGET_ASM_ALIGNED_DI_OP
17710 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
17712 #undef TARGET_ASM_ALIGNED_HI_OP
17713 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
17715 #undef TARGET_ASM_ALIGNED_SI_OP
17716 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
17718 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
17719 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
17720 hook_bool_const_tree_hwi_hwi_const_tree_true
17722 #undef TARGET_ASM_FILE_START
17723 #define TARGET_ASM_FILE_START aarch64_start_file
17725 #undef TARGET_ASM_OUTPUT_MI_THUNK
17726 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
17728 #undef TARGET_ASM_SELECT_RTX_SECTION
17729 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
17731 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
17732 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
17734 #undef TARGET_BUILD_BUILTIN_VA_LIST
17735 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
17737 #undef TARGET_CALLEE_COPIES
17738 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
17740 #undef TARGET_CAN_ELIMINATE
17741 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
17743 #undef TARGET_CAN_INLINE_P
17744 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
17746 #undef TARGET_CANNOT_FORCE_CONST_MEM
17747 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
17749 #undef TARGET_CASE_VALUES_THRESHOLD
17750 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
17752 #undef TARGET_CONDITIONAL_REGISTER_USAGE
17753 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
17755 /* Only the least significant bit is used for initialization guard variables. */
17757 #undef TARGET_CXX_GUARD_MASK_BIT
17758 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
17760 #undef TARGET_C_MODE_FOR_SUFFIX
17761 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
17763 #ifdef TARGET_BIG_ENDIAN_DEFAULT
17764 #undef TARGET_DEFAULT_TARGET_FLAGS
17765 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
#endif
17768 #undef TARGET_CLASS_MAX_NREGS
17769 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
17771 #undef TARGET_BUILTIN_DECL
17772 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
17774 #undef TARGET_BUILTIN_RECIPROCAL
17775 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
17777 #undef TARGET_C_EXCESS_PRECISION
17778 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
17780 #undef TARGET_EXPAND_BUILTIN
17781 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
17783 #undef TARGET_EXPAND_BUILTIN_VA_START
17784 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
17786 #undef TARGET_FOLD_BUILTIN
17787 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
17789 #undef TARGET_FUNCTION_ARG
17790 #define TARGET_FUNCTION_ARG aarch64_function_arg
17792 #undef TARGET_FUNCTION_ARG_ADVANCE
17793 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
17795 #undef TARGET_FUNCTION_ARG_BOUNDARY
17796 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
17798 #undef TARGET_FUNCTION_ARG_PADDING
17799 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
17801 #undef TARGET_GET_RAW_RESULT_MODE
17802 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
17803 #undef TARGET_GET_RAW_ARG_MODE
17804 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
17806 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
17807 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
17809 #undef TARGET_FUNCTION_VALUE
17810 #define TARGET_FUNCTION_VALUE aarch64_function_value
17812 #undef TARGET_FUNCTION_VALUE_REGNO_P
17813 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
17815 #undef TARGET_GIMPLE_FOLD_BUILTIN
17816 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
17818 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
17819 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
17821 #undef TARGET_INIT_BUILTINS
17822 #define TARGET_INIT_BUILTINS aarch64_init_builtins
17824 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
17825 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
17826 aarch64_ira_change_pseudo_allocno_class
17828 #undef TARGET_LEGITIMATE_ADDRESS_P
17829 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
17831 #undef TARGET_LEGITIMATE_CONSTANT_P
17832 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
17834 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
17835 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
17836 aarch64_legitimize_address_displacement
17838 #undef TARGET_LIBGCC_CMP_RETURN_MODE
17839 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
17841 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
17842 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
17843 aarch64_libgcc_floating_mode_supported_p
17845 #undef TARGET_MANGLE_TYPE
17846 #define TARGET_MANGLE_TYPE aarch64_mangle_type
17848 #undef TARGET_MEMORY_MOVE_COST
17849 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
17851 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
17852 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
17854 #undef TARGET_MUST_PASS_IN_STACK
17855 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
17857 /* This target hook should return true if accesses to volatile bitfields
17858 should use the narrowest mode possible. It should return false if these
17859 accesses should use the bitfield container type. */
17860 #undef TARGET_NARROW_VOLATILE_BITFIELD
17861 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
17863 #undef TARGET_OPTION_OVERRIDE
17864 #define TARGET_OPTION_OVERRIDE aarch64_override_options
17866 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
17867 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
17868 aarch64_override_options_after_change
17870 #undef TARGET_OPTION_SAVE
17871 #define TARGET_OPTION_SAVE aarch64_option_save
17873 #undef TARGET_OPTION_RESTORE
17874 #define TARGET_OPTION_RESTORE aarch64_option_restore
17876 #undef TARGET_OPTION_PRINT
17877 #define TARGET_OPTION_PRINT aarch64_option_print
17879 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
17880 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
17882 #undef TARGET_SET_CURRENT_FUNCTION
17883 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
17885 #undef TARGET_PASS_BY_REFERENCE
17886 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
17888 #undef TARGET_PREFERRED_RELOAD_CLASS
17889 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
17891 #undef TARGET_SCHED_REASSOCIATION_WIDTH
17892 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
17894 #undef TARGET_PROMOTED_TYPE
17895 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
17897 #undef TARGET_SECONDARY_RELOAD
17898 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
17900 #undef TARGET_SHIFT_TRUNCATION_MASK
17901 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
17903 #undef TARGET_SETUP_INCOMING_VARARGS
17904 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
17906 #undef TARGET_STRUCT_VALUE_RTX
17907 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
17909 #undef TARGET_REGISTER_MOVE_COST
17910 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
17912 #undef TARGET_RETURN_IN_MEMORY
17913 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
17915 #undef TARGET_RETURN_IN_MSB
17916 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
17918 #undef TARGET_RTX_COSTS
17919 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
17921 #undef TARGET_SCALAR_MODE_SUPPORTED_P
17922 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
17924 #undef TARGET_SCHED_ISSUE_RATE
17925 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
17927 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
17928 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
17929 aarch64_sched_first_cycle_multipass_dfa_lookahead
17931 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
17932 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
17933 aarch64_first_cycle_multipass_dfa_lookahead_guard
17935 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
17936 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
17937 aarch64_get_separate_components
17939 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
17940 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
17941 aarch64_components_for_bb
17943 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
17944 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
17945 aarch64_disqualify_components
17947 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
17948 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
17949 aarch64_emit_prologue_components
17951 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
17952 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
17953 aarch64_emit_epilogue_components
17955 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
17956 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
17957 aarch64_set_handled_components
17959 #undef TARGET_TRAMPOLINE_INIT
17960 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
17962 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
17963 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
17965 #undef TARGET_VECTOR_MODE_SUPPORTED_P
17966 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
17968 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
17969 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
17970 aarch64_builtin_support_vector_misalignment
17972 #undef TARGET_ARRAY_MODE
17973 #define TARGET_ARRAY_MODE aarch64_array_mode
17975 #undef TARGET_ARRAY_MODE_SUPPORTED_P
17976 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
17978 #undef TARGET_VECTORIZE_ADD_STMT_COST
17979 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
17981 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
17982 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
17983 aarch64_builtin_vectorization_cost
17985 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
17986 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
17988 #undef TARGET_VECTORIZE_BUILTINS
17989 #define TARGET_VECTORIZE_BUILTINS
17991 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
17992 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
17993 aarch64_builtin_vectorized_function
17995 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
17996 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
17997 aarch64_autovectorize_vector_sizes
17999 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
18000 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
18001 aarch64_atomic_assign_expand_fenv
18003 /* Section anchor support. */
18005 #undef TARGET_MIN_ANCHOR_OFFSET
18006 #define TARGET_MIN_ANCHOR_OFFSET -256
18008 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
18009 byte offset; we can do much more for larger data types, but have no way
18010 to determine the size of the access. We assume accesses are aligned. */
18011 #undef TARGET_MAX_ANCHOR_OFFSET
18012 #define TARGET_MAX_ANCHOR_OFFSET 4095
18014 #undef TARGET_VECTOR_ALIGNMENT
18015 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
18017 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
18018 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
18019 aarch64_vectorize_preferred_vector_alignment
18020 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
18021 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
18022 aarch64_simd_vector_alignment_reachable
18024 /* vec_perm support. */
18026 #undef TARGET_VECTORIZE_VEC_PERM_CONST
18027 #define TARGET_VECTORIZE_VEC_PERM_CONST \
18028 aarch64_vectorize_vec_perm_const
18030 #undef TARGET_VECTORIZE_GET_MASK_MODE
18031 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
18032 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
18033 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
18034 aarch64_empty_mask_is_expensive
18036 #undef TARGET_INIT_LIBFUNCS
18037 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
18039 #undef TARGET_FIXED_CONDITION_CODE_REGS
18040 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
18042 #undef TARGET_FLAGS_REGNUM
18043 #define TARGET_FLAGS_REGNUM CC_REGNUM
18045 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
18046 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
18048 #undef TARGET_ASAN_SHADOW_OFFSET
18049 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
18051 #undef TARGET_LEGITIMIZE_ADDRESS
18052 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
18054 #undef TARGET_SCHED_CAN_SPECULATE_INSN
18055 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
18057 #undef TARGET_CAN_USE_DOLOOP_P
18058 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
18060 #undef TARGET_SCHED_ADJUST_PRIORITY
18061 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
18063 #undef TARGET_SCHED_MACRO_FUSION_P
18064 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
18066 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
18067 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
18069 #undef TARGET_SCHED_FUSION_PRIORITY
18070 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
18072 #undef TARGET_UNSPEC_MAY_TRAP_P
18073 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
18075 #undef TARGET_USE_PSEUDO_PIC_REG
18076 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
18078 #undef TARGET_PRINT_OPERAND
18079 #define TARGET_PRINT_OPERAND aarch64_print_operand
18081 #undef TARGET_PRINT_OPERAND_ADDRESS
18082 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
18084 #undef TARGET_OPTAB_SUPPORTED_P
18085 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
18087 #undef TARGET_OMIT_STRUCT_RETURN_REG
18088 #define TARGET_OMIT_STRUCT_RETURN_REG true
18090 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
18091 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
18092 aarch64_dwarf_poly_indeterminate_value
18094 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
18095 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
18096 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
18098 #undef TARGET_HARD_REGNO_NREGS
18099 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
18100 #undef TARGET_HARD_REGNO_MODE_OK
18101 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
18103 #undef TARGET_MODES_TIEABLE_P
18104 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
18106 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
18107 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
18108 aarch64_hard_regno_call_part_clobbered
18110 #undef TARGET_CONSTANT_ALIGNMENT
18111 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
18113 #undef TARGET_COMPUTE_PRESSURE_CLASSES
18114 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
18116 #undef TARGET_CAN_CHANGE_MODE_CLASS
18117 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
18119 #undef TARGET_SELECT_EARLY_REMAT_MODES
18120 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
#if CHECKING_P
18123 #undef TARGET_RUN_TARGET_SELFTESTS
18124 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
18125 #endif /* #if CHECKING_P */
18127 struct gcc_target targetm = TARGET_INITIALIZER;
18129 #include "gt-aarch64.h"