/* Machine description for AArch64 architecture.
   Copyright (C) 2009-2020 Free Software Foundation, Inc.
   Contributed by ARM Ltd.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#define INCLUDE_STRING
#include "coretypes.h"
#include "stringpool.h"
#include "diagnostic.h"
#include "insn-attr.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "langhooks.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "aarch64-cost-tables.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "target-globals.h"
#include "common/common-target.h"
#include "selftest-rtl.h"
#include "rtx-vector-builder.h"
#include "function-abi.h"

/* This file should be included last.  */
#include "target-def.h"

/* Defined for convenience.  */
#define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
/* Information about a legitimate vector immediate operand.  */
struct simd_immediate_info
{
  enum insn_type { MOV, MVN, INDEX, PTRUE };
  enum modifier_type { LSL, MSL };

  simd_immediate_info () {}
  simd_immediate_info (scalar_float_mode, rtx);
  simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
		       insn_type = MOV, modifier_type = LSL,
		       unsigned int = 0);
  simd_immediate_info (scalar_mode, rtx, rtx);
  simd_immediate_info (scalar_int_mode, aarch64_svpattern);

  /* The mode of the elements.  */
  scalar_mode elt_mode;

  /* The instruction to use to move the immediate into a vector.  */
  insn_type insn;

  union
  {
    /* For MOV and MVN.  */
    struct
    {
      /* The value of each element.  */
      rtx value;

      /* The kind of shift modifier to use, and the number of bits to shift.
	 This is (LSL, 0) if no shift is needed.  */
      modifier_type modifier;
      unsigned int shift;
    } mov;

    /* For INDEX.  */
    struct
    {
      /* The value of the first element and the step to be added for each
	 subsequent element.  */
      rtx base, step;
    } index;

    /* For PTRUE.  */
    aarch64_svpattern pattern;
  } u;
};
/* Construct a floating-point immediate in which each element has mode
   ELT_MODE_IN and value VALUE_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
  : elt_mode (elt_mode_in), insn (MOV)
{
  u.mov.value = value_in;
  u.mov.modifier = LSL;
}

/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and value VALUE_IN.  The other parameters are as for the structure
   fields.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
		       unsigned HOST_WIDE_INT value_in,
		       insn_type insn_in, modifier_type modifier_in,
		       unsigned int shift_in)
  : elt_mode (elt_mode_in), insn (insn_in)
{
  u.mov.value = gen_int_mode (value_in, elt_mode_in);
  u.mov.modifier = modifier_in;
  u.mov.shift = shift_in;
}

/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and where element I is equal to BASE_IN + I * STEP_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
  : elt_mode (elt_mode_in), insn (INDEX)
{
  u.index.base = base_in;
  u.index.step = step_in;
}

/* Construct a predicate that controls elements of mode ELT_MODE_IN
   and has PTRUE pattern PATTERN_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
		       aarch64_svpattern pattern_in)
  : elt_mode (elt_mode_in), insn (PTRUE)
{
  u.pattern = pattern_in;
}
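
/* Illustrative sketch, not part of the original file: roughly how a caller
   such as the immediate-validation code might fill in one of these
   descriptions.  The helper name below is hypothetical and exists only to
   show which constructor corresponds to which kind of immediate.  */
static void
aarch64_example_describe_immediates (simd_immediate_info *info)
{
  /* Each byte is 0x55, moved with a plain MOV and no shift (the MOV/LSL
     defaults).  */
  *info = simd_immediate_info (QImode, 0x55);

  /* Each halfword is built from 0xff with an LSL #8 modifier and an
     inverted (MVN-style) move.  */
  *info = simd_immediate_info (HImode, 0xff, simd_immediate_info::MVN,
			       simd_immediate_info::LSL, 8);
}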
/* The current code model.  */
enum aarch64_code_model aarch64_cmodel;

/* The number of 64-bit elements in an SVE vector.  */
poly_uint16 aarch64_sve_vg;

#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS 1
static bool aarch64_composite_type_p (const_tree, machine_mode);
static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
						      const_tree,
						      machine_mode *, int *,
						      bool *);
static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_override_options_after_change (void);
static bool aarch64_vector_mode_supported_p (machine_mode);
static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
							 const_tree type,
							 int misalignment,
							 bool is_packed);
static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
					    aarch64_addr_query_type);
static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
/* Major revision number of the ARM Architecture implemented by the target.  */
unsigned aarch64_architecture_version;

/* The processor for which instructions should be scheduled.  */
enum aarch64_processor aarch64_tune = cortexa53;

/* Mask to specify which instruction scheduling options should be used.  */
uint64_t aarch64_tune_flags = 0;

/* Global flag for PC relative loads.  */
bool aarch64_pcrelative_literal_loads;

/* Global flag for whether frame pointer is enabled.  */
bool aarch64_use_frame_pointer;

#define BRANCH_PROTECT_STR_MAX 255
char *accepted_branch_protection_string = NULL;
static enum aarch64_parse_opt_result
aarch64_parse_branch_protection (const char*, char**);

/* Support for command line parsing of boolean flags in the tuning
   structures.  */
struct aarch64_flag_desc
{
  const char* name;
  uint64_t flag;
};

#define AARCH64_FUSION_PAIR(name, internal_name) \
  { name, AARCH64_FUSE_##internal_name },
static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
{
  { "none", AARCH64_FUSE_NOTHING },
#include "aarch64-fusion-pairs.def"
  { "all", AARCH64_FUSE_ALL },
  { NULL, AARCH64_FUSE_NOTHING }
};

#define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
  { name, AARCH64_EXTRA_TUNE_##internal_name },
static const struct aarch64_flag_desc aarch64_tuning_flags[] =
{
  { "none", AARCH64_EXTRA_TUNE_NONE },
#include "aarch64-tuning-flags.def"
  { "all", AARCH64_EXTRA_TUNE_ALL },
  { NULL, AARCH64_EXTRA_TUNE_NONE }
};
/* Tuning parameters.  */

static const struct cpu_addrcost_table generic_addrcost_table =
{
  0, /* register_offset */
  0, /* register_sextend */
  0, /* register_zextend */
};

static const struct cpu_addrcost_table exynosm1_addrcost_table =
{
  1, /* register_offset */
  1, /* register_sextend */
  2, /* register_zextend */
};

static const struct cpu_addrcost_table xgene1_addrcost_table =
{
  0, /* register_offset */
  1, /* register_sextend */
  1, /* register_zextend */
};

static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
{
  2, /* register_offset */
  3, /* register_sextend */
  3, /* register_zextend */
};

static const struct cpu_addrcost_table tsv110_addrcost_table =
{
  0, /* register_offset */
  1, /* register_sextend */
  1, /* register_zextend */
};

static const struct cpu_addrcost_table qdf24xx_addrcost_table =
{
  3, /* register_offset */
  3, /* register_sextend */
  3, /* register_zextend */
};
static const struct cpu_regmove_cost generic_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost cortexa57_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost cortexa53_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost exynosm1_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost (actual, 4 and 9).  */
};

static const struct cpu_regmove_cost thunderx_regmove_cost =
{
};

static const struct cpu_regmove_cost xgene1_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};

static const struct cpu_regmove_cost qdf24xx_regmove_cost =
{
  /* Avoid the use of int<->fp moves for spilling.  */
};

static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
{
  /* Avoid the use of int<->fp moves for spilling.  */
};

static const struct cpu_regmove_cost tsv110_regmove_cost =
{
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
};
/* Generic costs for vector insn classes.  */
static const struct cpu_vector_cost generic_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  1, /* scalar_load_cost */
  1, /* scalar_store_cost */
  1, /* vec_int_stmt_cost */
  1, /* vec_fp_stmt_cost */
  2, /* vec_permute_cost */
  2, /* vec_to_scalar_cost */
  1, /* scalar_to_vec_cost */
  1, /* vec_align_load_cost */
  1, /* vec_unalign_load_cost */
  1, /* vec_unalign_store_cost */
  1, /* vec_store_cost */
  3, /* cond_taken_branch_cost */
  1  /* cond_not_taken_branch_cost */
};

/* QDF24XX costs for vector insn classes.  */
static const struct cpu_vector_cost qdf24xx_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  1, /* scalar_load_cost */
  1, /* scalar_store_cost */
  1, /* vec_int_stmt_cost */
  3, /* vec_fp_stmt_cost */
  2, /* vec_permute_cost */
  1, /* vec_to_scalar_cost */
  1, /* scalar_to_vec_cost */
  1, /* vec_align_load_cost */
  1, /* vec_unalign_load_cost */
  1, /* vec_unalign_store_cost */
  1, /* vec_store_cost */
  3, /* cond_taken_branch_cost */
  1  /* cond_not_taken_branch_cost */
};

/* ThunderX costs for vector insn classes.  */
static const struct cpu_vector_cost thunderx_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  3, /* scalar_load_cost */
  1, /* scalar_store_cost */
  4, /* vec_int_stmt_cost */
  1, /* vec_fp_stmt_cost */
  4, /* vec_permute_cost */
  2, /* vec_to_scalar_cost */
  2, /* scalar_to_vec_cost */
  3, /* vec_align_load_cost */
  5, /* vec_unalign_load_cost */
  5, /* vec_unalign_store_cost */
  1, /* vec_store_cost */
  3, /* cond_taken_branch_cost */
  3  /* cond_not_taken_branch_cost */
};

static const struct cpu_vector_cost tsv110_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  5, /* scalar_load_cost */
  1, /* scalar_store_cost */
  2, /* vec_int_stmt_cost */
  2, /* vec_fp_stmt_cost */
  2, /* vec_permute_cost */
  3, /* vec_to_scalar_cost */
  2, /* scalar_to_vec_cost */
  5, /* vec_align_load_cost */
  5, /* vec_unalign_load_cost */
  1, /* vec_unalign_store_cost */
  1, /* vec_store_cost */
  1, /* cond_taken_branch_cost */
  1  /* cond_not_taken_branch_cost */
};
/* Cortex-A57 costs for vector insn classes.  */
static const struct cpu_vector_cost cortexa57_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  4, /* scalar_load_cost */
  1, /* scalar_store_cost */
  2, /* vec_int_stmt_cost */
  2, /* vec_fp_stmt_cost */
  3, /* vec_permute_cost */
  8, /* vec_to_scalar_cost */
  8, /* scalar_to_vec_cost */
  4, /* vec_align_load_cost */
  4, /* vec_unalign_load_cost */
  1, /* vec_unalign_store_cost */
  1, /* vec_store_cost */
  1, /* cond_taken_branch_cost */
  1  /* cond_not_taken_branch_cost */
};

static const struct cpu_vector_cost exynosm1_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  5, /* scalar_load_cost */
  1, /* scalar_store_cost */
  3, /* vec_int_stmt_cost */
  3, /* vec_fp_stmt_cost */
  3, /* vec_permute_cost */
  3, /* vec_to_scalar_cost */
  3, /* scalar_to_vec_cost */
  5, /* vec_align_load_cost */
  5, /* vec_unalign_load_cost */
  1, /* vec_unalign_store_cost */
  1, /* vec_store_cost */
  1, /* cond_taken_branch_cost */
  1  /* cond_not_taken_branch_cost */
};

/* X-Gene 1 costs for vector insn classes.  */
static const struct cpu_vector_cost xgene1_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  1, /* scalar_fp_stmt_cost */
  5, /* scalar_load_cost */
  1, /* scalar_store_cost */
  2, /* vec_int_stmt_cost */
  2, /* vec_fp_stmt_cost */
  2, /* vec_permute_cost */
  4, /* vec_to_scalar_cost */
  4, /* scalar_to_vec_cost */
  10, /* vec_align_load_cost */
  10, /* vec_unalign_load_cost */
  2, /* vec_unalign_store_cost */
  2, /* vec_store_cost */
  2, /* cond_taken_branch_cost */
  1  /* cond_not_taken_branch_cost */
};

/* Costs for vector insn classes for Vulcan.  */
static const struct cpu_vector_cost thunderx2t99_vector_cost =
{
  1, /* scalar_int_stmt_cost */
  6, /* scalar_fp_stmt_cost */
  4, /* scalar_load_cost */
  1, /* scalar_store_cost */
  5, /* vec_int_stmt_cost */
  6, /* vec_fp_stmt_cost */
  10, /* vec_permute_cost */
  6, /* vec_to_scalar_cost */
  5, /* scalar_to_vec_cost */
  8, /* vec_align_load_cost */
  8, /* vec_unalign_load_cost */
  4, /* vec_unalign_store_cost */
  4, /* vec_store_cost */
  2, /* cond_taken_branch_cost */
  1  /* cond_not_taken_branch_cost */
};
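
/* Illustrative sketch, not part of the original file: the vectorizer cost
   hooks read the tables above through the currently selected tuning.  The
   helper below is hypothetical and assumes the tune_params::vec_costs
   pointer used elsewhere in this file; it only shows how a single field,
   the unaligned vector load cost, would be looked up.  */
static int
aarch64_example_unaligned_load_cost (void)
{
  const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
  return costs->vec_unalign_load_cost;
}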
/* Generic costs for branch instructions.  */
static const struct cpu_branch_cost generic_branch_cost =
{
  1,  /* Predictable.  */
  3   /* Unpredictable.  */
};
/* Generic approximation modes.  */
static const cpu_approx_modes generic_approx_modes =
{
  AARCH64_APPROX_NONE,	/* division */
  AARCH64_APPROX_NONE,	/* sqrt */
  AARCH64_APPROX_NONE	/* recip_sqrt */
};

/* Approximation modes for Exynos M1.  */
static const cpu_approx_modes exynosm1_approx_modes =
{
  AARCH64_APPROX_NONE,	/* division */
  AARCH64_APPROX_ALL,	/* sqrt */
  AARCH64_APPROX_ALL	/* recip_sqrt */
};

/* Approximation modes for X-Gene 1.  */
static const cpu_approx_modes xgene1_approx_modes =
{
  AARCH64_APPROX_NONE,	/* division */
  AARCH64_APPROX_NONE,	/* sqrt */
  AARCH64_APPROX_ALL	/* recip_sqrt */
};
/* Generic prefetch settings (which disable prefetch).  */
static const cpu_prefetch_tune generic_prefetch_tune =
{
  -1,		/* l1_cache_size */
  -1,		/* l1_cache_line_size */
  -1,		/* l2_cache_size */
  true,		/* prefetch_dynamic_strides */
  -1,		/* minimum_stride */
  -1		/* default_opt_level */
};

static const cpu_prefetch_tune exynosm1_prefetch_tune =
{
  -1,		/* l1_cache_size */
  64,		/* l1_cache_line_size */
  -1,		/* l2_cache_size */
  true,		/* prefetch_dynamic_strides */
  -1,		/* minimum_stride */
  -1		/* default_opt_level */
};

static const cpu_prefetch_tune qdf24xx_prefetch_tune =
{
  32,		/* l1_cache_size */
  64,		/* l1_cache_line_size */
  512,		/* l2_cache_size */
  false,	/* prefetch_dynamic_strides */
  2048,		/* minimum_stride */
  3		/* default_opt_level */
};

static const cpu_prefetch_tune thunderxt88_prefetch_tune =
{
  32,		/* l1_cache_size */
  128,		/* l1_cache_line_size */
  16*1024,	/* l2_cache_size */
  true,		/* prefetch_dynamic_strides */
  -1,		/* minimum_stride */
  3		/* default_opt_level */
};

static const cpu_prefetch_tune thunderx_prefetch_tune =
{
  32,		/* l1_cache_size */
  128,		/* l1_cache_line_size */
  -1,		/* l2_cache_size */
  true,		/* prefetch_dynamic_strides */
  -1,		/* minimum_stride */
  -1		/* default_opt_level */
};

static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
{
  32,		/* l1_cache_size */
  64,		/* l1_cache_line_size */
  256,		/* l2_cache_size */
  true,		/* prefetch_dynamic_strides */
  -1,		/* minimum_stride */
  -1		/* default_opt_level */
};

static const cpu_prefetch_tune tsv110_prefetch_tune =
{
  64,		/* l1_cache_size */
  64,		/* l1_cache_line_size */
  512,		/* l2_cache_size */
  true,		/* prefetch_dynamic_strides */
  -1,		/* minimum_stride */
  -1		/* default_opt_level */
};

static const cpu_prefetch_tune xgene1_prefetch_tune =
{
  32,		/* l1_cache_size */
  64,		/* l1_cache_line_size */
  256,		/* l2_cache_size */
  true,		/* prefetch_dynamic_strides */
  -1,		/* minimum_stride */
  -1		/* default_opt_level */
};
static const struct tune_params generic_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
  "16:12",	/* function_align.  */
  "4",		/* jump_align.  */
  "8",		/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa35_tunings =
{
  &cortexa53_extra_costs,
  &generic_addrcost_table,
  &cortexa53_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
  "16",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa53_tunings =
{
  &cortexa53_extra_costs,
  &generic_addrcost_table,
  &cortexa53_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
  "16",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};
static const struct tune_params cortexa57_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
  "16",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa72_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
  "16",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa73_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost.  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
  "16",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};
static const struct tune_params exynosm1_tunings =
{
  &exynosm1_extra_costs,
  &exynosm1_addrcost_table,
  &exynosm1_regmove_cost,
  &exynosm1_vector_cost,
  &generic_branch_cost,
  &exynosm1_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
  "4",	/* function_align.  */
  "4",	/* jump_align.  */
  "4",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  48,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &exynosm1_prefetch_tune
};

static const struct tune_params thunderxt88_tunings =
{
  &thunderx_extra_costs,
  &generic_addrcost_table,
  &thunderx_regmove_cost,
  &thunderx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
  "8",	/* function_align.  */
  "8",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW),	/* tune_flags.  */
  &thunderxt88_prefetch_tune
};

static const struct tune_params thunderx_tunings =
{
  &thunderx_extra_costs,
  &generic_addrcost_table,
  &thunderx_regmove_cost,
  &thunderx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
  "8",	/* function_align.  */
  "8",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
   | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
  &thunderx_prefetch_tune
};
static const struct tune_params tsv110_tunings =
{
  &tsv110_extra_costs,
  &tsv110_addrcost_table,
  &tsv110_regmove_cost,
  &tsv110_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
  "16",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &tsv110_prefetch_tune
};

static const struct tune_params xgene1_tunings =
{
  &xgene1_extra_costs,
  &xgene1_addrcost_table,
  &xgene1_regmove_cost,
  &xgene1_vector_cost,
  &generic_branch_cost,
  &xgene1_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  AARCH64_FUSE_NOTHING, /* fusible_ops */
  "16",	/* function_align.  */
  "16",	/* jump_align.  */
  "16",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  17,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
  &xgene1_prefetch_tune
};
static const struct tune_params emag_tunings =
{
  &xgene1_extra_costs,
  &xgene1_addrcost_table,
  &xgene1_regmove_cost,
  &xgene1_vector_cost,
  &generic_branch_cost,
  &xgene1_approx_modes,
  SVE_NOT_IMPLEMENTED,
  6, /* memmov_cost */
  AARCH64_FUSE_NOTHING, /* fusible_ops */
  "16",	/* function_align.  */
  "16",	/* jump_align.  */
  "16",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  17,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
  &xgene1_prefetch_tune
};
static const struct tune_params qdf24xx_tunings =
{
  &qdf24xx_extra_costs,
  &qdf24xx_addrcost_table,
  &qdf24xx_regmove_cost,
  &qdf24xx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost */
  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
  "16",	/* function_align.  */
  "8",	/* jump_align.  */
  "16",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS,	/* tune_flags.  */
  &qdf24xx_prefetch_tune
};
/* Tuning structure for the Qualcomm Saphira core.  Default to falkor values
   for now.  */
static const struct tune_params saphira_tunings =
{
  &generic_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost */
  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
  "16",	/* function_align.  */
  "8",	/* jump_align.  */
  "16",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};
static const struct tune_params thunderx2t99_tunings =
{
  &thunderx2t99_extra_costs,
  &thunderx2t99_addrcost_table,
  &thunderx2t99_regmove_cost,
  &thunderx2t99_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost.  */
  4, /* issue_rate.  */
  (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
  "16",	/* function_align.  */
  "8",	/* jump_align.  */
  "16",	/* loop_align.  */
  3,	/* int_reassoc_width.  */
  2,	/* fp_reassoc_width.  */
  2,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &thunderx2t99_prefetch_tune
};

static const struct tune_params neoversen1_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width */
  4, /* memmov_cost */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
  "32:16",	/* function_align.  */
  "4",		/* jump_align.  */
  "32:16",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  2,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};
/* Support for fine-grained override of the tuning structures.  */
struct aarch64_tuning_override_function
{
  const char* name;
  void (*parse_override)(const char*, struct tune_params*);
};

static void aarch64_parse_fuse_string (const char*, struct tune_params*);
static void aarch64_parse_tune_string (const char*, struct tune_params*);
static void aarch64_parse_sve_width_string (const char*, struct tune_params*);

static const struct aarch64_tuning_override_function
aarch64_tuning_override_functions[] =
{
  { "fuse", aarch64_parse_fuse_string },
  { "tune", aarch64_parse_tune_string },
  { "sve_width", aarch64_parse_sve_width_string },
  { NULL, NULL }
};
/* A processor implementing AArch64.  */
struct processor
{
  const char *const name;
  enum aarch64_processor ident;
  enum aarch64_processor sched_core;
  enum aarch64_arch arch;
  unsigned architecture_version;
  const uint64_t flags;
  const struct tune_params *const tune;
};

/* Architectures implementing AArch64.  */
static const struct processor all_architectures[] =
{
#define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
  {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
#include "aarch64-arches.def"
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
};

/* Processor cores implementing AArch64.  */
static const struct processor all_cores[] =
{
#define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
  {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
   all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
   FLAGS, &COSTS##_tunings},
#include "aarch64-cores.def"
  {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
   AARCH64_FL_FOR_ARCH8, &generic_tunings},
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
};
/* Target specification.  These are populated by the -march, -mtune, -mcpu
   handling code or by target attributes.  */
static const struct processor *selected_arch;
static const struct processor *selected_cpu;
static const struct processor *selected_tune;

enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;

/* The current tuning set.  */
struct tune_params aarch64_tune_params = generic_tunings;
/* Check whether an 'aarch64_vector_pcs' attribute is valid.  */

static tree
handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
				     int, bool *no_add_attrs)
{
  /* Since we set fn_type_req to true, the caller should have checked
     this for us.  */
  gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
  switch ((arm_pcs) fntype_abi (*node).id ())
    {
    case ARM_PCS_AAPCS64:
    case ARM_PCS_SIMD:
      return NULL_TREE;

    case ARM_PCS_SVE:
      error ("the %qE attribute cannot be applied to an SVE function type",
	     name);
      *no_add_attrs = true;
      return NULL_TREE;

    case ARM_PCS_TLSDESC:
    case ARM_PCS_UNKNOWN:
      break;
    }
  gcc_unreachable ();
}
/* Table of machine attributes.  */
static const struct attribute_spec aarch64_attribute_table[] =
{
  /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
       affects_type_identity, handler, exclude } */
  { "aarch64_vector_pcs", 0, 0, false, true, true, true,
    handle_aarch64_vector_pcs_attribute, NULL },
  { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
  { NULL, 0, 0, false, false, false, false, NULL, NULL }
};
#define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)

/* An ISA extension in the co-processor and main instruction set space.  */
struct aarch64_option_extension
{
  const char *const name;
  const unsigned long flags_on;
  const unsigned long flags_off;
};

typedef enum aarch64_cond_code
{
  AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
  AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
  AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
}
aarch64_cc;

#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
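
/* Illustrative note, not part of the original file: the enumeration above is
   laid out so that a condition and its logical inverse differ only in the
   low bit, which is what makes the XOR with 1 work.  For example:
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) == AARCH64_NE
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT  */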
struct aarch64_branch_protect_type
{
  /* The type's name that the user passes to the branch-protection option
     string.  */
  const char* name;
  /* Function to handle the protection type and set global variables.
     First argument is the string token corresponding with this type and the
     second argument is the next token in the option string.
     Return values:
     * AARCH64_PARSE_OK: Handling was successful.
     * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the
       caller should print an error.
     * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler
       prints its own error.  */
  enum aarch64_parse_opt_result (*handler)(char*, char*);
  /* A list of types that can follow this type in the option string.  */
  const aarch64_branch_protect_type* subtypes;
  unsigned int num_subtypes;
};
static enum aarch64_parse_opt_result
aarch64_handle_no_branch_protection (char* str, char* rest)
{
  aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
  aarch64_enable_bti = 0;
  if (rest)
    {
      error ("unexpected %<%s%> after %<%s%>", rest, str);
      return AARCH64_PARSE_INVALID_FEATURE;
    }
  return AARCH64_PARSE_OK;
}

static enum aarch64_parse_opt_result
aarch64_handle_standard_branch_protection (char* str, char* rest)
{
  aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
  aarch64_ra_sign_key = AARCH64_KEY_A;
  aarch64_enable_bti = 1;
  if (rest)
    {
      error ("unexpected %<%s%> after %<%s%>", rest, str);
      return AARCH64_PARSE_INVALID_FEATURE;
    }
  return AARCH64_PARSE_OK;
}

static enum aarch64_parse_opt_result
aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
				   char* rest ATTRIBUTE_UNUSED)
{
  aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
  aarch64_ra_sign_key = AARCH64_KEY_A;
  return AARCH64_PARSE_OK;
}

static enum aarch64_parse_opt_result
aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
			     char* rest ATTRIBUTE_UNUSED)
{
  aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
  return AARCH64_PARSE_OK;
}

static enum aarch64_parse_opt_result
aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
			      char* rest ATTRIBUTE_UNUSED)
{
  aarch64_ra_sign_key = AARCH64_KEY_B;
  return AARCH64_PARSE_OK;
}

static enum aarch64_parse_opt_result
aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
			       char* rest ATTRIBUTE_UNUSED)
{
  aarch64_enable_bti = 1;
  return AARCH64_PARSE_OK;
}
static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
  { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
  { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
  { NULL, NULL, NULL, 0 }
};

static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
  { "none", aarch64_handle_no_branch_protection, NULL, 0 },
  { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
  { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
    ARRAY_SIZE (aarch64_pac_ret_subtypes) },
  { "bti", aarch64_handle_bti_protection, NULL, 0 },
  { NULL, NULL, NULL, 0 }
};
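
/* Illustrative note, not part of the original file: with the tables above,
   an option such as -mbranch-protection=pac-ret+leaf+b-key is parsed by
   running the "pac-ret" handler and then its "leaf" and "b-key" subtype
   handlers, leaving aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL and
   aarch64_ra_sign_key == AARCH64_KEY_B.  */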
/* The condition codes of the processor, and the inverse function.  */
static const char * const aarch64_condition_codes[] =
{
  "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
  "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
};

/* The preferred condition codes for SVE conditions.  */
static const char *const aarch64_sve_condition_codes[] =
{
  "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
  "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
};
/* Return the assembly token for svpattern value PATTERN.  */

static const char *
svpattern_token (enum aarch64_svpattern pattern)
{
  switch (pattern)
    {
#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
    AARCH64_FOR_SVPATTERN (CASE)
#undef CASE
    case AARCH64_NUM_SVPATTERNS:
      break;
    }
  gcc_unreachable ();
}
/* Return the descriptor of the SIMD ABI.  */

static const predefined_function_abi &
aarch64_simd_abi (void)
{
  predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
  if (!simd_abi.initialized_p ())
    {
      HARD_REG_SET full_reg_clobbers
	= default_function_abi.full_reg_clobbers ();
      for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
	if (FP_SIMD_SAVED_REGNUM_P (regno))
	  CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
      simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
    }
  return simd_abi;
}
/* Return the descriptor of the SVE PCS.  */

static const predefined_function_abi &
aarch64_sve_abi (void)
{
  predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
  if (!sve_abi.initialized_p ())
    {
      HARD_REG_SET full_reg_clobbers
	= default_function_abi.full_reg_clobbers ();
      for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
	CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
      for (int regno = P4_REGNUM; regno <= P11_REGNUM; ++regno)
	CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
      sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
    }
  return sve_abi;
}
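
/* Illustrative note, not part of the original file: the effect of the loops
   above is that the vector PCS removes V8-V23 from the set of registers that
   calls fully clobber, and the SVE PCS additionally removes the predicate
   registers P4-P11.  */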
/* Generate code to enable conditional branches in functions over 1 MiB.  */
const char *
aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
			const char * branch_format)
{
  rtx_code_label * tmp_label = gen_label_rtx ();
  char label_buf[256];
  char buffer[128];
  ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
			       CODE_LABEL_NUMBER (tmp_label));
  const char *label_ptr = targetm.strip_name_encoding (label_buf);
  rtx dest_label = operands[pos_label];
  operands[pos_label] = tmp_label;

  snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
  output_asm_insn (buffer, operands);

  snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
  operands[pos_label] = dest_label;
  output_asm_insn (buffer, operands);
  return "";
}
void
aarch64_err_no_fpadvsimd (machine_mode mode)
{
  if (TARGET_GENERAL_REGS_ONLY)
    if (FLOAT_MODE_P (mode))
      error ("%qs is incompatible with the use of floating-point types",
	     "-mgeneral-regs-only");
    else
      error ("%qs is incompatible with the use of vector types",
	     "-mgeneral-regs-only");
  else
    if (FLOAT_MODE_P (mode))
      error ("%qs feature modifier is incompatible with the use of"
	     " floating-point types", "+nofp");
    else
      error ("%qs feature modifier is incompatible with the use of"
	     " vector types", "+nofp");
}
/* Report when we try to do something that requires SVE when SVE is disabled.
   This is an error of last resort and isn't very high-quality.  It usually
   involves attempts to measure the vector length in some way.  */
static void
aarch64_report_sve_required (void)
{
  static bool reported_p = false;

  /* Avoid reporting a slew of messages for a single oversight.  */
  if (reported_p)
    return;

  error ("this operation requires the SVE ISA extension");
  inform (input_location, "you can enable SVE using the command-line"
	  " option %<-march%>, or by using the %<target%>"
	  " attribute or pragma");
  reported_p = true;
}
/* Return true if REGNO is P0-P15 or one of the special FFR-related
   registers.  */
static inline bool
pr_or_ffr_regnum_p (unsigned int regno)
{
  return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
}
/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
   The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
   GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
   higher cost.  POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
   and GENERAL_REGS is lower than the memory cost (in this case the best class
   is the lowest cost one).  Using POINTER_AND_FP_REGS irrespective of its
   cost results in bad allocations with many redundant int<->FP moves which
   are expensive on various cores.
   To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
   force a decision between FP_REGS and GENERAL_REGS.  We use the allocno
   class if it isn't POINTER_AND_FP_REGS.  Similarly, use the best class if
   it isn't POINTER_AND_FP_REGS.  Otherwise set the allocno class depending
   on the mode.
   The result of this is that it is no longer inefficient to have a higher
   memory move cost than the register move cost.  */

static reg_class_t
aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
					 reg_class_t best_class)
{
  machine_mode mode;

  if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
      || !reg_class_subset_p (FP_REGS, allocno_class))
    return allocno_class;

  if (!reg_class_subset_p (GENERAL_REGS, best_class)
      || !reg_class_subset_p (FP_REGS, best_class))
    return best_class;

  mode = PSEUDO_REGNO_MODE (regno);
  return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
}
static unsigned int
aarch64_min_divisions_for_recip_mul (machine_mode mode)
{
  if (GET_MODE_UNIT_SIZE (mode) == 4)
    return aarch64_tune_params.min_div_recip_mul_sf;
  return aarch64_tune_params.min_div_recip_mul_df;
}
/* Return the reassociation width of treeop OPC with mode MODE.  */
static int
aarch64_reassociation_width (unsigned opc, machine_mode mode)
{
  if (VECTOR_MODE_P (mode))
    return aarch64_tune_params.vec_reassoc_width;
  if (INTEGRAL_MODE_P (mode))
    return aarch64_tune_params.int_reassoc_width;
  /* Avoid reassociating floating point addition so we emit more FMAs.  */
  if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
    return aarch64_tune_params.fp_reassoc_width;
  return 1;
}
/* Provide a mapping from gcc register numbers to dwarf register numbers.  */
unsigned
aarch64_dbx_register_number (unsigned regno)
{
  if (GP_REGNUM_P (regno))
    return AARCH64_DWARF_R0 + regno - R0_REGNUM;
  else if (regno == SP_REGNUM)
    return AARCH64_DWARF_SP;
  else if (FP_REGNUM_P (regno))
    return AARCH64_DWARF_V0 + regno - V0_REGNUM;
  else if (PR_REGNUM_P (regno))
    return AARCH64_DWARF_P0 + regno - P0_REGNUM;
  else if (regno == VG_REGNUM)
    return AARCH64_DWARF_VG;

  /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
     equivalent DWARF register.  */
  return DWARF_FRAME_REGISTERS;
}
/* If X is a CONST_DOUBLE, return its bit representation as a constant
   integer, otherwise return X unmodified.  */
static rtx
aarch64_bit_representation (rtx x)
{
  if (CONST_DOUBLE_P (x))
    x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
  return x;
}
/* Return true if MODE is any of the Advanced SIMD structure modes.  */
static bool
aarch64_advsimd_struct_mode_p (machine_mode mode)
{
  return (TARGET_SIMD
	  && (mode == OImode || mode == CImode || mode == XImode));
}

/* Return true if MODE is an SVE predicate mode.  */
static bool
aarch64_sve_pred_mode_p (machine_mode mode)
{
  return (TARGET_SVE
	  && (mode == VNx16BImode
	      || mode == VNx8BImode
	      || mode == VNx4BImode
	      || mode == VNx2BImode));
}
/* Three mutually-exclusive flags describing a vector or predicate type.  */
const unsigned int VEC_ADVSIMD  = 1;
const unsigned int VEC_SVE_DATA = 2;
const unsigned int VEC_SVE_PRED = 4;
/* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
   a structure of 2, 3 or 4 vectors.  */
const unsigned int VEC_STRUCT   = 8;
/* Can be used in combination with VEC_SVE_DATA to indicate that the
   vector has fewer significant bytes than a full SVE vector.  */
const unsigned int VEC_PARTIAL  = 16;
/* Useful combinations of the above.  */
const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
/* Return a set of flags describing the vector properties of mode MODE.
   Ignore modes that are not supported by the current target.  */
static unsigned int
aarch64_classify_vector_mode (machine_mode mode)
{
  if (aarch64_advsimd_struct_mode_p (mode))
    return VEC_ADVSIMD | VEC_STRUCT;

  if (aarch64_sve_pred_mode_p (mode))
    return VEC_SVE_PRED;

  /* Make the decision based on the mode's enum value rather than its
     properties, so that we keep the correct classification regardless
     of -msve-vector-bits.  */
  switch (mode)
    {
    /* Partial SVE QI vectors.  */
    /* Partial SVE HI vectors.  */
    /* Partial SVE SI vector.  */
    /* Partial SVE HF vectors.  */
    /* Partial SVE SF vector.  */
      return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;

      return TARGET_SVE ? VEC_SVE_DATA : 0;

    /* x2 SVE vectors.  */
    /* x3 SVE vectors.  */
    /* x4 SVE vectors.  */
      return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;

    /* 64-bit Advanced SIMD vectors.  */
    /* ...E_V1DImode doesn't exist.  */
    /* 128-bit Advanced SIMD vectors.  */
      return TARGET_SIMD ? VEC_ADVSIMD : 0;

    default:
      return 0;
    }
}
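
/* Illustrative note, not part of the original file: for example, on an SVE
   target VNx4SImode classifies as VEC_SVE_DATA and VNx16BImode as
   VEC_SVE_PRED, while V16QImode classifies as VEC_ADVSIMD and OImode (a
   two-vector Advanced SIMD tuple) as VEC_ADVSIMD | VEC_STRUCT.  */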
/* Return true if MODE is any of the data vector modes, including
   structure modes.  */
static bool
aarch64_vector_data_mode_p (machine_mode mode)
{
  return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
}

/* Return true if MODE is any form of SVE mode, including predicates,
   vectors and structures.  */
bool
aarch64_sve_mode_p (machine_mode mode)
{
  return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
}

/* Return true if MODE is an SVE data vector mode; either a single vector
   or a structure of vectors.  */
static bool
aarch64_sve_data_mode_p (machine_mode mode)
{
  return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
}
/* Return the number of defined bytes in one constituent vector of
   SVE mode MODE, which has vector flags VEC_FLAGS.  */
static poly_int64
aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
{
  if (vec_flags & VEC_PARTIAL)
    /* A single partial vector.  */
    return GET_MODE_SIZE (mode);

  if (vec_flags & VEC_SVE_DATA)
    /* A single vector or a tuple.  */
    return BYTES_PER_SVE_VECTOR;

  /* A single predicate.  */
  gcc_assert (vec_flags & VEC_SVE_PRED);
  return BYTES_PER_SVE_PRED;
}
/* Implement target hook TARGET_ARRAY_MODE.  */
static opt_machine_mode
aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
{
  if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
      && IN_RANGE (nelems, 2, 4))
    return mode_for_vector (GET_MODE_INNER (mode),
			    GET_MODE_NUNITS (mode) * nelems);

  return opt_machine_mode ();
}
/* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
static bool
aarch64_array_mode_supported_p (machine_mode mode,
				unsigned HOST_WIDE_INT nelems)
{
  if (TARGET_SIMD
      && (AARCH64_VALID_SIMD_QREG_MODE (mode)
	  || AARCH64_VALID_SIMD_DREG_MODE (mode))
      && (nelems >= 2 && nelems <= 4))
    return true;

  return false;
}
/* MODE is some form of SVE vector mode.  For data modes, return the number
   of vector register bits that each element of MODE occupies, such as 64
   for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
   in a 64-bit container).  For predicate modes, return the number of
   data bits controlled by each significant predicate bit.  */

static unsigned int
aarch64_sve_container_bits (machine_mode mode)
{
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
			     ? BITS_PER_SVE_VECTOR
			     : GET_MODE_BITSIZE (mode));
  return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
}
/* Return the SVE predicate mode to use for elements that have
   ELEM_NBYTES bytes, if such a mode exists.  */

opt_machine_mode
aarch64_sve_pred_mode (unsigned int elem_nbytes)
{
  if (TARGET_SVE)
    {
      if (elem_nbytes == 1)
	return VNx16BImode;
      if (elem_nbytes == 2)
	return VNx8BImode;
      if (elem_nbytes == 4)
	return VNx4BImode;
      if (elem_nbytes == 8)
	return VNx2BImode;
    }
  return opt_machine_mode ();
}

/* Return the SVE predicate mode that should be used to control
   mode MODE.  */

machine_mode
aarch64_sve_pred_mode (machine_mode mode)
{
  unsigned int bits = aarch64_sve_container_bits (mode);
  return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
}
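
/* Illustrative note, not part of the original file: VNx2DImode has 64-bit
   containers, so aarch64_sve_pred_mode (VNx2DImode) is VNx2BImode.  The
   partial vector mode VNx2SImode also uses 64-bit containers, so it maps to
   VNx2BImode as well, even though its elements are only 32 bits wide.  */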
/* Implement TARGET_VECTORIZE_GET_MASK_MODE.  */

static opt_machine_mode
aarch64_get_mask_mode (machine_mode mode)
{
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if (vec_flags & VEC_SVE_DATA)
    return aarch64_sve_pred_mode (mode);

  return default_get_mask_mode (mode);
}
/* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE.  */

opt_machine_mode
aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
{
  enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
			    ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
  machine_mode mode;
  FOR_EACH_MODE_IN_CLASS (mode, mclass)
    if (inner_mode == GET_MODE_INNER (mode)
	&& known_eq (nunits, GET_MODE_NUNITS (mode))
	&& aarch64_sve_data_mode_p (mode))
      return mode;
  return opt_machine_mode ();
}
/* Return the integer element mode associated with SVE mode MODE.  */

static scalar_int_mode
aarch64_sve_element_int_mode (machine_mode mode)
{
  poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
			     ? BITS_PER_SVE_VECTOR
			     : GET_MODE_BITSIZE (mode));
  unsigned int elt_bits = vector_element_size (vector_bits,
					       GET_MODE_NUNITS (mode));
  return int_mode_for_size (elt_bits, 0).require ();
}

/* Return an integer element mode that contains exactly
   aarch64_sve_container_bits (MODE) bits.  This is wider than
   aarch64_sve_element_int_mode if MODE is a partial vector,
   otherwise it's the same.  */

static scalar_int_mode
aarch64_sve_container_int_mode (machine_mode mode)
{
  return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
}

/* Return the integer vector mode associated with SVE mode MODE.
   Unlike related_int_vector_mode, this can handle the case in which
   MODE is a predicate (and thus has a different total size).  */

machine_mode
aarch64_sve_int_mode (machine_mode mode)
{
  scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
  return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
}
/* Implement TARGET_VECTORIZE_RELATED_MODE.  */

static opt_machine_mode
aarch64_vectorize_related_mode (machine_mode vector_mode,
				scalar_mode element_mode,
				poly_uint64 nunits)
{
  unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);

  /* If we're operating on SVE vectors, try to return an SVE mode.  */
  poly_uint64 sve_nunits;
  if ((vec_flags & VEC_SVE_DATA)
      && multiple_p (BYTES_PER_SVE_VECTOR,
		     GET_MODE_SIZE (element_mode), &sve_nunits))
    {
      machine_mode sve_mode;
      if (maybe_ne (nunits, 0U))
	{
	  /* Try to find a full or partial SVE mode with exactly
	     NUNITS units.  */
	  if (multiple_p (sve_nunits, nunits)
	      && aarch64_sve_data_mode (element_mode,
					nunits).exists (&sve_mode))
	    return sve_mode;
	}
      else
	{
	  /* Take the preferred number of units from the number of bytes
	     that fit in VECTOR_MODE.  We always start by "autodetecting"
	     a full vector mode with preferred_simd_mode, so vectors
	     chosen here will also be full vector modes.  Then
	     autovectorize_vector_modes tries smaller starting modes
	     and thus smaller preferred numbers of units.  */
	  sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
	  if (aarch64_sve_data_mode (element_mode,
				     sve_nunits).exists (&sve_mode))
	    return sve_mode;
	}
    }

  /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors.  */
  if ((vec_flags & VEC_ADVSIMD)
      && known_eq (nunits, 0U)
      && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
      && maybe_ge (GET_MODE_BITSIZE (element_mode)
		   * GET_MODE_NUNITS (vector_mode), 128U))
    {
      machine_mode res = aarch64_simd_container_mode (element_mode, 128);
      if (VECTOR_MODE_P (res))
	return res;
    }

  return default_vectorize_related_mode (vector_mode, element_mode, nunits);
}
/* Implement TARGET_PREFERRED_ELSE_VALUE.  For binary operations,
   prefer to use the first arithmetic operand as the else value if
   the else value doesn't matter, since that exactly matches the SVE
   destructive merging form.  For ternary operations we could either
   pick the first operand and use FMAD-like instructions or the last
   operand and use FMLA-like instructions; the latter seems more
   natural.  */

static tree
aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
{
  return nops == 3 ? ops[2] : ops[0];
}
/* Implement TARGET_HARD_REGNO_NREGS.  */

static unsigned int
aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
{
  /* ??? Logically we should only need to provide a value when
     HARD_REGNO_MODE_OK says that the combination is valid,
     but at the moment we need to handle all modes.  Just ignore
     any runtime parts for registers that can't store them.  */
  HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
  switch (aarch64_regno_regclass (regno))
    {
    case FP_REGS:
    case FP_LO_REGS:
    case FP_LO8_REGS:
      {
	unsigned int vec_flags = aarch64_classify_vector_mode (mode);
	if (vec_flags & VEC_SVE_DATA)
	  return exact_div (GET_MODE_SIZE (mode),
			    aarch64_vl_bytes (mode, vec_flags)).to_constant ();
	return CEIL (lowest_size, UNITS_PER_VREG);
      }
    case PR_REGS:
    case PR_LO_REGS:
    case PR_HI_REGS:
    case FFR_REGS:
    case PR_AND_FFR_REGS:
      return 1;
    default:
      return CEIL (lowest_size, UNITS_PER_WORD);
    }
}
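
/* Illustrative note, not part of the original file: for an SVE tuple mode
   such as VNx32QImode (two vectors of bytes), GET_MODE_SIZE is twice
   BYTES_PER_SVE_VECTOR, so the exact_div above yields 2 FP registers
   regardless of the runtime vector length.  */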
/* Implement TARGET_HARD_REGNO_MODE_OK.  */

static bool
aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
{
  if (GET_MODE_CLASS (mode) == MODE_CC)
    return regno == CC_REGNUM;

  if (regno == VG_REGNUM)
    /* This must have the same size as _Unwind_Word.  */
    return mode == DImode;

  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if (vec_flags & VEC_SVE_PRED)
    return pr_or_ffr_regnum_p (regno);

  if (pr_or_ffr_regnum_p (regno))
    return false;

  if (regno == SP_REGNUM)
    /* The purpose of comparing with ptr_mode is to support the
       global register variable associated with the stack pointer
       register via the syntax of asm ("wsp") in ILP32.  */
    return mode == Pmode || mode == ptr_mode;

  if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
    return mode == Pmode;

  if (GP_REGNUM_P (regno))
    {
      if (vec_flags & VEC_ANY_SVE)
	return false;
      if (known_le (GET_MODE_SIZE (mode), 8))
	return true;
      if (known_le (GET_MODE_SIZE (mode), 16))
	return (regno & 1) == 0;
    }
  else if (FP_REGNUM_P (regno))
    {
      if (vec_flags & VEC_STRUCT)
	return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
      else
	return !VECTOR_MODE_P (mode) || vec_flags != 0;
    }

  return false;
}
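
/* Illustrative note, not part of the original file: the checks above allow
   TImode only in even-numbered general registers (so that it occupies an
   aligned X-register pair), reject SVE modes in general registers entirely,
   and allow any non-vector mode or supported vector mode in the FP
   registers.  */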
/* Return true if TYPE is a type that should be passed or returned in
   SVE registers, assuming enough registers are available.  When returning
   true, set *NUM_ZR and *NUM_PR to the number of required Z and P registers
   respectively.  */

/* Return true if a function with type FNTYPE returns its value in
   SVE vector or predicate registers.  */

static bool
aarch64_returns_value_in_sve_regs_p (const_tree fntype)
{
  tree return_type = TREE_TYPE (fntype);
  return (return_type != error_mark_node
	  && aarch64_sve::builtin_type_p (return_type));
}

/* Return true if a function with type FNTYPE takes arguments in
   SVE vector or predicate registers.  */

static bool
aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
{
  CUMULATIVE_ARGS args_so_far_v;
  aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
				NULL_TREE, 0, true);
  cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);

  for (tree chain = TYPE_ARG_TYPES (fntype);
       chain && chain != void_list_node;
       chain = TREE_CHAIN (chain))
    {
      tree arg_type = TREE_VALUE (chain);
      if (arg_type == error_mark_node)
	return false;

      function_arg_info arg (arg_type, /*named=*/true);
      apply_pass_by_reference_rules (&args_so_far_v, arg);
      if (aarch64_sve::builtin_type_p (arg.type))
	return true;

      targetm.calls.function_arg_advance (args_so_far, arg);
    }
  return false;
}
/* Implement TARGET_FNTYPE_ABI.  */

static const predefined_function_abi &
aarch64_fntype_abi (const_tree fntype)
{
  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
    return aarch64_simd_abi ();

  if (aarch64_returns_value_in_sve_regs_p (fntype)
      || aarch64_takes_arguments_in_sve_regs_p (fntype))
    return aarch64_sve_abi ();

  return default_function_abi;
}
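/* Illustrative note (not from the original sources): for example, a
   function declared as "svint32_t f (svint32_t)" selects aarch64_sve_abi (),
   a function carrying __attribute__ ((aarch64_vector_pcs)) selects
   aarch64_simd_abi (), and an ordinary scalar function gets
   default_function_abi.  */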
/* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P.  */

static bool
aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
{
  return (aarch64_sve::builtin_type_p (type1)
          == aarch64_sve::builtin_type_p (type2));
}

/* Return true if we should emit CFI for register REGNO.  */

static bool
aarch64_emit_cfi_for_reg_p (unsigned int regno)
{
  return (GP_REGNUM_P (regno)
          || !default_function_abi.clobbers_full_reg_p (regno));
}
/* Return the mode we should use to save and restore register REGNO.  */

static machine_mode
aarch64_reg_save_mode (unsigned int regno)
{
  if (GP_REGNUM_P (regno))
    return DImode;

  if (FP_REGNUM_P (regno))
    switch (crtl->abi->id ())
      {
      case ARM_PCS_AAPCS64:
        /* Only the low 64 bits are saved by the base PCS.  */
        return DFmode;

      case ARM_PCS_SIMD:
        /* The vector PCS saves the low 128 bits (which is the full
           register on non-SVE targets).  */
        return TFmode;

      case ARM_PCS_SVE:
        /* Use vectors of DImode for registers that need frame
           information, so that the first 64 bits of the save slot
           are always the equivalent of what storing D<n> would give.  */
        if (aarch64_emit_cfi_for_reg_p (regno))
          return VNx2DImode;

        /* Use vectors of bytes otherwise, so that the layout is
           endian-agnostic, and so that we can use LDR and STR for
           big-endian targets.  */
        return VNx16QImode;

      case ARM_PCS_TLSDESC:
      case ARM_PCS_UNKNOWN:
        break;
      }

  if (PR_REGNUM_P (regno))
    /* Save the full predicate register.  */
    return VNx16BImode;

  gcc_unreachable ();
}
/* Implement TARGET_INSN_CALLEE_ABI.  */

const predefined_function_abi &
aarch64_insn_callee_abi (const rtx_insn *insn)
{
  rtx pat = PATTERN (insn);
  gcc_assert (GET_CODE (pat) == PARALLEL);
  rtx unspec = XVECEXP (pat, 0, 1);
  gcc_assert (GET_CODE (unspec) == UNSPEC
              && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
  return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
}
/* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
   the lower 64 bits of a 128-bit register.  Tell the compiler the callee
   clobbers the top 64 bits when restoring the bottom 64 bits.  */

static bool
aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
                                        unsigned int regno,
                                        machine_mode mode)
{
  if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
    {
      poly_int64 per_register_size = GET_MODE_SIZE (mode);
      unsigned int nregs = hard_regno_nregs (regno, mode);
      if (nregs > 1)
        per_register_size = exact_div (per_register_size, nregs);
      if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
        return maybe_gt (per_register_size, 16);
      return maybe_gt (per_register_size, 8);
    }
  return false;
}
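/* Illustrative note (not from the original sources): under the base PCS
   (ARM_PCS_AAPCS64), a V4SImode value living in q8 is only partially
   preserved across a call, because the callee is only required to save
   d8, i.e. the low 64 bits.  The hook above therefore reports the
   register as part-clobbered for any per-register size wider than 8
   bytes (or 16 bytes for the vector PCS).  */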
/* Implement REGMODE_NATURAL_SIZE.  */

poly_uint64
aarch64_regmode_natural_size (machine_mode mode)
{
  /* The natural size for SVE data modes is one SVE data vector,
     and similarly for predicates.  We can't independently modify
     anything smaller than that.  */
  /* ??? For now, only do this for variable-width SVE registers.
     Doing it for constant-sized registers breaks lower-subreg.c.  */
  /* ??? And once that's fixed, we should probably have similar
     code for Advanced SIMD.  */
  if (!aarch64_sve_vg.is_constant ())
    {
      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
      if (vec_flags & VEC_SVE_PRED)
        return BYTES_PER_SVE_PRED;
      if (vec_flags & VEC_SVE_DATA)
        return BYTES_PER_SVE_VECTOR;
    }
  return UNITS_PER_WORD;
}
/* Implement HARD_REGNO_CALLER_SAVE_MODE.  */

machine_mode
aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
                                     machine_mode mode)
{
  /* The predicate mode determines which bits are significant and
     which are "don't care".  Decreasing the number of lanes would
     lose data while increasing the number of lanes would make bits
     unnecessarily significant.  */
  if (PR_REGNUM_P (regno))
    return mode;
  if (known_ge (GET_MODE_SIZE (mode), 4))
    return mode;
  else
    return SImode;
}

/* Return true if I's bits are consecutive ones from the MSB.  */

static bool
aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
{
  return exact_log2 (-i) != HOST_WIDE_INT_M1;
}
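/* Illustrative sketch (standalone, hypothetical helper, not part of the
   original sources): the test above works because a value whose set bits
   are exactly the top N bits is the negation of a power of two, e.g.
   0xffffffffffff0000 is -(1 << 16).  The helper below restates the check
   in plain C for a 64-bit value.  */
static int
illustrative_high_bits_all_ones_p (long long i)
{
  unsigned long long neg = -(unsigned long long) i;
  /* -I must be a nonzero power of two, i.e. have exactly one bit set.  */
  return i != 0 && (neg & (neg - 1)) == 0;
}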
/* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
   that strcpy from constants will be faster.  */

static HOST_WIDE_INT
aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
{
  if (TREE_CODE (exp) == STRING_CST && !optimize_size)
    return MAX (align, BITS_PER_WORD);
  return align;
}

/* Return true if calls to DECL should be treated as
   long-calls (ie called via a register).  */

static bool
aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
{
  return false;
}

/* Return true if calls to symbol-ref SYM should be treated as
   long-calls (ie called via a register).  */

bool
aarch64_is_long_call_p (rtx sym)
{
  return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
}
/* Return true if calls to symbol-ref SYM should not go through
   plt stubs.  */

bool
aarch64_is_noplt_call_p (rtx sym)
{
  const_tree decl = SYMBOL_REF_DECL (sym);

  if (flag_pic
      && decl
      && (!flag_plt
          || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
      && !targetm.binds_local_p (decl))
    return true;

  return false;
}
/* Return true if the offsets to a zero/sign-extract operation
   represent an expression that matches an extend operation.  The
   operands represent the parameters from

   (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */

bool
aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
                                rtx extract_imm)
{
  HOST_WIDE_INT mult_val, extract_val;

  if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
    return false;

  mult_val = INTVAL (mult_imm);
  extract_val = INTVAL (extract_imm);

  if (extract_val > 8
      && extract_val < GET_MODE_BITSIZE (mode)
      && exact_log2 (extract_val & ~7) > 0
      && (extract_val & 7) <= 4
      && mult_val == (1 << (extract_val & 7)))
    return true;

  return false;
}
/* Emit an insn that's a simple single-set.  Both the operands must be
   known to be valid.  */
inline static rtx_insn *
emit_set_insn (rtx x, rtx y)
{
  return emit_insn (gen_rtx_SET (x, y));
}
/* X and Y are two things to compare using CODE.  Emit the compare insn and
   return the rtx for register 0 in the proper mode.  */
rtx
aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
{
  machine_mode cmp_mode = GET_MODE (x);
  machine_mode cc_mode;
  rtx cc_reg;

  if (cmp_mode == TImode)
    {
      gcc_assert (code == NE);

      cc_mode = CC_NZmode;
      cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);

      rtx x_lo = operand_subword (x, 0, 0, TImode);
      rtx y_lo = operand_subword (y, 0, 0, TImode);
      emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));

      rtx x_hi = operand_subword (x, 1, 0, TImode);
      rtx y_hi = operand_subword (y, 1, 0, TImode);
      emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
                               gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
                               GEN_INT (AARCH64_EQ)));
    }
  else
    {
      cc_mode = SELECT_CC_MODE (code, x, y);
      cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
      emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
    }
  return cc_reg;
}
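/* Illustrative note (not from the original sources): for the TImode case
   above, the expansion is a CMP of the low 64-bit halves followed by a
   conditional compare of the high halves, roughly:

       cmp    x_lo, y_lo
       ccmp   x_hi, y_hi, #<nzcv>, eq

   where <nzcv> encodes "not equal" for the case in which the low halves
   already differ, so the final flags describe the full 128-bit NE test.  */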
/* Similarly, but maybe zero-extend Y if Y_MODE < SImode.  */
rtx
aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
                                  machine_mode y_mode)
{
  if (y_mode == E_QImode || y_mode == E_HImode)
    {
      if (CONST_INT_P (y))
        y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
      else
        {
          rtx t, cc_reg;
          machine_mode cc_mode;

          t = gen_rtx_ZERO_EXTEND (SImode, y);
          t = gen_rtx_COMPARE (CC_SWPmode, t, x);
          cc_mode = CC_SWPmode;
          cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
          emit_set_insn (cc_reg, t);
          return cc_reg;
        }
    }

  if (!aarch64_plus_operand (y, y_mode))
    y = force_reg (y_mode, y);

  return aarch64_gen_compare_reg (code, x, y);
}
/* Build the SYMBOL_REF for __tls_get_addr.  */

static GTY(()) rtx tls_get_addr_libfunc;

rtx
aarch64_tls_get_addr (void)
{
  if (!tls_get_addr_libfunc)
    tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
  return tls_get_addr_libfunc;
}

/* Return the TLS model to use for ADDR.  */

static enum tls_model
tls_symbolic_operand_type (rtx addr)
{
  enum tls_model tls_kind = TLS_MODEL_NONE;
  if (GET_CODE (addr) == CONST)
    {
      poly_int64 addend;
      rtx sym = strip_offset (addr, &addend);
      if (GET_CODE (sym) == SYMBOL_REF)
        tls_kind = SYMBOL_REF_TLS_MODEL (sym);
    }
  else if (GET_CODE (addr) == SYMBOL_REF)
    tls_kind = SYMBOL_REF_TLS_MODEL (addr);

  return tls_kind;
}
2422 /* We'll allow lo_sum's in addresses in our legitimate addresses
2423 so that combine would take care of combining addresses where
2424 necessary, but for generation purposes, we'll generate the address
2427 tmp = hi (symbol_ref); adrp x1, foo
2428 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2432 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2433 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2437 Load TLS symbol, depending on TLS mechanism and TLS access model.
2439 Global Dynamic - Traditional TLS:
2440 adrp tmp, :tlsgd:imm
2441 add dest, tmp, #:tlsgd_lo12:imm
2444 Global Dynamic - TLS Descriptors:
2445 adrp dest, :tlsdesc:imm
2446 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2447 add dest, dest, #:tlsdesc_lo12:imm
2454 adrp tmp, :gottprel:imm
2455 ldr dest, [tmp, #:gottprel_lo12:imm]
2460 add t0, tp, #:tprel_hi12:imm, lsl #12
2461 add t0, t0, #:tprel_lo12_nc:imm
2465 aarch64_load_symref_appropriately (rtx dest
, rtx imm
,
2466 enum aarch64_symbol_type type
)
2470 case SYMBOL_SMALL_ABSOLUTE
:
2472 /* In ILP32, the mode of dest can be either SImode or DImode. */
2474 machine_mode mode
= GET_MODE (dest
);
2476 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
2478 if (can_create_pseudo_p ())
2479 tmp_reg
= gen_reg_rtx (mode
);
2481 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
2482 emit_insn (gen_add_losym (dest
, tmp_reg
, imm
));
2486 case SYMBOL_TINY_ABSOLUTE
:
2487 emit_insn (gen_rtx_SET (dest
, imm
));
2490 case SYMBOL_SMALL_GOT_28K
:
2492 machine_mode mode
= GET_MODE (dest
);
2493 rtx gp_rtx
= pic_offset_table_rtx
;
2497 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2498 here before rtl expand. Tree IVOPT will generate rtl pattern to
2499 decide rtx costs, in which case pic_offset_table_rtx is not
2500 initialized. For that case no need to generate the first adrp
2501 instruction as the final cost for global variable access is
2505 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
2506 using the page base as GOT base, the first page may be wasted,
2507 in the worst scenario, there is only 28K space for GOT).
2509 The generate instruction sequence for accessing global variable
2512 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2514 Only one instruction needed. But we must initialize
2515 pic_offset_table_rtx properly. We generate initialize insn for
2516 every global access, and allow CSE to remove all redundant.
2518 The final instruction sequences will look like the following
2519 for multiply global variables access.
2521 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2523 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2524 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2525 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2528 rtx s
= gen_rtx_SYMBOL_REF (Pmode
, "_GLOBAL_OFFSET_TABLE_");
2529 crtl
->uses_pic_offset_table
= 1;
2530 emit_move_insn (gp_rtx
, gen_rtx_HIGH (Pmode
, s
));
2532 if (mode
!= GET_MODE (gp_rtx
))
2533 gp_rtx
= gen_lowpart (mode
, gp_rtx
);
2537 if (mode
== ptr_mode
)
2540 insn
= gen_ldr_got_small_28k_di (dest
, gp_rtx
, imm
);
2542 insn
= gen_ldr_got_small_28k_si (dest
, gp_rtx
, imm
);
2544 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
2548 gcc_assert (mode
== Pmode
);
2550 insn
= gen_ldr_got_small_28k_sidi (dest
, gp_rtx
, imm
);
2551 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
2554 /* The operand is expected to be MEM. Whenever the related insn
2555 pattern changed, above code which calculate mem should be
2557 gcc_assert (GET_CODE (mem
) == MEM
);
2558 MEM_READONLY_P (mem
) = 1;
2559 MEM_NOTRAP_P (mem
) = 1;
2564 case SYMBOL_SMALL_GOT_4G
:
2566 /* In ILP32, the mode of dest can be either SImode or DImode,
2567 while the got entry is always of SImode size. The mode of
2568 dest depends on how dest is used: if dest is assigned to a
2569 pointer (e.g. in the memory), it has SImode; it may have
2570 DImode if dest is dereferenced to access the memeory.
2571 This is why we have to handle three different ldr_got_small
2572 patterns here (two patterns for ILP32). */
2577 machine_mode mode
= GET_MODE (dest
);
2579 if (can_create_pseudo_p ())
2580 tmp_reg
= gen_reg_rtx (mode
);
2582 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
2583 if (mode
== ptr_mode
)
2586 insn
= gen_ldr_got_small_di (dest
, tmp_reg
, imm
);
2588 insn
= gen_ldr_got_small_si (dest
, tmp_reg
, imm
);
2590 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
2594 gcc_assert (mode
== Pmode
);
2596 insn
= gen_ldr_got_small_sidi (dest
, tmp_reg
, imm
);
2597 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
2600 gcc_assert (GET_CODE (mem
) == MEM
);
2601 MEM_READONLY_P (mem
) = 1;
2602 MEM_NOTRAP_P (mem
) = 1;
2607 case SYMBOL_SMALL_TLSGD
:
2610 machine_mode mode
= GET_MODE (dest
);
2611 rtx result
= gen_rtx_REG (mode
, R0_REGNUM
);
2615 aarch64_emit_call_insn (gen_tlsgd_small_si (result
, imm
));
2617 aarch64_emit_call_insn (gen_tlsgd_small_di (result
, imm
));
2618 insns
= get_insns ();
2621 RTL_CONST_CALL_P (insns
) = 1;
2622 emit_libcall_block (insns
, dest
, result
, imm
);
2626 case SYMBOL_SMALL_TLSDESC
:
2628 machine_mode mode
= GET_MODE (dest
);
2629 rtx x0
= gen_rtx_REG (mode
, R0_REGNUM
);
2632 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
2634 /* In ILP32, the got entry is always of SImode size. Unlike
2635 small GOT, the dest is fixed at reg 0. */
2637 emit_insn (gen_tlsdesc_small_si (imm
));
2639 emit_insn (gen_tlsdesc_small_di (imm
));
2640 tp
= aarch64_load_tp (NULL
);
2643 tp
= gen_lowpart (mode
, tp
);
2645 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, x0
)));
2647 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2651 case SYMBOL_SMALL_TLSIE
:
2653 /* In ILP32, the mode of dest can be either SImode or DImode,
2654 while the got entry is always of SImode size. The mode of
2655 dest depends on how dest is used: if dest is assigned to a
2656 pointer (e.g. in the memory), it has SImode; it may have
2657 DImode if dest is dereferenced to access the memeory.
2658 This is why we have to handle three different tlsie_small
2659 patterns here (two patterns for ILP32). */
2660 machine_mode mode
= GET_MODE (dest
);
2661 rtx tmp_reg
= gen_reg_rtx (mode
);
2662 rtx tp
= aarch64_load_tp (NULL
);
2664 if (mode
== ptr_mode
)
2667 emit_insn (gen_tlsie_small_di (tmp_reg
, imm
));
2670 emit_insn (gen_tlsie_small_si (tmp_reg
, imm
));
2671 tp
= gen_lowpart (mode
, tp
);
2676 gcc_assert (mode
== Pmode
);
2677 emit_insn (gen_tlsie_small_sidi (tmp_reg
, imm
));
2680 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, tmp_reg
)));
2682 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2686 case SYMBOL_TLSLE12
:
2687 case SYMBOL_TLSLE24
:
2688 case SYMBOL_TLSLE32
:
2689 case SYMBOL_TLSLE48
:
2691 machine_mode mode
= GET_MODE (dest
);
2692 rtx tp
= aarch64_load_tp (NULL
);
2695 tp
= gen_lowpart (mode
, tp
);
2699 case SYMBOL_TLSLE12
:
2700 emit_insn ((mode
== DImode
? gen_tlsle12_di
: gen_tlsle12_si
)
2703 case SYMBOL_TLSLE24
:
2704 emit_insn ((mode
== DImode
? gen_tlsle24_di
: gen_tlsle24_si
)
2707 case SYMBOL_TLSLE32
:
2708 emit_insn ((mode
== DImode
? gen_tlsle32_di
: gen_tlsle32_si
)
2710 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
2713 case SYMBOL_TLSLE48
:
2714 emit_insn ((mode
== DImode
? gen_tlsle48_di
: gen_tlsle48_si
)
2716 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
2724 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2728 case SYMBOL_TINY_GOT
:
2729 emit_insn (gen_ldr_got_tiny (dest
, imm
));
2732 case SYMBOL_TINY_TLSIE
:
2734 machine_mode mode
= GET_MODE (dest
);
2735 rtx tp
= aarch64_load_tp (NULL
);
2737 if (mode
== ptr_mode
)
2740 emit_insn (gen_tlsie_tiny_di (dest
, imm
, tp
));
2743 tp
= gen_lowpart (mode
, tp
);
2744 emit_insn (gen_tlsie_tiny_si (dest
, imm
, tp
));
2749 gcc_assert (mode
== Pmode
);
2750 emit_insn (gen_tlsie_tiny_sidi (dest
, imm
, tp
));
2754 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
/* Emit a move from SRC to DEST.  Assume that the move expanders can
   handle all moves if !can_create_pseudo_p ().  The distinction is
   important because, unlike emit_move_insn, the move expanders know
   how to force Pmode objects into the constant pool even when the
   constant pool address is not itself legitimate.  */
static rtx_insn *
aarch64_emit_move (rtx dest, rtx src)
{
  return (can_create_pseudo_p ()
          ? emit_move_insn (dest, src)
          : emit_move_insn_1 (dest, src));
}

/* Apply UNOPTAB to OP and store the result in DEST.  */

static void
aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
{
  rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
  if (dest != tmp)
    emit_move_insn (dest, tmp);
}

/* Apply BINOPTAB to OP0 and OP1 and store the result in DEST.  */

static void
aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
{
  rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
                          OPTAB_DIRECT);
  if (dest != tmp)
    emit_move_insn (dest, tmp);
}
/* Split a 128-bit move operation into two 64-bit move operations,
   taking care to handle partial overlap of register to register
   copies.  Special cases are needed when moving between GP regs and
   FP regs.  SRC can be a register, constant or memory; DST a register
   or memory.  If either operand is memory it must not have any side
   effects.  */
void
aarch64_split_128bit_move (rtx dst, rtx src)
{
  rtx dst_lo, dst_hi;
  rtx src_lo, src_hi;

  machine_mode mode = GET_MODE (dst);

  gcc_assert (mode == TImode || mode == TFmode);
  gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
  gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);

  if (REG_P (dst) && REG_P (src))
    {
      int src_regno = REGNO (src);
      int dst_regno = REGNO (dst);

      /* Handle FP <-> GP regs.  */
      if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
        {
          src_lo = gen_lowpart (word_mode, src);
          src_hi = gen_highpart (word_mode, src);

          emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
          emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
          return;
        }
      else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
        {
          dst_lo = gen_lowpart (word_mode, dst);
          dst_hi = gen_highpart (word_mode, dst);

          emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
          emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
          return;
        }
    }

  dst_lo = gen_lowpart (word_mode, dst);
  dst_hi = gen_highpart (word_mode, dst);
  src_lo = gen_lowpart (word_mode, src);
  src_hi = gen_highpart_mode (word_mode, mode, src);

  /* At most one pairing may overlap.  */
  if (reg_overlap_mentioned_p (dst_lo, src_hi))
    {
      aarch64_emit_move (dst_hi, src_hi);
      aarch64_emit_move (dst_lo, src_lo);
    }
  else
    {
      aarch64_emit_move (dst_lo, src_lo);
      aarch64_emit_move (dst_hi, src_hi);
    }
}

bool
aarch64_split_128bit_move_p (rtx dst, rtx src)
{
  return (! REG_P (src)
          || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
}
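/* Illustrative note (not from the original sources): the overlap check in
   aarch64_split_128bit_move matters when the low half of the destination
   is the same register as the high half of the source.  Copying the low
   halves first would overwrite that register before it is read, so in
   that case the high halves are copied first; otherwise the usual
   low-then-high order is used.  */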
/* Split a complex SIMD combine.  */

void
aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
{
  machine_mode src_mode = GET_MODE (src1);
  machine_mode dst_mode = GET_MODE (dst);

  gcc_assert (VECTOR_MODE_P (dst_mode));
  gcc_assert (register_operand (dst, dst_mode)
              && register_operand (src1, src_mode)
              && register_operand (src2, src_mode));

  emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
}

/* Split a complex SIMD move.  */

void
aarch64_split_simd_move (rtx dst, rtx src)
{
  machine_mode src_mode = GET_MODE (src);
  machine_mode dst_mode = GET_MODE (dst);

  gcc_assert (VECTOR_MODE_P (dst_mode));

  if (REG_P (dst) && REG_P (src))
    {
      gcc_assert (VECTOR_MODE_P (src_mode));
      emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
    }
}

bool
aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
                              machine_mode ymode, rtx y)
{
  rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
  gcc_assert (r != NULL);
  return rtx_equal_p (x, r);
}
/* Return TARGET if it is nonnull and a register of mode MODE.
   Otherwise, return a fresh register of mode MODE if we can,
   or TARGET reinterpreted as MODE if we can't.  */

static rtx
aarch64_target_reg (rtx target, machine_mode mode)
{
  if (target && REG_P (target) && GET_MODE (target) == mode)
    return target;
  if (!can_create_pseudo_p ())
    {
      gcc_assert (target);
      return gen_lowpart (mode, target);
    }
  return gen_reg_rtx (mode);
}

/* Return a register that contains the constant in BUILDER, given that
   the constant is a legitimate move operand.  Use TARGET as the register
   if it is nonnull and convenient.  */

static rtx
aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
{
  rtx src = builder.build ();
  target = aarch64_target_reg (target, GET_MODE (src));
  emit_insn (gen_rtx_SET (target, src));
  return target;
}

static rtx
aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
{
  if (can_create_pseudo_p ())
    return force_reg (mode, value);
  else
    {
      gcc_assert (x);
      aarch64_emit_move (x, value);
      return x;
    }
}
/* Return true if predicate value X is a constant in which every element
   is a CONST_INT.  When returning true, describe X in BUILDER as a VNx16BI
   value, i.e. as a predicate in which all bits are significant.  */

static bool
aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
{
  if (GET_CODE (x) != CONST_VECTOR)
    return false;

  unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
                                             GET_MODE_NUNITS (GET_MODE (x)));
  unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
  unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
  builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);

  unsigned int nelts = const_vector_encoded_nelts (x);
  for (unsigned int i = 0; i < nelts; ++i)
    {
      rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
      if (!CONST_INT_P (elt))
        return false;

      builder.quick_push (elt);
      for (unsigned int j = 1; j < factor; ++j)
        builder.quick_push (const0_rtx);
    }
  builder.finalize ();
  return true;
}
/* BUILDER contains a predicate constant of mode VNx16BI.  Return the
   widest predicate element size it can have (that is, the largest size
   for which each element would still be 0 or 1).  */

unsigned int
aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
{
  /* Start with the most optimistic assumption: that we only need
     one bit per pattern.  This is what we will use if only the first
     bit in each pattern is ever set.  */
  unsigned int mask = GET_MODE_SIZE (DImode);
  mask |= builder.npatterns ();

  /* Look for set bits.  */
  unsigned int nelts = builder.encoded_nelts ();
  for (unsigned int i = 1; i < nelts; ++i)
    if (INTVAL (builder.elt (i)) != 0)
      {
        if (i & 1)
          return 1;
        mask |= i;
      }
  return mask & -mask;
}
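/* Illustrative sketch (standalone, hypothetical helper, not part of the
   original sources): "mask & -mask" isolates the lowest set bit of MASK,
   i.e. the largest power of two that divides every value OR'd into MASK.
   Since MASK accumulates the pattern count, every index at which a set
   bit was seen, and GET_MODE_SIZE (DImode), the result is the widest
   element size in bytes (capped at 8) whose element boundaries cover all
   the set bits.  */
static unsigned int
illustrative_lowest_set_bit (unsigned int mask)
{
  /* For example, 12 (binary 1100) yields 4.  */
  return mask & -mask;
}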
/* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
   return that predicate mode, otherwise return opt_machine_mode ().  */

opt_machine_mode
aarch64_ptrue_all_mode (rtx x)
{
  gcc_assert (GET_MODE (x) == VNx16BImode);
  if (GET_CODE (x) != CONST_VECTOR
      || !CONST_VECTOR_DUPLICATE_P (x)
      || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
      || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
    return opt_machine_mode ();

  unsigned int nelts = const_vector_encoded_nelts (x);
  for (unsigned int i = 1; i < nelts; ++i)
    if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
      return opt_machine_mode ();

  return aarch64_sve_pred_mode (nelts);
}
3029 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
3030 that the constant would have with predicate element size ELT_SIZE
3031 (ignoring the upper bits in each element) and return:
3033 * -1 if all bits are set
3034 * N if the predicate has N leading set bits followed by all clear bits
3035 * 0 if the predicate does not have any of these forms. */
3038 aarch64_partial_ptrue_length (rtx_vector_builder
&builder
,
3039 unsigned int elt_size
)
3041 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
3042 followed by set bits. */
3043 if (builder
.nelts_per_pattern () == 3)
3046 /* Skip over leading set bits. */
3047 unsigned int nelts
= builder
.encoded_nelts ();
3049 for (; i
< nelts
; i
+= elt_size
)
3050 if (INTVAL (builder
.elt (i
)) == 0)
3052 unsigned int vl
= i
/ elt_size
;
3054 /* Check for the all-true case. */
3058 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
3059 repeating pattern of set bits followed by clear bits. */
3060 if (builder
.nelts_per_pattern () != 2)
3063 /* We have a "foreground" value and a duplicated "background" value.
3064 If the background might repeat and the last set bit belongs to it,
3065 we might have set bits followed by clear bits followed by set bits. */
3066 if (i
> builder
.npatterns () && maybe_ne (nelts
, builder
.full_nelts ()))
3069 /* Make sure that the rest are all clear. */
3070 for (; i
< nelts
; i
+= elt_size
)
3071 if (INTVAL (builder
.elt (i
)) != 0)
/* See if there is an svpattern that encodes an SVE predicate of mode
   PRED_MODE in which the first VL bits are set and the rest are clear.
   Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
   A VL of -1 indicates an all-true vector.  */

static aarch64_svpattern
aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
{
  if (vl < 0)
    return AARCH64_SV_ALL;

  if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
    return AARCH64_NUM_SVPATTERNS;

  if (vl >= 1 && vl <= 8)
    return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));

  if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
    return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));

  int max_vl;
  if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
    {
      if (vl == (max_vl / 3) * 3)
        return AARCH64_SV_MUL3;
      /* These would only trigger for non-power-of-2 lengths.  */
      if (vl == (max_vl & -4))
        return AARCH64_SV_MUL4;
      if (vl == (1 << floor_log2 (max_vl)))
        return AARCH64_SV_POW2;
      if (vl == max_vl)
        return AARCH64_SV_ALL;
    }
  return AARCH64_NUM_SVPATTERNS;
}
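/* Illustrative note (not from the original sources): examples of the
   mapping above, assuming the requested length fits in PRED_MODE:
     vl == 3  selects AARCH64_SV_VL3;
     vl == 32 selects AARCH64_SV_VL32 (a power of two in [16, 256]);
     vl == 12 has no VLnn encoding and only matches if it happens to equal
     the MUL3, MUL4, POW2 or ALL count for a constant number of lanes.  */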
/* Return a VNx16BImode constant in which every sequence of ELT_SIZE
   bits has the lowest bit set and the upper bits clear.  This is the
   VNx16BImode equivalent of a PTRUE for controlling elements of
   ELT_SIZE bytes.  However, because the constant is VNx16BImode,
   all bits are significant, even the upper zeros.  */

rtx
aarch64_ptrue_all (unsigned int elt_size)
{
  rtx_vector_builder builder (VNx16BImode, elt_size, 1);
  builder.quick_push (const1_rtx);
  for (unsigned int i = 1; i < elt_size; ++i)
    builder.quick_push (const0_rtx);
  return builder.build ();
}
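/* Illustrative note (not from the original sources): for ELT_SIZE == 4 the
   constant built above is the repeating VNx16BImode bit pattern
   1,0,0,0,1,0,0,0,..., i.e. one significant (set) bit per 32-bit element
   with the three padding bits of each element explicitly clear.  */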
/* Return an all-true predicate register of mode MODE.  */

rtx
aarch64_ptrue_reg (machine_mode mode)
{
  gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
  rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
  return gen_lowpart (mode, reg);
}

/* Return an all-false predicate register of mode MODE.  */

rtx
aarch64_pfalse_reg (machine_mode mode)
{
  gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
  rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
  return gen_lowpart (mode, reg);
}
3149 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
3150 true, or alternatively if we know that the operation predicated by
3151 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is a
3152 aarch64_sve_gp_strictness operand that describes the operation
3153 predicated by PRED1[0]. */
3156 aarch64_sve_pred_dominates_p (rtx
*pred1
, rtx pred2
)
3158 machine_mode mode
= GET_MODE (pred2
);
3159 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
3160 && mode
== GET_MODE (pred1
[0])
3161 && aarch64_sve_gp_strictness (pred1
[1], SImode
));
3162 return (pred1
[0] == CONSTM1_RTX (mode
)
3163 || INTVAL (pred1
[1]) == SVE_RELAXED_GP
3164 || rtx_equal_p (pred1
[0], pred2
));
3167 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
3168 for it. PRED2[0] is the predicate for the instruction whose result
3169 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
3170 for it. Return true if we can prove that the two predicates are
3171 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
3172 with PRED1[0] without changing behavior. */
3175 aarch64_sve_same_pred_for_ptest_p (rtx
*pred1
, rtx
*pred2
)
3177 machine_mode mode
= GET_MODE (pred1
[0]);
3178 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
3179 && mode
== GET_MODE (pred2
[0])
3180 && aarch64_sve_ptrue_flag (pred1
[1], SImode
)
3181 && aarch64_sve_ptrue_flag (pred2
[1], SImode
));
3183 bool ptrue1_p
= (pred1
[0] == CONSTM1_RTX (mode
)
3184 || INTVAL (pred1
[1]) == SVE_KNOWN_PTRUE
);
3185 bool ptrue2_p
= (pred2
[0] == CONSTM1_RTX (mode
)
3186 || INTVAL (pred2
[1]) == SVE_KNOWN_PTRUE
);
3187 return (ptrue1_p
&& ptrue2_p
) || rtx_equal_p (pred1
[0], pred2
[0]);
3190 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
3191 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
3192 Use TARGET as the target register if nonnull and convenient. */
3195 aarch64_sve_emit_int_cmp (rtx target
, machine_mode pred_mode
, rtx_code cmp
,
3196 machine_mode data_mode
, rtx op1
, rtx op2
)
3198 insn_code icode
= code_for_aarch64_pred_cmp (cmp
, data_mode
);
3199 expand_operand ops
[5];
3200 create_output_operand (&ops
[0], target
, pred_mode
);
3201 create_input_operand (&ops
[1], CONSTM1_RTX (pred_mode
), pred_mode
);
3202 create_integer_operand (&ops
[2], SVE_KNOWN_PTRUE
);
3203 create_input_operand (&ops
[3], op1
, data_mode
);
3204 create_input_operand (&ops
[4], op2
, data_mode
);
3205 expand_insn (icode
, 5, ops
);
3206 return ops
[0].value
;
3209 /* Use a comparison to convert integer vector SRC into MODE, which is
3210 the corresponding SVE predicate mode. Use TARGET for the result
3211 if it's nonnull and convenient. */
3214 aarch64_convert_sve_data_to_pred (rtx target
, machine_mode mode
, rtx src
)
3216 machine_mode src_mode
= GET_MODE (src
);
3217 return aarch64_sve_emit_int_cmp (target
, mode
, NE
, src_mode
,
3218 src
, CONST0_RTX (src_mode
));
3221 /* Return the assembly token for svprfop value PRFOP. */
3224 svprfop_token (enum aarch64_svprfop prfop
)
3228 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
3229 AARCH64_FOR_SVPRFOP (CASE
)
3231 case AARCH64_NUM_SVPRFOPS
:
3237 /* Return the assembly string for an SVE prefetch operation with
3238 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
3239 and that SUFFIX is the format for the remaining operands. */
3242 aarch64_output_sve_prefetch (const char *mnemonic
, rtx prfop_rtx
,
3245 static char buffer
[128];
3246 aarch64_svprfop prfop
= (aarch64_svprfop
) INTVAL (prfop_rtx
);
3247 unsigned int written
= snprintf (buffer
, sizeof (buffer
), "%s\t%s, %s",
3248 mnemonic
, svprfop_token (prfop
), suffix
);
3249 gcc_assert (written
< sizeof (buffer
));
3253 /* Check whether we can calculate the number of elements in PATTERN
3254 at compile time, given that there are NELTS_PER_VQ elements per
3255 128-bit block. Return the value if so, otherwise return -1. */
3258 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern
, unsigned int nelts_per_vq
)
3260 unsigned int vl
, const_vg
;
3261 if (pattern
>= AARCH64_SV_VL1
&& pattern
<= AARCH64_SV_VL8
)
3262 vl
= 1 + (pattern
- AARCH64_SV_VL1
);
3263 else if (pattern
>= AARCH64_SV_VL16
&& pattern
<= AARCH64_SV_VL256
)
3264 vl
= 16 << (pattern
- AARCH64_SV_VL16
);
3265 else if (aarch64_sve_vg
.is_constant (&const_vg
))
3267 /* There are two vector granules per quadword. */
3268 unsigned int nelts
= (const_vg
/ 2) * nelts_per_vq
;
3271 case AARCH64_SV_POW2
: return 1 << floor_log2 (nelts
);
3272 case AARCH64_SV_MUL4
: return nelts
& -4;
3273 case AARCH64_SV_MUL3
: return (nelts
/ 3) * 3;
3274 case AARCH64_SV_ALL
: return nelts
;
3275 default: gcc_unreachable ();
3281 /* There are two vector granules per quadword. */
3282 poly_uint64 nelts_all
= exact_div (aarch64_sve_vg
, 2) * nelts_per_vq
;
3283 if (known_le (vl
, nelts_all
))
3286 /* Requesting more elements than are available results in a PFALSE. */
3287 if (known_gt (vl
, nelts_all
))
/* Return true if we can move VALUE into a register using a single
   CNT[BHWD] instruction.  */

bool
aarch64_sve_cnt_immediate_p (poly_int64 value)
{
  HOST_WIDE_INT factor = value.coeffs[0];
  /* The coefficient must be [1, 16] * {2, 4, 8, 16}.  */
  return (value.coeffs[1] == factor
          && IN_RANGE (factor, 2, 16 * 16)
          && (factor & 1) == 0
          && factor <= 16 * (factor & -factor));
}
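/* Illustrative sketch (standalone, hypothetical helper, not part of the
   original sources): FACTOR above is the number of elements contributed
   per 128-bit granule.  A single CNTB/CNTH/CNTW/CNTD counts 16, 8, 4 or 2
   elements per granule and accepts a MUL multiplier of 1..16, so the test
   should be equivalent to the loop below.  */
static int
illustrative_single_sve_cnt_p (long long factor)
{
  for (long long elts_per_vq = 2; elts_per_vq <= 16; elts_per_vq *= 2)
    if (factor % elts_per_vq == 0
        && factor / elts_per_vq >= 1
        && factor / elts_per_vq <= 16)
      return 1;
  return 0;
}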
/* Likewise for rtx X.  */

bool
aarch64_sve_cnt_immediate_p (rtx x)
{
  poly_int64 value;
  return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
}
3316 /* Return the asm string for an instruction with a CNT-like vector size
3317 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3318 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3319 first part of the operands template (the part that comes before the
3320 vector size itself). PATTERN is the pattern to use. FACTOR is the
3321 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
3322 in each quadword. If it is zero, we can use any element size. */
3325 aarch64_output_sve_cnt_immediate (const char *prefix
, const char *operands
,
3326 aarch64_svpattern pattern
,
3327 unsigned int factor
,
3328 unsigned int nelts_per_vq
)
3330 static char buffer
[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
3332 if (nelts_per_vq
== 0)
3333 /* There is some overlap in the ranges of the four CNT instructions.
3334 Here we always use the smallest possible element size, so that the
3335 multiplier is 1 whereever possible. */
3336 nelts_per_vq
= factor
& -factor
;
3337 int shift
= std::min (exact_log2 (nelts_per_vq
), 4);
3338 gcc_assert (IN_RANGE (shift
, 1, 4));
3339 char suffix
= "dwhb"[shift
- 1];
3342 unsigned int written
;
3343 if (pattern
== AARCH64_SV_ALL
&& factor
== 1)
3344 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s",
3345 prefix
, suffix
, operands
);
3346 else if (factor
== 1)
3347 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s, %s",
3348 prefix
, suffix
, operands
, svpattern_token (pattern
));
3350 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s, %s, mul #%d",
3351 prefix
, suffix
, operands
, svpattern_token (pattern
),
3353 gcc_assert (written
< sizeof (buffer
));
3357 /* Return the asm string for an instruction with a CNT-like vector size
3358 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3359 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3360 first part of the operands template (the part that comes before the
3361 vector size itself). X is the value of the vector size operand,
3362 as a polynomial integer rtx; we need to convert this into an "all"
3363 pattern with a multiplier. */
3366 aarch64_output_sve_cnt_immediate (const char *prefix
, const char *operands
,
3369 poly_int64 value
= rtx_to_poly_int64 (x
);
3370 gcc_assert (aarch64_sve_cnt_immediate_p (value
));
3371 return aarch64_output_sve_cnt_immediate (prefix
, operands
, AARCH64_SV_ALL
,
3372 value
.coeffs
[1], 0);
3375 /* Return the asm string for an instruction with a CNT-like vector size
3376 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3377 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3378 first part of the operands template (the part that comes before the
3379 vector size itself). CNT_PAT[0..2] are the operands of the
3380 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
3383 aarch64_output_sve_cnt_pat_immediate (const char *prefix
,
3384 const char *operands
, rtx
*cnt_pat
)
3386 aarch64_svpattern pattern
= (aarch64_svpattern
) INTVAL (cnt_pat
[0]);
3387 unsigned int nelts_per_vq
= INTVAL (cnt_pat
[1]);
3388 unsigned int factor
= INTVAL (cnt_pat
[2]) * nelts_per_vq
;
3389 return aarch64_output_sve_cnt_immediate (prefix
, operands
, pattern
,
3390 factor
, nelts_per_vq
);
/* Return true if we can add X using a single SVE INC or DEC instruction.  */

bool
aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
{
  poly_int64 value;
  return (poly_int_rtx_p (x, &value)
          && (aarch64_sve_cnt_immediate_p (value)
              || aarch64_sve_cnt_immediate_p (-value)));
}

/* Return the asm string for adding SVE INC/DEC immediate OFFSET to
   operand 1 and storing the result in operand 0.  */

char *
aarch64_output_sve_scalar_inc_dec (rtx offset)
{
  poly_int64 offset_value = rtx_to_poly_int64 (offset);
  gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
  if (offset_value.coeffs[1] > 0)
    return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
                                             offset_value.coeffs[1], 0);
  else
    return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
                                             -offset_value.coeffs[1], 0);
}
/* Return true if we can add VALUE to a register using a single ADDVL
   or ADDPL instruction.  */

bool
aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
{
  HOST_WIDE_INT factor = value.coeffs[0];
  if (factor == 0 || value.coeffs[1] != factor)
    return false;
  /* FACTOR counts VG / 2, so a value of 2 is one predicate width
     and a value of 16 is one vector width.  */
  return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
          || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
}
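/* Illustrative note (not from the original sources): ADDVL adds a multiple
   of the vector length and ADDPL a multiple of the predicate length
   (VL / 8), each with an immediate in [-32, 31].  In FACTOR units, an
   offset of 3 vector lengths is FACTOR == 48 and maps to ADDVL #3, while
   an offset of 5 predicate lengths is FACTOR == 10 and maps to ADDPL #5.  */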
/* Likewise for rtx X.  */

bool
aarch64_sve_addvl_addpl_immediate_p (rtx x)
{
  poly_int64 value;
  return (poly_int_rtx_p (x, &value)
          && aarch64_sve_addvl_addpl_immediate_p (value));
}

/* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
   to operand 1 and storing the result in operand 0.  */

char *
aarch64_output_sve_addvl_addpl (rtx offset)
{
  static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
  poly_int64 offset_value = rtx_to_poly_int64 (offset);
  gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));

  int factor = offset_value.coeffs[1];
  if ((factor & 15) == 0)
    snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
  else
    snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
  return buffer;
}
3463 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3464 instruction. If it is, store the number of elements in each vector
3465 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3466 factor in *FACTOR_OUT (if nonnull). */
3469 aarch64_sve_vector_inc_dec_immediate_p (rtx x
, int *factor_out
,
3470 unsigned int *nelts_per_vq_out
)
3475 if (!const_vec_duplicate_p (x
, &elt
)
3476 || !poly_int_rtx_p (elt
, &value
))
3479 unsigned int nelts_per_vq
= 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x
));
3480 if (nelts_per_vq
!= 8 && nelts_per_vq
!= 4 && nelts_per_vq
!= 2)
3481 /* There's no vector INCB. */
3484 HOST_WIDE_INT factor
= value
.coeffs
[0];
3485 if (value
.coeffs
[1] != factor
)
3488 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3489 if ((factor
% nelts_per_vq
) != 0
3490 || !IN_RANGE (abs (factor
), nelts_per_vq
, 16 * nelts_per_vq
))
3494 *factor_out
= factor
;
3495 if (nelts_per_vq_out
)
3496 *nelts_per_vq_out
= nelts_per_vq
;
3500 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3504 aarch64_sve_vector_inc_dec_immediate_p (rtx x
)
3506 return aarch64_sve_vector_inc_dec_immediate_p (x
, NULL
, NULL
);
3509 /* Return the asm template for an SVE vector INC or DEC instruction.
3510 OPERANDS gives the operands before the vector count and X is the
3511 value of the vector count operand itself. */
3514 aarch64_output_sve_vector_inc_dec (const char *operands
, rtx x
)
3517 unsigned int nelts_per_vq
;
3518 if (!aarch64_sve_vector_inc_dec_immediate_p (x
, &factor
, &nelts_per_vq
))
3521 return aarch64_output_sve_cnt_immediate ("dec", operands
, AARCH64_SV_ALL
,
3522 -factor
, nelts_per_vq
);
3524 return aarch64_output_sve_cnt_immediate ("inc", operands
, AARCH64_SV_ALL
,
3525 factor
, nelts_per_vq
);
3529 aarch64_internal_mov_immediate (rtx dest
, rtx imm
, bool generate
,
3530 scalar_int_mode mode
)
3533 unsigned HOST_WIDE_INT val
, val2
, mask
;
3534 int one_match
, zero_match
;
3539 if (aarch64_move_imm (val
, mode
))
3542 emit_insn (gen_rtx_SET (dest
, imm
));
3546 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3547 (with XXXX non-zero). In that case check to see if the move can be done in
3549 val2
= val
& 0xffffffff;
3551 && aarch64_move_imm (val2
, SImode
)
3552 && (((val
>> 32) & 0xffff) == 0 || (val
>> 48) == 0))
3555 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
3557 /* Check if we have to emit a second instruction by checking to see
3558 if any of the upper 32 bits of the original DI mode value is set. */
3562 i
= (val
>> 48) ? 48 : 32;
3565 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
3566 GEN_INT ((val
>> i
) & 0xffff)));
3571 if ((val
>> 32) == 0 || mode
== SImode
)
3575 emit_insn (gen_rtx_SET (dest
, GEN_INT (val
& 0xffff)));
3577 emit_insn (gen_insv_immsi (dest
, GEN_INT (16),
3578 GEN_INT ((val
>> 16) & 0xffff)));
3580 emit_insn (gen_insv_immdi (dest
, GEN_INT (16),
3581 GEN_INT ((val
>> 16) & 0xffff)));
3586 /* Remaining cases are all for DImode. */
3589 zero_match
= ((val
& mask
) == 0) + ((val
& (mask
<< 16)) == 0) +
3590 ((val
& (mask
<< 32)) == 0) + ((val
& (mask
<< 48)) == 0);
3591 one_match
= ((~val
& mask
) == 0) + ((~val
& (mask
<< 16)) == 0) +
3592 ((~val
& (mask
<< 32)) == 0) + ((~val
& (mask
<< 48)) == 0);
3594 if (zero_match
!= 2 && one_match
!= 2)
3596 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3597 For a 64-bit bitmask try whether changing 16 bits to all ones or
3598 zeroes creates a valid bitmask. To check any repeated bitmask,
3599 try using 16 bits from the other 32-bit half of val. */
3601 for (i
= 0; i
< 64; i
+= 16, mask
<<= 16)
3604 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
3607 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
3609 val2
= val2
& ~mask
;
3610 val2
= val2
| (((val2
>> 32) | (val2
<< 32)) & mask
);
3611 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
3618 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
3619 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
3620 GEN_INT ((val
>> i
) & 0xffff)));
3626 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3627 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3628 otherwise skip zero bits. */
3632 val2
= one_match
> zero_match
? ~val
: val
;
3633 i
= (val2
& mask
) != 0 ? 0 : (val2
& (mask
<< 16)) != 0 ? 16 : 32;
3636 emit_insn (gen_rtx_SET (dest
, GEN_INT (one_match
> zero_match
3637 ? (val
| ~(mask
<< i
))
3638 : (val
& (mask
<< i
)))));
3639 for (i
+= 16; i
< 64; i
+= 16)
3641 if ((val2
& (mask
<< i
)) == 0)
3644 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
3645 GEN_INT ((val
>> i
) & 0xffff)));
/* Return whether imm is a 128-bit immediate which is simple enough to
   expand inline.  */
bool
aarch64_mov128_immediate (rtx imm)
{
  if (GET_CODE (imm) == CONST_INT)
    return true;

  gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);

  rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
  rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));

  return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
         + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
}

/* Return the number of temporary registers that aarch64_add_offset_1
   would need to add OFFSET to a register.  */

static unsigned int
aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
{
  return abs_hwi (offset) < 0x1000000 ? 0 : 1;
}
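/* Illustrative note (not from the original sources): offsets smaller than
   2^24 never need a temporary because they can always be added with at
   most two immediate ADD/SUB instructions, each taking a 12-bit immediate
   that is optionally shifted left by 12.  For example, adding 0x123456 can
   be split into an add of 0x456 and an add of 0x123000 (0x123 << 12).  */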
3679 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3680 a non-polynomial OFFSET. MODE is the mode of the addition.
3681 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3682 be set and CFA adjustments added to the generated instructions.
3684 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3685 temporary if register allocation is already complete. This temporary
3686 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3687 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3688 the immediate again.
3690 Since this function may be used to adjust the stack pointer, we must
3691 ensure that it cannot cause transient stack deallocation (for example
3692 by first incrementing SP and then decrementing when adjusting by a
3693 large immediate). */
3696 aarch64_add_offset_1 (scalar_int_mode mode
, rtx dest
,
3697 rtx src
, HOST_WIDE_INT offset
, rtx temp1
,
3698 bool frame_related_p
, bool emit_move_imm
)
3700 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
3701 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
3703 HOST_WIDE_INT moffset
= abs_hwi (offset
);
3708 if (!rtx_equal_p (dest
, src
))
3710 insn
= emit_insn (gen_rtx_SET (dest
, src
));
3711 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3716 /* Single instruction adjustment. */
3717 if (aarch64_uimm12_shift (moffset
))
3719 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (offset
)));
3720 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3724 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3727 a) the offset cannot be loaded by a 16-bit move or
3728 b) there is no spare register into which we can move it. */
3729 if (moffset
< 0x1000000
3730 && ((!temp1
&& !can_create_pseudo_p ())
3731 || !aarch64_move_imm (moffset
, mode
)))
3733 HOST_WIDE_INT low_off
= moffset
& 0xfff;
3735 low_off
= offset
< 0 ? -low_off
: low_off
;
3736 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (low_off
)));
3737 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3738 insn
= emit_insn (gen_add2_insn (dest
, GEN_INT (offset
- low_off
)));
3739 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3743 /* Emit a move immediate if required and an addition/subtraction. */
3746 gcc_assert (temp1
!= NULL_RTX
|| can_create_pseudo_p ());
3747 temp1
= aarch64_force_temporary (mode
, temp1
, GEN_INT (moffset
));
3749 insn
= emit_insn (offset
< 0
3750 ? gen_sub3_insn (dest
, src
, temp1
)
3751 : gen_add3_insn (dest
, src
, temp1
));
3752 if (frame_related_p
)
3754 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3755 rtx adj
= plus_constant (mode
, src
, offset
);
3756 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, gen_rtx_SET (dest
, adj
));
3760 /* Return the number of temporary registers that aarch64_add_offset
3761 would need to move OFFSET into a register or add OFFSET to a register;
3762 ADD_P is true if we want the latter rather than the former. */
3765 aarch64_offset_temporaries (bool add_p
, poly_int64 offset
)
3767 /* This follows the same structure as aarch64_add_offset. */
3768 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
3771 unsigned int count
= 0;
3772 HOST_WIDE_INT factor
= offset
.coeffs
[1];
3773 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
3774 poly_int64
poly_offset (factor
, factor
);
3775 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
3776 /* Need one register for the ADDVL/ADDPL result. */
3778 else if (factor
!= 0)
3780 factor
= abs (factor
);
3781 if (factor
> 16 * (factor
& -factor
))
3782 /* Need one register for the CNT result and one for the multiplication
3783 factor. If necessary, the second temporary can be reused for the
3784 constant part of the offset. */
3786 /* Need one register for the CNT result (which might then
3790 return count
+ aarch64_add_offset_1_temporaries (constant
);
/* If X can be represented as a poly_int64, return the number
   of temporaries that are required to add it to a register.
   Return -1 otherwise.  */

int
aarch64_add_offset_temporaries (rtx x)
{
  poly_int64 offset;
  if (!poly_int_rtx_p (x, &offset))
    return -1;
  return aarch64_offset_temporaries (true, offset);
}
3806 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3807 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3808 be set and CFA adjustments added to the generated instructions.
3810 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3811 temporary if register allocation is already complete. This temporary
3812 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3813 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3814 false to avoid emitting the immediate again.
3816 TEMP2, if nonnull, is a second temporary register that doesn't
3817 overlap either DEST or REG.
3819 Since this function may be used to adjust the stack pointer, we must
3820 ensure that it cannot cause transient stack deallocation (for example
3821 by first incrementing SP and then decrementing when adjusting by a
3822 large immediate). */
3825 aarch64_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
3826 poly_int64 offset
, rtx temp1
, rtx temp2
,
3827 bool frame_related_p
, bool emit_move_imm
= true)
3829 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
3830 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
3831 gcc_assert (temp1
== NULL_RTX
3833 || !reg_overlap_mentioned_p (temp1
, dest
));
3834 gcc_assert (temp2
== NULL_RTX
|| !reg_overlap_mentioned_p (dest
, temp2
));
3836 /* Try using ADDVL or ADDPL to add the whole value. */
3837 if (src
!= const0_rtx
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
3839 rtx offset_rtx
= gen_int_mode (offset
, mode
);
3840 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
3841 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3845 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3846 SVE vector register, over and above the minimum size of 128 bits.
3847 This is equivalent to half the value returned by CNTD with a
3848 vector shape of ALL. */
3849 HOST_WIDE_INT factor
= offset
.coeffs
[1];
3850 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
3852 /* Try using ADDVL or ADDPL to add the VG-based part. */
3853 poly_int64
poly_offset (factor
, factor
);
3854 if (src
!= const0_rtx
3855 && aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
3857 rtx offset_rtx
= gen_int_mode (poly_offset
, mode
);
3858 if (frame_related_p
)
3860 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
3861 RTX_FRAME_RELATED_P (insn
) = true;
3866 rtx addr
= gen_rtx_PLUS (mode
, src
, offset_rtx
);
3867 src
= aarch64_force_temporary (mode
, temp1
, addr
);
3872 /* Otherwise use a CNT-based sequence. */
3873 else if (factor
!= 0)
3875 /* Use a subtraction if we have a negative factor. */
3876 rtx_code code
= PLUS
;
3883 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3884 into the multiplication. */
3888 /* Use a right shift by 1. */
3892 HOST_WIDE_INT low_bit
= factor
& -factor
;
3893 if (factor
<= 16 * low_bit
)
3895 if (factor
> 16 * 8)
3897 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3898 the value with the minimum multiplier and shift it into
3900 int extra_shift
= exact_log2 (low_bit
);
3901 shift
+= extra_shift
;
3902 factor
>>= extra_shift
;
3904 val
= gen_int_mode (poly_int64 (factor
* 2, factor
* 2), mode
);
3908 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
3909 directly, since that should increase the chances of being
3910 able to use a shift and add sequence. If LOW_BIT itself
3911 is out of range, just use CNTD. */
3912 if (low_bit
<= 16 * 8)
3917 val
= gen_int_mode (poly_int64 (low_bit
* 2, low_bit
* 2), mode
);
3918 val
= aarch64_force_temporary (mode
, temp1
, val
);
3920 if (can_create_pseudo_p ())
3922 rtx coeff1
= gen_int_mode (factor
, mode
);
3923 val
= expand_mult (mode
, val
, coeff1
, NULL_RTX
, false, true);
3927 /* Go back to using a negative multiplication factor if we have
3928 no register from which to subtract. */
3929 if (code
== MINUS
&& src
== const0_rtx
)
3934 rtx coeff1
= gen_int_mode (factor
, mode
);
3935 coeff1
= aarch64_force_temporary (mode
, temp2
, coeff1
);
3936 val
= gen_rtx_MULT (mode
, val
, coeff1
);
3942 /* Multiply by 1 << SHIFT. */
3943 val
= aarch64_force_temporary (mode
, temp1
, val
);
3944 val
= gen_rtx_ASHIFT (mode
, val
, GEN_INT (shift
));
3946 else if (shift
== -1)
3949 val
= aarch64_force_temporary (mode
, temp1
, val
);
3950 val
= gen_rtx_ASHIFTRT (mode
, val
, const1_rtx
);
3953 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3954 if (src
!= const0_rtx
)
3956 val
= aarch64_force_temporary (mode
, temp1
, val
);
3957 val
= gen_rtx_fmt_ee (code
, mode
, src
, val
);
3959 else if (code
== MINUS
)
3961 val
= aarch64_force_temporary (mode
, temp1
, val
);
3962 val
= gen_rtx_NEG (mode
, val
);
3965 if (constant
== 0 || frame_related_p
)
3967 rtx_insn
*insn
= emit_insn (gen_rtx_SET (dest
, val
));
3968 if (frame_related_p
)
3970 RTX_FRAME_RELATED_P (insn
) = true;
3971 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
3972 gen_rtx_SET (dest
, plus_constant (Pmode
, src
,
3981 src
= aarch64_force_temporary (mode
, temp1
, val
);
3986 emit_move_imm
= true;
3989 aarch64_add_offset_1 (mode
, dest
, src
, constant
, temp1
,
3990 frame_related_p
, emit_move_imm
);
/* Like aarch64_add_offset, but the offset is given as an rtx rather
   than a poly_int64.  */

void
aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
                          rtx offset_rtx, rtx temp1, rtx temp2)
{
  aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
                      temp1, temp2, false);
}

/* Add DELTA to the stack pointer, marking the instructions frame-related.
   TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
   if TEMP1 already contains abs (DELTA).  */

static inline void
aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
{
  aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
                      temp1, temp2, true, emit_move_imm);
}

/* Subtract DELTA from the stack pointer, marking the instructions
   frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
   if nonnull.  */

static inline void
aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
                bool emit_move_imm = true)
{
  aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
                      temp1, temp2, frame_related_p, emit_move_imm);
}
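/* Illustrative note (not from the original sources): the prologue and
   epilogue code uses these wrappers to allocate and deallocate the frame;
   routing both through aarch64_add_offset means that scalable (SVE) frame
   sizes and large constant sizes share the same handling of temporaries
   and CFA notes.  */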
/* Set DEST to (vec_series BASE STEP).  */

void
aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
{
  machine_mode mode = GET_MODE (dest);
  scalar_mode inner = GET_MODE_INNER (mode);

  /* Each operand can be a register or an immediate in the range [-16, 15].  */
  if (!aarch64_sve_index_immediate_p (base))
    base = force_reg (inner, base);
  if (!aarch64_sve_index_immediate_p (step))
    step = force_reg (inner, step);

  emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
}
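/* Illustrative note (not from the original sources): for SVE this
   typically expands to a single INDEX instruction, e.g. a series with
   base 0 and step 1 in VNx4SImode becomes "index z0.s, #0, #1"; a base or
   step outside the immediate range [-16, 15] is first forced into a
   scalar register, giving forms such as "index z0.s, w1, #1".  */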
4044 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
4045 register of mode MODE. Use TARGET for the result if it's nonnull
4048 The two vector modes must have the same element mode. The behavior
4049 is to duplicate architectural lane N of SRC into architectural lanes
4050 N + I * STEP of the result. On big-endian targets, architectural
4051 lane 0 of an Advanced SIMD vector is the last element of the vector
4052 in memory layout, so for big-endian targets this operation has the
4053 effect of reversing SRC before duplicating it. Callers need to
4054 account for this. */
4057 aarch64_expand_sve_dupq (rtx target
, machine_mode mode
, rtx src
)
4059 machine_mode src_mode
= GET_MODE (src
);
4060 gcc_assert (GET_MODE_INNER (mode
) == GET_MODE_INNER (src_mode
));
4061 insn_code icode
= (BYTES_BIG_ENDIAN
4062 ? code_for_aarch64_vec_duplicate_vq_be (mode
)
4063 : code_for_aarch64_vec_duplicate_vq_le (mode
));
4066 expand_operand ops
[3];
4067 create_output_operand (&ops
[i
++], target
, mode
);
4068 create_output_operand (&ops
[i
++], src
, src_mode
);
4069 if (BYTES_BIG_ENDIAN
)
4071 /* Create a PARALLEL describing the reversal of SRC. */
4072 unsigned int nelts_per_vq
= 128 / GET_MODE_UNIT_BITSIZE (mode
);
4073 rtx sel
= aarch64_gen_stepped_int_parallel (nelts_per_vq
,
4074 nelts_per_vq
- 1, -1);
4075 create_fixed_operand (&ops
[i
++], sel
);
4077 expand_insn (icode
, i
, ops
);
4078 return ops
[0].value
;
/* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
   the memory image into DEST.  Return true on success.  */

static bool
aarch64_expand_sve_ld1rq (rtx dest, rtx src)
{
  src = force_const_mem (GET_MODE (src), src);
  if (!src)
    return false;

  /* Make sure that the address is legitimate.  */
  if (!aarch64_sve_ld1rq_operand_p (src))
    {
      rtx addr = force_reg (Pmode, XEXP (src, 0));
      src = replace_equiv_address (src, addr);
    }

  machine_mode mode = GET_MODE (dest);
  machine_mode pred_mode = aarch64_sve_pred_mode (mode);
  rtx ptrue = aarch64_ptrue_reg (pred_mode);
  emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
  return true;
}
/* Return a register containing CONST_VECTOR SRC, given that SRC has an
   SVE data mode and isn't a legitimate constant.  Use TARGET for the
   result if convenient.

   The returned register can have whatever mode seems most natural
   given the contents of SRC.  */

static rtx
aarch64_expand_sve_const_vector (rtx target, rtx src)
{
  machine_mode mode = GET_MODE (src);
  unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
  unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
  scalar_mode elt_mode = GET_MODE_INNER (mode);
  unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
  unsigned int container_bits = aarch64_sve_container_bits (mode);
  unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;

  if (nelts_per_pattern == 1
      && encoded_bits <= 128
      && container_bits != elt_bits)
    {
      /* We have a partial vector mode and a constant whose full-vector
         equivalent would occupy a repeating 128-bit sequence.  Build that
         full-vector equivalent instead, so that we have the option of
         using LD1RQ and Advanced SIMD operations.  */
      unsigned int repeat = container_bits / elt_bits;
      machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
      rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
      for (unsigned int i = 0; i < npatterns; ++i)
        for (unsigned int j = 0; j < repeat; ++j)
          builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
      target = aarch64_target_reg (target, full_mode);
      return aarch64_expand_sve_const_vector (target, builder.build ());
    }

  if (nelts_per_pattern == 1 && encoded_bits == 128)
    {
      /* The constant is a duplicated quadword but can't be narrowed
         beyond a quadword.  Get the memory image of the first quadword
         as a 128-bit vector and try using LD1RQ to load it from memory.

         The effect for both endiannesses is to load memory lane N into
         architectural lanes N + I * STEP of the result.  On big-endian
         targets, the layout of the 128-bit vector in an Advanced SIMD
         register would be different from its layout in an SVE register,
         but this 128-bit vector is a memory value only.  */
      machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
      rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
      if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
        return target;
    }

  if (nelts_per_pattern == 1 && encoded_bits < 128)
    {
      /* The vector is a repeating sequence of 64 bits or fewer.
         See if we can load them using an Advanced SIMD move and then
         duplicate it to fill a vector.  This is better than using a GPR
         move because it keeps everything in the same register file.  */
      machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
      rtx_vector_builder builder (vq_mode, npatterns, 1);
      for (unsigned int i = 0; i < npatterns; ++i)
        {
          /* We want memory lane N to go into architectural lane N,
             so reverse for big-endian targets.  The DUP .Q pattern
             has a compensating reverse built-in.  */
          unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
          builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
        }
      rtx vq_src = builder.build ();
      if (aarch64_simd_valid_immediate (vq_src, NULL))
        {
          vq_src = force_reg (vq_mode, vq_src);
          return aarch64_expand_sve_dupq (target, mode, vq_src);
        }

      /* Get an integer representation of the repeating part of Advanced
         SIMD vector VQ_SRC.  This preserves the endianness of VQ_SRC,
         which for big-endian targets is lane-swapped wrt a normal
         Advanced SIMD vector.  This means that for both endiannesses,
         memory lane N of SVE vector SRC corresponds to architectural
         lane N of a register holding VQ_SRC.  This in turn means that
         memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
         as a single 128-bit value) and thus that memory lane 0 of SRC is
         in the lsb of the integer.  Duplicating the integer therefore
         ensures that memory lane N of SRC goes into architectural lane
         N + I * INDEX of the SVE register.  */
      scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
      rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
      if (elt_value)
        {
          /* Pretend that we had a vector of INT_MODE to start with.  */
          elt_mode = int_mode;
          mode = aarch64_full_sve_mode (int_mode).require ();

          /* If the integer can be moved into a general register by a
             single instruction, do that and duplicate the result.  */
          if (CONST_INT_P (elt_value)
              && aarch64_move_imm (INTVAL (elt_value), elt_mode))
            {
              elt_value = force_reg (elt_mode, elt_value);
              return expand_vector_broadcast (mode, elt_value);
            }
        }
      else if (npatterns == 1)
        /* We're duplicating a single value, but can't do better than
           force it to memory and load from there.  This handles things
           like symbolic constants.  */
        elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);

      if (elt_value)
        {
          /* Load the element from memory if we can, otherwise move it into
             a register and use a DUP.  */
          rtx op = force_const_mem (elt_mode, elt_value);
          if (!op)
            op = force_reg (elt_mode, elt_value);
          return expand_vector_broadcast (mode, op);
        }
    }

  /* Try using INDEX.  */
  rtx base, step;
  if (const_vec_series_p (src, &base, &step))
    {
      aarch64_expand_vec_series (target, base, step);
      return target;
    }

  /* From here on, it's better to force the whole constant to memory
     if we can.  */
  if (GET_MODE_NUNITS (mode).is_constant ())
    return NULL_RTX;

  /* Expand each pattern individually.  */
  gcc_assert (npatterns > 1);
  rtx_vector_builder builder;
  auto_vec<rtx, 16> vectors (npatterns);
  for (unsigned int i = 0; i < npatterns; ++i)
    {
      builder.new_vector (mode, 1, nelts_per_pattern);
      for (unsigned int j = 0; j < nelts_per_pattern; ++j)
        builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
      vectors.quick_push (force_reg (mode, builder.build ()));
    }

  /* Use permutes to interleave the separate vectors.  */
  while (npatterns > 1)
    {
      npatterns /= 2;
      for (unsigned int i = 0; i < npatterns; ++i)
        {
          rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
          rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
          emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
          vectors[i] = tmp;
        }
    }
  gcc_assert (vectors[0] == target);
  return target;
}
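
/* For example, with NPATTERNS == 2 the loop above zips the vector of
   even-indexed elements { s0, s2, s4, ... } with the vector of
   odd-indexed elements { s1, s3, s5, ... }; ZIP1 interleaves the low
   halves of its inputs, giving { s0, s1, s2, s3, ... } again.  */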
/* Use WHILE to set a predicate register of mode MODE in which the first
   VL bits are set and the rest are clear.  Use TARGET for the register
   if it's nonnull and convenient.  */

static rtx
aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
                                 unsigned int vl)
{
  rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
  target = aarch64_target_reg (target, mode);
  emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
                        target, const0_rtx, limit));
  return target;
}
static rtx
aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
/* BUILDER is a constant predicate in which the index of every set bit
   is a multiple of ELT_SIZE (which is <= 8).  Try to load the constant
   by inverting every element at a multiple of ELT_SIZE and EORing the
   result with an ELT_SIZE PTRUE.

   Return a register that contains the constant on success, otherwise
   return null.  Use TARGET as the register if it is nonnull and
   convenient.  */

static rtx
aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
                                   unsigned int elt_size)
{
  /* Invert every element at a multiple of ELT_SIZE, keeping the
     other bits zero.  */
  rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
                                  builder.nelts_per_pattern ());
  for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
    if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
      inv_builder.quick_push (const1_rtx);
    else
      inv_builder.quick_push (const0_rtx);
  inv_builder.finalize ();

  /* See if we can load the constant cheaply.  */
  rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
  if (!inv)
    return NULL_RTX;

  /* EOR the result with an ELT_SIZE PTRUE.  */
  rtx mask = aarch64_ptrue_all (elt_size);
  mask = force_reg (VNx16BImode, mask);
  target = aarch64_target_reg (target, VNx16BImode);
  emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
  return target;
}
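
/* A standalone sketch (not part of the build) of the identity the EOR
   trick above relies on, using a plain 16-bit mask for an ELT_SIZE == 1
   predicate.  The values are made-up examples.  */
#if 0
#include <assert.h>
#include <stdint.h>

int
main (void)
{
  uint16_t want = 0x00ff;              /* desired predicate bits */
  uint16_t ptrue = 0xffff;             /* an all-ones PTRUE */
  uint16_t inv = (uint16_t) ~want;     /* the inverted constant */
  /* EORing the (hopefully cheaper) inverse with PTRUE recreates WANT.  */
  assert ((uint16_t) (inv ^ ptrue) == want);
  return 0;
}
#endif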
/* BUILDER is a constant predicate in which the index of every set bit
   is a multiple of ELT_SIZE (which is <= 8).  Try to load the constant
   using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE.  Return the
   register on success, otherwise return null.  Use TARGET as the register
   if nonnull and convenient.  */

static rtx
aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
                                   unsigned int elt_size,
                                   unsigned int permute_size)
{
  /* We're going to split the constant into two new constants A and B,
     with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
     and into B otherwise.  E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:

     A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
     B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }

     where _ indicates elements that will be discarded by the permute.

     First calculate the ELT_SIZEs for A and B.  */
  unsigned int a_elt_size = GET_MODE_SIZE (DImode);
  unsigned int b_elt_size = GET_MODE_SIZE (DImode);
  for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
    if (INTVAL (builder.elt (i)) != 0)
      {
        if (i & permute_size)
          b_elt_size |= i - permute_size;
        else
          a_elt_size |= i;
      }
  a_elt_size &= -a_elt_size;
  b_elt_size &= -b_elt_size;

  /* Now construct the vectors themselves.  */
  rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
                                builder.nelts_per_pattern ());
  rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
                                builder.nelts_per_pattern ());
  unsigned int nelts = builder.encoded_nelts ();
  for (unsigned int i = 0; i < nelts; ++i)
    if (i & (elt_size - 1))
      {
        a_builder.quick_push (const0_rtx);
        b_builder.quick_push (const0_rtx);
      }
    else if ((i & permute_size) == 0)
      {
        /* The A and B elements are significant.  */
        a_builder.quick_push (builder.elt (i));
        b_builder.quick_push (builder.elt (i + permute_size));
      }
    else
      {
        /* The A and B elements are going to be discarded, so pick whatever
           is likely to give a nice constant.  We are targeting element
           sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
           with the aim of each being a sequence of ones followed by
           a sequence of zeros.  So:

           * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
             duplicate the last X_ELT_SIZE element, to extend the
             current sequence of ones or zeros.

           * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
             zero, so that the constant really does have X_ELT_SIZE and
             not a smaller size.  */
        if (a_elt_size > permute_size)
          a_builder.quick_push (const0_rtx);
        else
          a_builder.quick_push (a_builder.elt (i - a_elt_size));
        if (b_elt_size > permute_size)
          b_builder.quick_push (const0_rtx);
        else
          b_builder.quick_push (b_builder.elt (i - b_elt_size));
      }
  a_builder.finalize ();
  b_builder.finalize ();

  /* Try loading A into a register.  */
  rtx_insn *last = get_last_insn ();
  rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
  if (!a)
    return NULL_RTX;

  /* Try loading B into a register.  */
  rtx b = a;
  if (a_builder != b_builder)
    {
      b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
      if (!b)
        {
          delete_insns_since (last);
          return NULL_RTX;
        }
    }

  /* Emit the TRN1 itself.  */
  machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
  target = aarch64_target_reg (target, mode);
  emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
                              gen_lowpart (mode, a),
                              gen_lowpart (mode, b)));
  return target;
}
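
/* A standalone sketch (not part of the build) of the "x &= -x" step
   above: ORing together the indices of the significant set bits and
   then keeping only the lowest set bit gives the largest power of two
   dividing all of them.  The values are made-up examples.  */
#if 0
#include <assert.h>

int
main (void)
{
  unsigned int x = 8;   /* seeded with GET_MODE_SIZE (DImode) */
  x |= 4;               /* indices of significant set elements */
  x |= 12;
  x &= -x;              /* isolate the lowest set bit */
  assert (x == 4);
  return 0;
}
#endif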
/* Subroutine of aarch64_expand_sve_const_pred.  Try to load the VNx16BI
   constant in BUILDER into an SVE predicate register.  Return the register
   on success, otherwise return null.  Use TARGET for the register if
   nonnull and convenient.

   ALLOW_RECURSE_P is true if we can use methods that would call this
   function recursively.  */

static rtx
aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
                                 bool allow_recurse_p)
{
  if (builder.encoded_nelts () == 1)
    /* A PFALSE or a PTRUE .B ALL.  */
    return aarch64_emit_set_immediate (target, builder);

  unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
  if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
    {
      /* If we can load the constant using PTRUE, use it as-is.  */
      machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
      if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
        return aarch64_emit_set_immediate (target, builder);

      /* Otherwise use WHILE to set the first VL bits.  */
      return aarch64_sve_move_pred_via_while (target, mode, vl);
    }

  if (!allow_recurse_p)
    return NULL_RTX;

  /* Try inverting the vector in element size ELT_SIZE and then EORing
     the result with an ELT_SIZE PTRUE.  */
  if (INTVAL (builder.elt (0)) == 0)
    if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
                                                     elt_size))
      return res;

  /* Try using TRN1 to permute two simpler constants.  */
  for (unsigned int i = elt_size; i <= 8; i *= 2)
    if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
                                                     elt_size, i))
      return res;

  return NULL_RTX;
}
/* Return an SVE predicate register that contains the VNx16BImode
   constant in BUILDER, without going through the move expanders.

   The returned register can have whatever mode seems most natural
   given the contents of BUILDER.  Use TARGET for the result if
   convenient.  */

static rtx
aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
{
  /* Try loading the constant using pure predicate operations.  */
  if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
    return res;

  /* Try forcing the constant to memory.  */
  if (builder.full_nelts ().is_constant ())
    if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
      {
        target = aarch64_target_reg (target, VNx16BImode);
        emit_move_insn (target, mem);
        return target;
      }

  /* The last resort is to load the constant as an integer and then
     compare it against zero.  Use -1 for set bits in order to increase
     the chances of using SVE DUPM or an Advanced SIMD byte mask.  */
  rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
                                  builder.nelts_per_pattern ());
  for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
    int_builder.quick_push (INTVAL (builder.elt (i))
                            ? constm1_rtx : const0_rtx);
  return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
                                           int_builder.build ());
}
/* Set DEST to immediate IMM.  */

void
aarch64_expand_mov_immediate (rtx dest, rtx imm)
{
  machine_mode mode = GET_MODE (dest);

  /* Check on what type of symbol it is.  */
  scalar_int_mode int_mode;
  if ((GET_CODE (imm) == SYMBOL_REF
       || GET_CODE (imm) == LABEL_REF
       || GET_CODE (imm) == CONST
       || GET_CODE (imm) == CONST_POLY_INT)
      && is_a <scalar_int_mode> (mode, &int_mode))
    {
      rtx mem;
      poly_int64 offset;
      HOST_WIDE_INT const_offset;
      enum aarch64_symbol_type sty;

      /* If we have (const (plus symbol offset)), separate out the offset
         before we start classifying the symbol.  */
      rtx base = strip_offset (imm, &offset);

      /* We must always add an offset involving VL separately, rather than
         folding it into the relocation.  */
      if (!offset.is_constant (&const_offset))
        {
          if (!TARGET_SVE)
            {
              aarch64_report_sve_required ();
              return;
            }
          if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
            emit_insn (gen_rtx_SET (dest, imm));
          else
            {
              /* Do arithmetic on 32-bit values if the result is smaller
                 than that.  */
              if (partial_subreg_p (int_mode, SImode))
                {
                  /* It is invalid to do symbol calculations in modes
                     narrower than SImode.  */
                  gcc_assert (base == const0_rtx);
                  dest = gen_lowpart (SImode, dest);
                  int_mode = SImode;
                }
              if (base != const0_rtx)
                {
                  base = aarch64_force_temporary (int_mode, dest, base);
                  aarch64_add_offset (int_mode, dest, base, offset,
                                      NULL_RTX, NULL_RTX, false);
                }
              else
                aarch64_add_offset (int_mode, dest, base, offset,
                                    dest, NULL_RTX, false);
            }
          return;
        }

      sty = aarch64_classify_symbol (base, const_offset);
      switch (sty)
        {
        case SYMBOL_FORCE_TO_MEM:
          if (const_offset != 0
              && targetm.cannot_force_const_mem (int_mode, imm))
            {
              gcc_assert (can_create_pseudo_p ());
              base = aarch64_force_temporary (int_mode, dest, base);
              aarch64_add_offset (int_mode, dest, base, const_offset,
                                  NULL_RTX, NULL_RTX, false);
              return;
            }

          mem = force_const_mem (ptr_mode, imm);
          gcc_assert (mem);

          /* If we aren't generating PC relative literals, then
             we need to expand the literal pool access carefully.
             This is something that needs to be done in a number
             of places, so could well live as a separate function.  */
          if (!aarch64_pcrelative_literal_loads)
            {
              gcc_assert (can_create_pseudo_p ());
              base = gen_reg_rtx (ptr_mode);
              aarch64_expand_mov_immediate (base, XEXP (mem, 0));
              if (ptr_mode != Pmode)
                base = convert_memory_address (Pmode, base);
              mem = gen_rtx_MEM (ptr_mode, base);
            }

          if (int_mode != ptr_mode)
            mem = gen_rtx_ZERO_EXTEND (int_mode, mem);

          emit_insn (gen_rtx_SET (dest, mem));
          return;

        case SYMBOL_SMALL_TLSGD:
        case SYMBOL_SMALL_TLSDESC:
        case SYMBOL_SMALL_TLSIE:
        case SYMBOL_SMALL_GOT_28K:
        case SYMBOL_SMALL_GOT_4G:
        case SYMBOL_TINY_GOT:
        case SYMBOL_TINY_TLSIE:
          if (const_offset != 0)
            {
              gcc_assert (can_create_pseudo_p ());
              base = aarch64_force_temporary (int_mode, dest, base);
              aarch64_add_offset (int_mode, dest, base, const_offset,
                                  NULL_RTX, NULL_RTX, false);
              return;
            }
          /* FALLTHRU */

        case SYMBOL_SMALL_ABSOLUTE:
        case SYMBOL_TINY_ABSOLUTE:
        case SYMBOL_TLSLE12:
        case SYMBOL_TLSLE24:
        case SYMBOL_TLSLE32:
        case SYMBOL_TLSLE48:
          aarch64_load_symref_appropriately (dest, imm, sty);
          return;

        default:
          gcc_unreachable ();
        }
    }

  if (!CONST_INT_P (imm))
    {
      if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
        {
          /* Only the low bit of each .H, .S and .D element is defined,
             so we can set the upper bits to whatever we like.  If the
             predicate is all-true in MODE, prefer to set all the undefined
             bits as well, so that we can share a single .B predicate for
             all modes.  */
          if (imm == CONSTM1_RTX (mode))
            imm = CONSTM1_RTX (VNx16BImode);

          /* All methods for constructing predicate modes wider than VNx16BI
             will set the upper bits of each element to zero.  Expose this
             by moving such constants as a VNx16BI, so that all bits are
             significant and so that constants for different modes can be
             shared.  The wider constant will still be available as a
             REG_EQUAL note.  */
          rtx_vector_builder builder;
          if (aarch64_get_sve_pred_bits (builder, imm))
            {
              rtx res = aarch64_expand_sve_const_pred (dest, builder);
              if (dest != res)
                emit_move_insn (dest, gen_lowpart (mode, res));
              return;
            }
        }

      if (GET_CODE (imm) == HIGH
          || aarch64_simd_valid_immediate (imm, NULL))
        {
          emit_insn (gen_rtx_SET (dest, imm));
          return;
        }

      if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
        if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
          {
            if (dest != res)
              emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
            return;
          }

      rtx mem = force_const_mem (mode, imm);
      gcc_assert (mem);
      emit_move_insn (dest, mem);
      return;
    }

  aarch64_internal_mov_immediate (dest, imm, true,
                                  as_a <scalar_int_mode> (mode));
}
/* Emit an SVE predicated move from SRC to DEST.  PRED is a predicate
   that is known to contain PTRUE.  */

void
aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
{
  expand_operand ops[3];
  machine_mode mode = GET_MODE (dest);
  create_output_operand (&ops[0], dest, mode);
  create_input_operand (&ops[1], pred, GET_MODE (pred));
  create_input_operand (&ops[2], src, mode);
  temporary_volatile_ok v (true);
  expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
}
/* Expand a pre-RA SVE data move from SRC to DEST in which at least one
   operand is in memory.  In this case we need to use the predicated LD1
   and ST1 instead of LDR and STR, both for correctness on big-endian
   targets and because LD1 and ST1 support a wider range of addressing modes.
   PRED_MODE is the mode of the predicate.

   See the comment at the head of aarch64-sve.md for details about the
   big-endian handling.  */

void
aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
{
  machine_mode mode = GET_MODE (dest);
  rtx ptrue = aarch64_ptrue_reg (pred_mode);
  if (!register_operand (src, mode)
      && !register_operand (dest, mode))
    {
      rtx tmp = gen_reg_rtx (mode);
      if (MEM_P (src))
        aarch64_emit_sve_pred_move (tmp, ptrue, src);
      else
        emit_move_insn (tmp, src);
      src = tmp;
    }
  aarch64_emit_sve_pred_move (dest, ptrue, src);
}
/* Called only on big-endian targets.  See whether an SVE vector move
   from SRC to DEST is effectively a REV[BHW] instruction, because at
   least one operand is a subreg of an SVE vector that has wider or
   narrower elements.  Return true and emit the instruction if so.

   For example:

     (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))

   represents a VIEW_CONVERT between the following vectors, viewed
   in memory order:

     R2: { [0].high, [0].low,  [1].high, [1].low, ... }
     R1: { [0],      [1],      [2],      [3],     ... }

   The high part of lane X in R2 should therefore correspond to lane X*2
   of R1, but the register representations are:

         msb                                      lsb
     R2: ...... [1].high  [1].low   [0].high  [0].low
     R1: ...... [3]       [2]       [1]       [0]

   where the low part of lane X in R2 corresponds to lane X*2 in R1.
   We therefore need a reverse operation to swap the high and low values
   around.

   This is purely an optimization.  Without it we would spill the
   subreg operand to the stack in one mode and reload it in the
   other mode, which has the same effect as the REV.  */

bool
aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
{
  gcc_assert (BYTES_BIG_ENDIAN);
  if (GET_CODE (dest) == SUBREG)
    dest = SUBREG_REG (dest);
  if (GET_CODE (src) == SUBREG)
    src = SUBREG_REG (src);

  /* The optimization handles two single SVE REGs with different element
     sizes.  */
  if (!REG_P (dest)
      || !REG_P (src)
      || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
      || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
      || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
          == GET_MODE_UNIT_SIZE (GET_MODE (src))))
    return false;

  /* Generate *aarch64_sve_mov<mode>_subreg_be.  */
  rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
  rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
                               UNSPEC_REV_SUBREG);
  emit_insn (gen_rtx_SET (dest, unspec));
  return true;
}
/* Return a copy of X with mode MODE, without changing its other
   attributes.  Unlike gen_lowpart, this doesn't care whether the
   mode change is valid.  */

rtx
aarch64_replace_reg_mode (rtx x, machine_mode mode)
{
  if (GET_MODE (x) == mode)
    return x;

  x = shallow_copy_rtx (x);
  set_mode_and_regno (x, mode, REGNO (x));
  return x;
}
/* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
   stored in wider integer containers.  */

static unsigned int
aarch64_sve_rev_unspec (machine_mode mode)
{
  switch (GET_MODE_UNIT_SIZE (mode))
    {
    case 1: return UNSPEC_REVB;
    case 2: return UNSPEC_REVH;
    case 4: return UNSPEC_REVW;
    }
  gcc_unreachable ();
}
/* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
   operands.  */

void
aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
{
  /* Decide which REV operation we need.  The mode with wider elements
     determines the mode of the operands and the mode with the narrower
     elements determines the reverse width.  */
  machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
  machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
  if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
      < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
    std::swap (mode_with_wider_elts, mode_with_narrower_elts);

  unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
  machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);

  /* Get the operands in the appropriate modes and emit the instruction.  */
  ptrue = gen_lowpart (pred_mode, ptrue);
  dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
  src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
  emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
                               dest, ptrue, src));
}
/* Implement TARGET_FUNCTION_OK_FOR_SIBCALL.  */

static bool
aarch64_function_ok_for_sibcall (tree, tree exp)
{
  if (crtl->abi->id () != expr_callee_abi (exp).id ())
    return false;

  return true;
}
/* Implement TARGET_PASS_BY_REFERENCE.  */

static bool
aarch64_pass_by_reference (cumulative_args_t pcum_v,
                           const function_arg_info &arg)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  HOST_WIDE_INT size;
  machine_mode dummymode;
  int nregs;

  unsigned int num_zr, num_pr;
  if (arg.type && aarch64_sve::builtin_type_p (arg.type, &num_zr, &num_pr))
    {
      if (pcum && !pcum->silent_p && !TARGET_SVE)
        /* We can't gracefully recover at this point, so make this a
           fatal error.  */
        fatal_error (input_location, "arguments of type %qT require"
                     " the SVE ISA extension", arg.type);

      /* Variadic SVE types are passed by reference.  Normal non-variadic
         arguments are too if we've run out of registers.  */
      return (!arg.named
              || pcum->aapcs_nvrn + num_zr > NUM_FP_ARG_REGS
              || pcum->aapcs_nprn + num_pr > NUM_PR_ARG_REGS);
    }

  /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
  if (arg.mode == BLKmode && arg.type)
    size = int_size_in_bytes (arg.type);
  else
    /* No frontends can create types with variable-sized modes, so we
       shouldn't be asked to pass or return them.  */
    size = GET_MODE_SIZE (arg.mode).to_constant ();

  /* Aggregates are passed by reference based on their size.  */
  if (arg.aggregate_type_p ())
    size = int_size_in_bytes (arg.type);

  /* Variable-sized arguments are always passed by reference.  */
  if (size < 0)
    return true;

  /* Can this be a candidate to be passed in fp/simd register(s)?  */
  if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
                                               &dummymode, &nregs, NULL))
    return false;

  /* Arguments which are variable sized or larger than 2 registers are
     passed by reference unless they are a homogeneous floating point
     aggregate.  */
  return size > 2 * UNITS_PER_WORD;
}
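
/* For example, a plain 24-byte structure exceeds 2 * UNITS_PER_WORD
   (16 bytes) and is therefore passed by reference, whereas a
   homogeneous aggregate of four doubles is caught by the fp/simd
   candidate check above and stays in registers.  */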
/* Return TRUE if VALTYPE is padded to its least significant bits.  */

static bool
aarch64_return_in_msb (const_tree valtype)
{
  machine_mode dummy_mode;
  int dummy_int;

  /* Never happens in little-endian mode.  */
  if (!BYTES_BIG_ENDIAN)
    return false;

  /* Only composite types smaller than or equal to 16 bytes can
     be potentially returned in registers.  */
  if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
      || int_size_in_bytes (valtype) <= 0
      || int_size_in_bytes (valtype) > 16)
    return false;

  /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
     or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
     is always passed/returned in the least significant bits of fp/simd
     register(s).  */
  if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
                                               &dummy_mode, &dummy_int, NULL))
    return false;

  return true;
}
/* Subroutine of aarch64_function_value.  MODE is the mode of the argument
   after promotion, and after partial SVE types have been replaced by
   their integer equivalents.  */

static rtx
aarch64_function_value_1 (const_tree type, machine_mode mode)
{
  unsigned int num_zr, num_pr;
  if (type && aarch64_sve::builtin_type_p (type, &num_zr, &num_pr))
    {
      /* Don't raise an error here if we're called when SVE is disabled,
         since this is really just a query function.  Other code must
         do that where appropriate.  */
      mode = TYPE_MODE_RAW (type);
      gcc_assert (VECTOR_MODE_P (mode)
                  && (!TARGET_SVE || aarch64_sve_mode_p (mode)));

      if (num_zr > 0 && num_pr == 0)
        return gen_rtx_REG (mode, V0_REGNUM);

      if (num_zr == 0 && num_pr == 1)
        return gen_rtx_REG (mode, P0_REGNUM);

      gcc_unreachable ();
    }

  /* Generic vectors that map to SVE modes with -msve-vector-bits=N are
     returned in memory, not by value.  */
  gcc_assert (!aarch64_sve_mode_p (mode));

  if (aarch64_return_in_msb (type))
    {
      HOST_WIDE_INT size = int_size_in_bytes (type);

      if (size % UNITS_PER_WORD != 0)
        {
          size += UNITS_PER_WORD - size % UNITS_PER_WORD;
          mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
        }
    }

  int count;
  machine_mode ag_mode;
  if (aarch64_vfp_is_call_or_return_candidate (mode, type,
                                               &ag_mode, &count, NULL))
    {
      if (!aarch64_composite_type_p (type, mode))
        {
          gcc_assert (count == 1 && mode == ag_mode);
          return gen_rtx_REG (mode, V0_REGNUM);
        }
      else
        {
          int i;
          rtx par;

          par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
          for (i = 0; i < count; i++)
            {
              rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
              rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
              tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
              XVECEXP (par, 0, i) = tmp;
            }
          return par;
        }
    }
  else
    return gen_rtx_REG (mode, R0_REGNUM);
}
/* Implement TARGET_FUNCTION_VALUE.
   Define how to find the value returned by a function.  */

static rtx
aarch64_function_value (const_tree type, const_tree func,
                        bool outgoing ATTRIBUTE_UNUSED)
{
  machine_mode mode;
  int unsignedp;

  mode = TYPE_MODE (type);
  if (INTEGRAL_TYPE_P (type))
    mode = promote_function_mode (type, mode, &unsignedp, func, 1);

  /* Vector types can acquire a partial SVE mode using things like
     __attribute__((vector_size(N))), and this is potentially useful.
     However, the choice of mode doesn't affect the type's ABI identity,
     so we should treat the types as though they had the associated
     integer mode, just like they did before SVE was introduced.

     We know that the vector must be 128 bits or smaller, otherwise we'd
     have returned it in memory instead.  */
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if ((vec_flags & VEC_ANY_SVE) && (vec_flags & VEC_PARTIAL))
    {
      scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
      rtx reg = aarch64_function_value_1 (type, int_mode);
      /* Vector types are never returned in the MSB and are never split.  */
      gcc_assert (REG_P (reg) && GET_MODE (reg) == int_mode);
      rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
      return gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, pair));
    }

  return aarch64_function_value_1 (type, mode);
}
/* Implements TARGET_FUNCTION_VALUE_REGNO_P.
   Return true if REGNO is the number of a hard register in which the values
   of called function may come back.  */

static bool
aarch64_function_value_regno_p (const unsigned int regno)
{
  /* Maximum of 16 bytes can be returned in the general registers.  Examples
     of 16-byte return values are: 128-bit integers and 16-byte small
     structures (excluding homogeneous floating-point aggregates).  */
  if (regno == R0_REGNUM || regno == R1_REGNUM)
    return true;

  /* Up to four fp/simd registers can return a function value, e.g. a
     homogeneous floating-point aggregate having four members.  */
  if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
    return TARGET_FLOAT;

  return false;
}
/* Implement TARGET_RETURN_IN_MEMORY.

   If the type T of the result of a function is such that
     void func (T arg)
   would require that arg be passed as a value in a register (or set of
   registers) according to the parameter passing rules, then the result
   is returned in the same registers as would be used for such an
   argument.  */

static bool
aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
{
  HOST_WIDE_INT size;
  machine_mode ag_mode;
  int count;

  if (!AGGREGATE_TYPE_P (type)
      && TREE_CODE (type) != COMPLEX_TYPE
      && TREE_CODE (type) != VECTOR_TYPE)
    /* Simple scalar types always returned in registers.  */
    return false;

  unsigned int num_zr, num_pr;
  if (type && aarch64_sve::builtin_type_p (type, &num_zr, &num_pr))
    {
      /* All SVE types we support fit in registers.  For example, it isn't
         yet possible to define an aggregate of 9+ SVE vectors or 5+ SVE
         predicates.  */
      gcc_assert (num_zr <= NUM_FP_ARG_REGS && num_pr <= NUM_PR_ARG_REGS);
      return false;
    }

  if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
                                               &ag_mode, &count, NULL))
    return false;

  /* Types larger than 2 registers returned in memory.  */
  size = int_size_in_bytes (type);
  return (size < 0 || size > 2 * UNITS_PER_WORD);
}
static bool
aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
                               const_tree type, int *nregs)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  return aarch64_vfp_is_call_or_return_candidate (mode, type,
                                                  &pcum->aapcs_vfp_rmode,
                                                  nregs, NULL);
}
/* Given MODE and TYPE of a function argument, return the alignment in
   bits.  The idea is to suppress any stronger alignment requested by
   the user and opt for the natural alignment (specified in AAPCS64 \S
   4.1).  ABI_BREAK is set to true if the alignment was incorrectly
   calculated in versions of GCC prior to GCC-9.  This is a helper
   function for local use only.  */

static unsigned int
aarch64_function_arg_alignment (machine_mode mode, const_tree type,
                                bool *abi_break)
{
  *abi_break = false;
  if (!type)
    return GET_MODE_ALIGNMENT (mode);

  if (integer_zerop (TYPE_SIZE (type)))
    return 0;

  gcc_assert (TYPE_MODE (type) == mode);

  if (!AGGREGATE_TYPE_P (type))
    return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));

  if (TREE_CODE (type) == ARRAY_TYPE)
    return TYPE_ALIGN (TREE_TYPE (type));

  unsigned int alignment = 0;
  unsigned int bitfield_alignment = 0;
  for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
    if (TREE_CODE (field) == FIELD_DECL)
      {
        alignment = std::max (alignment, DECL_ALIGN (field));
        if (DECL_BIT_FIELD_TYPE (field))
          bitfield_alignment
            = std::max (bitfield_alignment,
                        TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
      }

  if (bitfield_alignment > alignment)
    {
      *abi_break = true;
      return bitfield_alignment;
    }

  return alignment;
}
/* Layout a function argument according to the AAPCS64 rules.  The rule
   numbers refer to the rule numbers in the AAPCS64.  ORIG_MODE is the
   mode that was originally given to us by the target hook, whereas the
   mode in ARG might be the result of replacing partial SVE modes with
   the equivalent integer mode.  */

static void
aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg,
                    machine_mode orig_mode)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  tree type = arg.type;
  machine_mode mode = arg.mode;
  int ncrn, nvrn, nregs;
  bool allocate_ncrn, allocate_nvrn;
  HOST_WIDE_INT size;
  bool abi_break;

  /* We need to do this once per argument.  */
  if (pcum->aapcs_arg_processed)
    return;

  /* Vector types can acquire a partial SVE mode using things like
     __attribute__((vector_size(N))), and this is potentially useful.
     However, the choice of mode doesn't affect the type's ABI identity,
     so we should treat the types as though they had the associated
     integer mode, just like they did before SVE was introduced.

     We know that the vector must be 128 bits or smaller, otherwise we'd
     have passed it by reference instead.  */
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if ((vec_flags & VEC_ANY_SVE) && (vec_flags & VEC_PARTIAL))
    {
      function_arg_info tmp_arg = arg;
      tmp_arg.mode = int_mode_for_mode (mode).require ();
      aarch64_layout_arg (pcum_v, tmp_arg, orig_mode);
      if (rtx reg = pcum->aapcs_reg)
        {
          gcc_assert (REG_P (reg) && GET_MODE (reg) == tmp_arg.mode);
          rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
          pcum->aapcs_reg = gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
        }
      return;
    }

  pcum->aapcs_arg_processed = true;

  unsigned int num_zr, num_pr;
  if (type && aarch64_sve::builtin_type_p (type, &num_zr, &num_pr))
    {
      /* The PCS says that it is invalid to pass an SVE value to an
         unprototyped function.  There is no ABI-defined location we
         can return in this case, so we have no real choice but to raise
         an error immediately, even though this is only a query function.  */
      if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
        {
          gcc_assert (!pcum->silent_p);
          error ("SVE type %qT cannot be passed to an unprototyped function",
                 arg.type);
          /* Avoid repeating the message, and avoid tripping the assert
             below.  */
          pcum->pcs_variant = ARM_PCS_SVE;
        }

      /* We would have converted the argument into pass-by-reference
         form if it didn't fit in registers.  */
      pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + num_zr;
      pcum->aapcs_nextnprn = pcum->aapcs_nprn + num_pr;
      gcc_assert (arg.named
                  && pcum->pcs_variant == ARM_PCS_SVE
                  && aarch64_sve_mode_p (mode)
                  && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
                  && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);

      if (num_zr > 0 && num_pr == 0)
        pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + pcum->aapcs_nvrn);
      else if (num_zr == 0 && num_pr == 1)
        pcum->aapcs_reg = gen_rtx_REG (mode, P0_REGNUM + pcum->aapcs_nprn);
      else
        gcc_unreachable ();
      return;
    }

  /* Generic vectors that map to SVE modes with -msve-vector-bits=N are
     passed by reference, not by value.  */
  gcc_assert (!aarch64_sve_mode_p (mode));

  /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
  if (type)
    size = int_size_in_bytes (type);
  else
    /* No frontends can create types with variable-sized modes, so we
       shouldn't be asked to pass or return them.  */
    size = GET_MODE_SIZE (mode).to_constant ();
  size = ROUND_UP (size, UNITS_PER_WORD);

  allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
  allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v, mode, type, &nregs);

  /* allocate_ncrn may be a false positive, but allocate_nvrn is quite
     reliable.  The following code thus handles passing by SIMD/FP
     registers first.  */

  nvrn = pcum->aapcs_nvrn;

  /* C1 - C5 for floating point, homogeneous floating point aggregates (HFA)
     and homogeneous short-vector aggregates (HVA).  */
  if (allocate_nvrn)
    {
      if (!pcum->silent_p && !TARGET_FLOAT)
        aarch64_err_no_fpadvsimd (mode);

      if (nvrn + nregs <= NUM_FP_ARG_REGS)
        {
          pcum->aapcs_nextnvrn = nvrn + nregs;
          if (!aarch64_composite_type_p (type, mode))
            {
              gcc_assert (nregs == 1);
              pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
            }
          else
            {
              rtx par;
              int i;
              par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
              for (i = 0; i < nregs; i++)
                {
                  rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
                                         V0_REGNUM + nvrn + i);
                  rtx offset = gen_int_mode
                    (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
                  tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
                  XVECEXP (par, 0, i) = tmp;
                }
              pcum->aapcs_reg = par;
            }
          return;
        }
      else
        {
          /* C.3 NSRN is set to 8.  */
          pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
          goto on_stack;
        }
    }

  ncrn = pcum->aapcs_ncrn;
  nregs = size / UNITS_PER_WORD;

  /* C6 - C9, though the sign and zero extension semantics are
     handled elsewhere.  This is the case where the argument fits
     entirely in general registers.  */
  if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
    {
      gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);

      /* C.8 if the argument has an alignment of 16 then the NGRN is
         rounded up to the next even number.  */
      if (nregs == 2
          && ncrn % 2
          /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
             comparison is there because for > 16 * BITS_PER_UNIT
             alignment nregs should be > 2 and therefore it should be
             passed by reference rather than value.  */
          && (aarch64_function_arg_alignment (orig_mode, type, &abi_break)
              == 16 * BITS_PER_UNIT))
        {
          if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
            inform (input_location, "parameter passing for argument of type "
                    "%qT changed in GCC 9.1", type);
          ++ncrn;
          gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
        }

      /* NREGS can be 0 when e.g. an empty structure is to be passed.
         A reg is still generated for it, but the caller should be smart
         enough not to use it.  */
      if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
        pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
      else
        {
          rtx par;
          int i;

          par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
          for (i = 0; i < nregs; i++)
            {
              rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
              tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
                                       GEN_INT (i * UNITS_PER_WORD));
              XVECEXP (par, 0, i) = tmp;
            }
          pcum->aapcs_reg = par;
        }

      pcum->aapcs_nextncrn = ncrn + nregs;
      return;
    }

  pcum->aapcs_nextncrn = NUM_ARG_REGS;

  /* The argument is passed on stack; record the needed number of words for
     this argument and align the total size if necessary.  */
on_stack:
  pcum->aapcs_stack_words = size / UNITS_PER_WORD;

  if (aarch64_function_arg_alignment (orig_mode, type, &abi_break)
      == 16 * BITS_PER_UNIT)
    {
      int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
      if (pcum->aapcs_stack_size != new_size)
        {
          if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
            inform (input_location, "parameter passing for argument of type "
                    "%qT changed in GCC 9.1", type);
          pcum->aapcs_stack_size = new_size;
        }
    }
  return;
}
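
/* A standalone sketch (not part of the build) of the size rounding and
   the C.8 "round NGRN up to even" rule used above, on made-up values.  */
#if 0
#include <assert.h>

int
main (void)
{
  int units_per_word = 8;
  int size = 12;                                /* hypothetical argument size */
  size = (size + units_per_word - 1) & ~(units_per_word - 1); /* ROUND_UP */
  int nregs = size / units_per_word;
  assert (size == 16 && nregs == 2);

  /* A 16-byte-aligned two-register argument must start at an even
     general register number, so an odd NGRN is bumped by one.  */
  int ncrn = 3;
  if (nregs == 2 && (ncrn % 2) != 0)
    ++ncrn;
  assert (ncrn == 4);
  return 0;
}
#endif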
/* Implement TARGET_FUNCTION_ARG.  */

static rtx
aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
              || pcum->pcs_variant == ARM_PCS_SIMD
              || pcum->pcs_variant == ARM_PCS_SVE);

  if (arg.end_marker_p ())
    return gen_int_mode (pcum->pcs_variant, DImode);

  aarch64_layout_arg (pcum_v, arg, arg.mode);
  return pcum->aapcs_reg;
}
void
aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
                              const_tree fntype,
                              rtx libname ATTRIBUTE_UNUSED,
                              const_tree fndecl ATTRIBUTE_UNUSED,
                              unsigned n_named ATTRIBUTE_UNUSED,
                              bool silent_p)
{
  pcum->aapcs_ncrn = 0;
  pcum->aapcs_nvrn = 0;
  pcum->aapcs_nprn = 0;
  pcum->aapcs_nextncrn = 0;
  pcum->aapcs_nextnvrn = 0;
  pcum->aapcs_nextnprn = 0;
  if (fntype)
    pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
  else
    pcum->pcs_variant = ARM_PCS_AAPCS64;
  pcum->aapcs_reg = NULL_RTX;
  pcum->aapcs_arg_processed = false;
  pcum->aapcs_stack_words = 0;
  pcum->aapcs_stack_size = 0;
  pcum->silent_p = silent_p;

  if (!silent_p
      && !TARGET_FLOAT
      && fndecl && TREE_PUBLIC (fndecl)
      && fntype && fntype != error_mark_node)
    {
      const_tree type = TREE_TYPE (fntype);
      machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
      int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
      if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
                                                   &mode, &nregs, NULL))
        aarch64_err_no_fpadvsimd (TYPE_MODE (type));
    }

  if (!silent_p
      && !TARGET_SVE
      && pcum->pcs_variant == ARM_PCS_SVE)
    {
      /* We can't gracefully recover at this point, so make this a
         fatal error.  */
      if (fndecl)
        fatal_error (input_location, "%qE requires the SVE ISA extension",
                     fndecl);
      else
        fatal_error (input_location, "calls to functions of type %qT require"
                     " the SVE ISA extension", fntype);
    }
}
static void
aarch64_function_arg_advance (cumulative_args_t pcum_v,
                              const function_arg_info &arg)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  if (pcum->pcs_variant == ARM_PCS_AAPCS64
      || pcum->pcs_variant == ARM_PCS_SIMD
      || pcum->pcs_variant == ARM_PCS_SVE)
    {
      aarch64_layout_arg (pcum_v, arg, arg.mode);
      gcc_assert ((pcum->aapcs_reg != NULL_RTX)
                  != (pcum->aapcs_stack_words != 0));
      pcum->aapcs_arg_processed = false;
      pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
      pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
      pcum->aapcs_nprn = pcum->aapcs_nextnprn;
      pcum->aapcs_stack_size += pcum->aapcs_stack_words;
      pcum->aapcs_stack_words = 0;
      pcum->aapcs_reg = NULL_RTX;
    }
}
bool
aarch64_function_arg_regno_p (unsigned regno)
{
  return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
          || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
}
/* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
   PARM_BOUNDARY bits of alignment, but will be given anything up
   to STACK_BOUNDARY bits if the type requires it.  This makes sure
   that both before and after the layout of each argument, the Next
   Stacked Argument Address (NSAA) will have a minimum alignment of
   8 bytes.  */

static unsigned int
aarch64_function_arg_boundary (machine_mode mode, const_tree type)
{
  bool abi_break;
  unsigned int alignment = aarch64_function_arg_alignment (mode, type,
                                                           &abi_break);
  if (abi_break & warn_psabi)
    inform (input_location, "parameter passing for argument of type "
            "%qT changed in GCC 9.1", type);

  return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
}
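
/* With the definitions in aarch64.h (PARM_BOUNDARY of 64 bits and
   STACK_BOUNDARY of 128 bits), this clamps the reported alignment to
   between 8 and 16 bytes.  */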
/* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE.  */

static fixed_size_mode
aarch64_get_reg_raw_mode (int regno)
{
  if (TARGET_SVE && FP_REGNUM_P (regno))
    /* Don't use the SVE part of the register for __builtin_apply and
       __builtin_return.  The SVE registers aren't used by the normal PCS,
       so using them there would be a waste of time.  The PCS extensions
       for SVE types are fundamentally incompatible with the
       __builtin_return/__builtin_apply interface.  */
    return as_a <fixed_size_mode> (V16QImode);
  return default_get_reg_raw_mode (regno);
}
/* Implement TARGET_FUNCTION_ARG_PADDING.

   Small aggregate types are placed in the lowest memory address.

   The related parameter passing rules are B.4, C.3, C.5 and C.14.  */

static pad_direction
aarch64_function_arg_padding (machine_mode mode, const_tree type)
{
  /* On little-endian targets, the least significant byte of every stack
     argument is passed at the lowest byte address of the stack slot.  */
  if (!BYTES_BIG_ENDIAN)
    return PAD_UPWARD;

  /* Otherwise, integral, floating-point and pointer types are padded downward:
     the least significant byte of a stack argument is passed at the highest
     byte address of the stack slot.  */
  if (type
      ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
         || POINTER_TYPE_P (type))
      : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
    return PAD_DOWNWARD;

  /* Everything else padded upward, i.e. data in first byte of stack slot.  */
  return PAD_UPWARD;
}
/* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).

   It specifies padding for the last (may also be the only)
   element of a block move between registers and memory.  If
   assuming the block is in the memory, padding upward means that
   the last element is padded after its highest significant byte,
   while in downward padding, the last element is padded at its
   least significant byte side.

   Small aggregates and small complex types are always padded
   upwards.

   We don't need to worry about homogeneous floating-point or
   short-vector aggregates; their move is not affected by the
   padding direction determined here.  Regardless of endianness,
   each element of such an aggregate is put in the least
   significant bits of a fp/simd register.

   Return !BYTES_BIG_ENDIAN if the least significant byte of the
   register has useful data, and return the opposite if the most
   significant byte does.  */

bool
aarch64_pad_reg_upward (machine_mode mode, const_tree type,
                        bool first ATTRIBUTE_UNUSED)
{
  /* Small composite types are always padded upward.  */
  if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
    {
      HOST_WIDE_INT size;
      if (type)
        size = int_size_in_bytes (type);
      else
        /* No frontends can create types with variable-sized modes, so we
           shouldn't be asked to pass or return them.  */
        size = GET_MODE_SIZE (mode).to_constant ();
      if (size < 2 * UNITS_PER_WORD)
        return true;
    }

  /* Otherwise, use the default padding.  */
  return !BYTES_BIG_ENDIAN;
}
static scalar_int_mode
aarch64_libgcc_cmp_return_mode (void)
{
  return SImode;
}

#define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)

/* We use the 12-bit shifted immediate arithmetic instructions so values
   must be a multiple of (1 << 12), i.e. 4096.  */
#define ARITH_FACTOR 4096

#if (PROBE_INTERVAL % ARITH_FACTOR) != 0
#error Cannot use simple address calculation for stack probing
#endif

/* The pair of scratch registers used for stack probing.  */
#define PROBE_STACK_FIRST_REG  R9_REGNUM
#define PROBE_STACK_SECOND_REG R10_REGNUM
/* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
   inclusive.  These are offsets from the current stack pointer.  */

static void
aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
{
  HOST_WIDE_INT size;
  if (!poly_size.is_constant (&size))
    {
      sorry ("stack probes for SVE frames");
      return;
    }

  rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);

  /* See the same assertion on PROBE_INTERVAL above.  */
  gcc_assert ((first % ARITH_FACTOR) == 0);

  /* See if we have a constant small number of probes to generate.  If so,
     that's the easy case.  */
  if (size <= PROBE_INTERVAL)
    {
      const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);

      emit_set_insn (reg1,
                     plus_constant (Pmode,
                                    stack_pointer_rtx, -(first + base)));
      emit_stack_probe (plus_constant (Pmode, reg1, base - size));
    }

  /* The run-time loop is made up of 8 insns in the generic case while the
     compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
  else if (size <= 4 * PROBE_INTERVAL)
    {
      HOST_WIDE_INT i, rem;

      emit_set_insn (reg1,
                     plus_constant (Pmode,
                                    stack_pointer_rtx,
                                    -(first + PROBE_INTERVAL)));
      emit_stack_probe (reg1);

      /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
         it exceeds SIZE.  If only two probes are needed, this will not
         generate any code.  Then probe at FIRST + SIZE.  */
      for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
        {
          emit_set_insn (reg1,
                         plus_constant (Pmode, reg1, -PROBE_INTERVAL));
          emit_stack_probe (reg1);
        }

      rem = size - (i - PROBE_INTERVAL);
      if (rem > 256)
        {
          const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);

          emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
          emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
        }
      else
        emit_stack_probe (plus_constant (Pmode, reg1, -rem));
    }

  /* Otherwise, do the same as above, but in a loop.  Note that we must be
     extra careful with variables wrapping around because we might be at
     the very top (or the very bottom) of the address space and we have
     to be able to handle this case properly; in particular, we use an
     equality test for the loop condition.  */
  else
    {
      rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);

      /* Step 1: round SIZE to the previous multiple of the interval.  */

      HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;


      /* Step 2: compute initial and final value of the loop counter.  */

      /* TEST_ADDR = SP + FIRST.  */
      emit_set_insn (reg1,
                     plus_constant (Pmode, stack_pointer_rtx, -first));

      /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
      HOST_WIDE_INT adjustment = - (first + rounded_size);
      if (! aarch64_uimm12_shift (adjustment))
        {
          aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
                                          true, Pmode);
          emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
        }
      else
        emit_set_insn (reg2,
                       plus_constant (Pmode, stack_pointer_rtx, adjustment));

      /* Step 3: the loop

         do
           {
             TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
             probe at TEST_ADDR
           }
         while (TEST_ADDR != LAST_ADDR)

         probes at FIRST + N * PROBE_INTERVAL for values of N from 1
         until it is equal to ROUNDED_SIZE.  */

      emit_insn (gen_probe_stack_range (reg1, reg1, reg2));


      /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
         that SIZE is equal to ROUNDED_SIZE.  */

      if (size != rounded_size)
        {
          HOST_WIDE_INT rem = size - rounded_size;

          if (rem > 256)
            {
              const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);

              emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
              emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
            }
          else
            emit_stack_probe (plus_constant (Pmode, reg2, -rem));
        }
    }

  /* Make sure nothing is scheduled before we are done.  */
  emit_insn (gen_blockage ());
}
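
/* A standalone sketch (not part of the build) of the rounding used by
   the loop case above, with a made-up size and a 4 KiB probe interval.  */
#if 0
#include <assert.h>

int
main (void)
{
  long probe_interval = 4096;                   /* hypothetical interval */
  long size = 10000;                            /* hypothetical frame size */
  long rounded_size = size & -probe_interval;   /* previous multiple */
  long residual = size - rounded_size;          /* probed separately */
  assert (rounded_size == 8192 && residual == 1808);
  return 0;
}
#endif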
/* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
   absolute addresses.  */

const char *
aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
{
  static int labelno = 0;
  char loop_lab[32];
  rtx xops[2];

  ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);

  /* Loop.  */
  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);

  HOST_WIDE_INT stack_clash_probe_interval
    = 1 << param_stack_clash_protection_guard_size;

  /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
  xops[0] = reg1;
  HOST_WIDE_INT interval;
  if (flag_stack_clash_protection)
    interval = stack_clash_probe_interval;
  else
    interval = PROBE_INTERVAL;

  gcc_assert (aarch64_uimm12_shift (interval));
  xops[1] = GEN_INT (interval);

  output_asm_insn ("sub\t%0, %0, %1", xops);

  /* If doing stack clash protection then we probe up by the ABI specified
     amount.  We do this because we're dropping full pages at a time in the
     loop.  But if we're doing non-stack clash probing, probe at SP 0.  */
  if (flag_stack_clash_protection)
    xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
  else
    xops[1] = CONST0_RTX (GET_MODE (xops[1]));

  /* Probe at TEST_ADDR.  If we're inside the loop it is always safe to probe
     by this amount for each iteration.  */
  output_asm_insn ("str\txzr, [%0, %1]", xops);

  /* Test if TEST_ADDR == LAST_ADDR.  */
  xops[1] = reg2;
  output_asm_insn ("cmp\t%0, %1", xops);

  /* Branch.  */
  fputs ("\tb.ne\t", asm_out_file);
  assemble_name_raw (asm_out_file, loop_lab);
  fputc ('\n', asm_out_file);

  return "";
}
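
/* The emitted loop is roughly:

        .LPSRL0:
                sub     x9, x9, #<interval>
                str     xzr, [x9, <offset>]
                cmp     x9, x10
                b.ne    .LPSRL0

   where x9/x10 stand in for REG1/REG2 (PROBE_STACK_FIRST_REG and
   PROBE_STACK_SECOND_REG above), <interval> is PROBE_INTERVAL or the
   stack-clash guard size, and <offset> is 0 or STACK_CLASH_CALLER_GUARD.
   Register numbers and the label are illustrative only.  */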
/* Emit the probe loop for doing stack clash probes and stack adjustments for
   SVE.  This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
   of GUARD_SIZE.  When a probe is emitted it is done at most
   MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
   at most MIN_PROBE_THRESHOLD.  By the end of this function
   BASE = BASE - ADJUSTMENT.  */

const char *
aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
                                      rtx min_probe_threshold, rtx guard_size)
{
  /* This function is not allowed to use any instruction generation function
     like gen_ and friends.  If you do you'll likely ICE during CFG validation,
     so instead emit the code you want using output_asm_insn.  */
  gcc_assert (flag_stack_clash_protection);
  gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
  gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));

  /* The minimum required allocation before the residual requires probing.  */
  HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);

  /* Clamp the value down to the nearest value that can be used with a cmp.  */
  residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
  rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);

  gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
  gcc_assert (aarch64_uimm12_shift (residual_probe_guard));

  static int labelno = 0;
  char loop_start_lab[32];
  char loop_end_lab[32];
  rtx xops[2];

  ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
  ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);

  /* Emit loop start label.  */
  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);

  /* ADJUSTMENT < RESIDUAL_PROBE_GUARD.  */
  xops[0] = adjustment;
  xops[1] = probe_offset_value_rtx;
  output_asm_insn ("cmp\t%0, %1", xops);

  /* Branch to end if not enough adjustment to probe.  */
  fputs ("\tb.lt\t", asm_out_file);
  assemble_name_raw (asm_out_file, loop_end_lab);
  fputc ('\n', asm_out_file);

  /* BASE = BASE - RESIDUAL_PROBE_GUARD.  */
  xops[0] = base;
  xops[1] = probe_offset_value_rtx;
  output_asm_insn ("sub\t%0, %0, %1", xops);

  /* Probe at BASE.  */
  xops[1] = const0_rtx;
  output_asm_insn ("str\txzr, [%0, %1]", xops);

  /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD.  */
  xops[0] = adjustment;
  xops[1] = probe_offset_value_rtx;
  output_asm_insn ("sub\t%0, %0, %1", xops);

  /* Branch to start if still more bytes to allocate.  */
  fputs ("\tb\t", asm_out_file);
  assemble_name_raw (asm_out_file, loop_start_lab);
  fputc ('\n', asm_out_file);

  /* No probe leave.  */
  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);

  /* BASE = BASE - ADJUSTMENT.  */
  xops[0] = base;
  xops[1] = adjustment;
  output_asm_insn ("sub\t%0, %0, %1", xops);
  return "";
}
/* Determine whether a frame chain needs to be generated.  */
static bool
aarch64_needs_frame_chain (void)
{
  /* Force a frame chain for EH returns so the return address is at FP+8.  */
  if (frame_pointer_needed || crtl->calls_eh_return)
    return true;

  /* A leaf function cannot have calls or write LR.  */
  bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);

  /* Don't use a frame chain in leaf functions if leaf frame pointers
     are disabled.  */
  if (flag_omit_leaf_frame_pointer && is_leaf)
    return false;

  return aarch64_use_frame_pointer;
}
/* Mark the registers that need to be saved by the callee and calculate
   the size of the callee-saved registers area and frame record (both FP
   and LR may be omitted).  */
static void
aarch64_layout_frame (void)
{
  poly_int64 offset = 0;
  int regno, last_fp_reg = INVALID_REGNUM;
  machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
  poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
  bool frame_related_fp_reg_p = false;
  aarch64_frame &frame = cfun->machine->frame;

  frame.emit_frame_chain = aarch64_needs_frame_chain ();

  /* Adjust the outgoing arguments size if required.  Keep it in sync with what
     the mid-end is doing.  */
  crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);

#define SLOT_NOT_REQUIRED (-2)
#define SLOT_REQUIRED     (-1)

  frame.wb_candidate1 = INVALID_REGNUM;
  frame.wb_candidate2 = INVALID_REGNUM;
  frame.spare_pred_reg = INVALID_REGNUM;

  /* First mark all the registers that really need to be saved...  */
  for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    frame.reg_offset[regno] = SLOT_NOT_REQUIRED;

  /* ... that includes the eh data registers (if needed)...  */
  if (crtl->calls_eh_return)
    for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
      frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;

  /* ... and any callee saved register that dataflow says is live.  */
  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
    if (df_regs_ever_live_p (regno)
	&& !fixed_regs[regno]
	&& (regno == R30_REGNUM
	    || !crtl->abi->clobbers_full_reg_p (regno)))
      frame.reg_offset[regno] = SLOT_REQUIRED;

  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
    if (df_regs_ever_live_p (regno)
	&& !fixed_regs[regno]
	&& !crtl->abi->clobbers_full_reg_p (regno))
      {
	frame.reg_offset[regno] = SLOT_REQUIRED;
	last_fp_reg = regno;
	if (aarch64_emit_cfi_for_reg_p (regno))
	  frame_related_fp_reg_p = true;
      }

  /* Big-endian SVE frames need a spare predicate register in order
     to save Z8-Z15.  Decide which register they should use.  Prefer
     an unused argument register if possible, so that we don't force P4
     to be saved unnecessarily.  */
  if (frame_related_fp_reg_p
      && crtl->abi->id () == ARM_PCS_SVE
      && BYTES_BIG_ENDIAN)
    {
      bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
      bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
      for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
	if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
	  break;
      gcc_assert (regno <= P7_REGNUM);
      frame.spare_pred_reg = regno;
      df_set_regs_ever_live (regno, true);
    }

  for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
    if (df_regs_ever_live_p (regno)
	&& !fixed_regs[regno]
	&& !crtl->abi->clobbers_full_reg_p (regno))
      frame.reg_offset[regno] = SLOT_REQUIRED;

  /* With stack-clash, LR must be saved in non-leaf functions.  */
  gcc_assert (crtl->is_leaf
	      || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));

  /* Now assign stack slots for the registers.  Start with the predicate
     registers, since predicate LDR and STR have a relatively small
     offset range.  These saves happen below the hard frame pointer.  */
  for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
    if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
      {
	frame.reg_offset[regno] = offset;
	offset += BYTES_PER_SVE_PRED;
      }

  /* We save a maximum of 8 predicate registers, and since vector
     registers are 8 times the size of a predicate register, all the
     saved predicates fit within a single vector.  Doing this also
     rounds the offset to a 128-bit boundary.  */
  if (maybe_ne (offset, 0))
    {
      gcc_assert (known_le (offset, vector_save_size));
      offset = vector_save_size;
    }

  /* If we need to save any SVE vector registers, add them next.  */
  if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
    for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
      if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
	{
	  frame.reg_offset[regno] = offset;
	  offset += vector_save_size;
	}

  /* OFFSET is now the offset of the hard frame pointer from the bottom
     of the callee save area.  */
  bool saves_below_hard_fp_p = maybe_ne (offset, 0);
  frame.below_hard_fp_saved_regs_size = offset;
  if (frame.emit_frame_chain)
    {
      /* FP and LR are placed in the linkage record.  */
      frame.reg_offset[R29_REGNUM] = offset;
      frame.wb_candidate1 = R29_REGNUM;
      frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
      frame.wb_candidate2 = R30_REGNUM;
      offset += 2 * UNITS_PER_WORD;
    }

  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
    if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
      {
	frame.reg_offset[regno] = offset;
	if (frame.wb_candidate1 == INVALID_REGNUM)
	  frame.wb_candidate1 = regno;
	else if (frame.wb_candidate2 == INVALID_REGNUM)
	  frame.wb_candidate2 = regno;
	offset += UNITS_PER_WORD;
      }

  poly_int64 max_int_offset = offset;
  offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
  bool has_align_gap = maybe_ne (offset, max_int_offset);

  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
    if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
      {
	/* If there is an alignment gap between integer and fp callee-saves,
	   allocate the last fp register to it if possible.  */
	if (regno == last_fp_reg
	    && has_align_gap
	    && known_eq (vector_save_size, 8)
	    && multiple_p (offset, 16))
	  {
	    frame.reg_offset[regno] = max_int_offset;
	    continue;
	  }

	frame.reg_offset[regno] = offset;
	if (frame.wb_candidate1 == INVALID_REGNUM)
	  frame.wb_candidate1 = regno;
	else if (frame.wb_candidate2 == INVALID_REGNUM
		 && frame.wb_candidate1 >= V0_REGNUM)
	  frame.wb_candidate2 = regno;
	offset += vector_save_size;
      }

  offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);

  frame.saved_regs_size = offset;

  poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;

  poly_int64 above_outgoing_args
    = aligned_upper_bound (varargs_and_saved_regs_size
			   + get_frame_size (),
			   STACK_BOUNDARY / BITS_PER_UNIT);

  frame.hard_fp_offset
    = above_outgoing_args - frame.below_hard_fp_saved_regs_size;

  /* Both these values are already aligned.  */
  gcc_assert (multiple_p (crtl->outgoing_args_size,
			  STACK_BOUNDARY / BITS_PER_UNIT));
  frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;

  frame.locals_offset = frame.saved_varargs_size;

  frame.initial_adjust = 0;
  frame.final_adjust = 0;
  frame.callee_adjust = 0;
  frame.sve_callee_adjust = 0;
  frame.callee_offset = 0;

  HOST_WIDE_INT max_push_offset = 0;
  if (frame.wb_candidate2 != INVALID_REGNUM)
    max_push_offset = 512;
  else if (frame.wb_candidate1 != INVALID_REGNUM)
    max_push_offset = 256;

  HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
  HOST_WIDE_INT const_saved_regs_size;
  if (frame.frame_size.is_constant (&const_size)
      && const_size < max_push_offset
      && known_eq (frame.hard_fp_offset, const_size))
    {
      /* Simple, small frame with no outgoing arguments:

	 stp reg1, reg2, [sp, -frame_size]!
	 stp reg3, reg4, [sp, 16]  */
      frame.callee_adjust = const_size;
    }
  else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
	   && frame.saved_regs_size.is_constant (&const_saved_regs_size)
	   && const_outgoing_args_size + const_saved_regs_size < 512
	   /* We could handle this case even with outgoing args, provided
	      that the number of args left us with valid offsets for all
	      predicate and vector save slots.  It's such a rare case that
	      it hardly seems worth the effort though.  */
	   && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
	   && !(cfun->calls_alloca
		&& frame.hard_fp_offset.is_constant (&const_fp_offset)
		&& const_fp_offset < max_push_offset))
    {
      /* Frame with small outgoing arguments:

	 sub sp, sp, frame_size
	 stp reg1, reg2, [sp, outgoing_args_size]
	 stp reg3, reg4, [sp, outgoing_args_size + 16]  */
      frame.initial_adjust = frame.frame_size;
      frame.callee_offset = const_outgoing_args_size;
    }
  else if (saves_below_hard_fp_p
	   && known_eq (frame.saved_regs_size,
			frame.below_hard_fp_saved_regs_size))
    {
      /* Frame in which all saves are SVE saves:

	 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
	 save SVE registers relative to SP
	 sub sp, sp, outgoing_args_size  */
      frame.initial_adjust = (frame.hard_fp_offset
			      + frame.below_hard_fp_saved_regs_size);
      frame.final_adjust = crtl->outgoing_args_size;
    }
  else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
	   && const_fp_offset < max_push_offset)
    {
      /* Frame with large outgoing arguments or SVE saves, but with
	 a small local area:

	 stp reg1, reg2, [sp, -hard_fp_offset]!
	 stp reg3, reg4, [sp, 16]
	 [sub sp, sp, below_hard_fp_saved_regs_size]
	 [save SVE registers relative to SP]
	 sub sp, sp, outgoing_args_size  */
      frame.callee_adjust = const_fp_offset;
      frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
      frame.final_adjust = crtl->outgoing_args_size;
    }
  else
    {
      /* Frame with large local area and outgoing arguments or SVE saves,
	 using frame pointer:

	 sub sp, sp, hard_fp_offset
	 stp x29, x30, [sp, 0]
	 stp reg3, reg4, [sp, 16]
	 [sub sp, sp, below_hard_fp_saved_regs_size]
	 [save SVE registers relative to SP]
	 sub sp, sp, outgoing_args_size  */
      frame.initial_adjust = frame.hard_fp_offset;
      frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
      frame.final_adjust = crtl->outgoing_args_size;
    }

  /* Make sure the individual adjustments add up to the full frame size.  */
  gcc_assert (known_eq (frame.initial_adjust
			+ frame.callee_adjust
			+ frame.sve_callee_adjust
			+ frame.final_adjust, frame.frame_size));

  frame.laid_out = true;
}
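
/* Worked example (illustrative only, not part of the layout contract):
   a non-leaf function with 16 bytes of locals, no SVE state and no
   outgoing arguments saves only x29/x30, so saved_regs_size = 16,
   hard_fp_offset = frame_size = 32 < max_push_offset (512), and the
   first case above applies:

	stp	x29, x30, [sp, -32]!

   i.e. callee_adjust = 32 with all other adjustments zero.  */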
/* Return true if the register REGNO is saved on entry to
   the current function.  */
static bool
aarch64_register_saved_on_entry (int regno)
{
  return known_ge (cfun->machine->frame.reg_offset[regno], 0);
}
/* Return the next register up from REGNO up to LIMIT for the callee
   to save.  */
static unsigned
aarch64_next_callee_save (unsigned regno, unsigned limit)
{
  while (regno <= limit && !aarch64_register_saved_on_entry (regno))
    regno++;
  return regno;
}
/* Push the register number REGNO of mode MODE to the stack with write-back
   adjusting the stack by ADJUSTMENT.  */
static void
aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
			   HOST_WIDE_INT adjustment)
{
  rtx base_rtx = stack_pointer_rtx;
  rtx insn, reg, mem;

  reg = gen_rtx_REG (mode, regno);
  mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
			    plus_constant (Pmode, base_rtx, -adjustment));
  mem = gen_frame_mem (mode, mem);

  insn = emit_move_insn (mem, reg);
  RTX_FRAME_RELATED_P (insn) = 1;
}
/* Generate and return an instruction to store the pair of registers
   REG and REG2 of mode MODE to location BASE with write-back adjusting
   the stack location BASE by ADJUSTMENT.  */
static rtx
aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
			  HOST_WIDE_INT adjustment)
{
  switch (mode)
    {
    case E_DImode:
      return gen_storewb_pairdi_di (base, base, reg, reg2,
				    GEN_INT (-adjustment),
				    GEN_INT (UNITS_PER_WORD - adjustment));
    case E_DFmode:
      return gen_storewb_pairdf_di (base, base, reg, reg2,
				    GEN_INT (-adjustment),
				    GEN_INT (UNITS_PER_WORD - adjustment));
    case E_TFmode:
      return gen_storewb_pairtf_di (base, base, reg, reg2,
				    GEN_INT (-adjustment),
				    GEN_INT (UNITS_PER_VREG - adjustment));
    default:
      gcc_unreachable ();
    }
}
/* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
   stack pointer by ADJUSTMENT.  */
static void
aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
{
  rtx_insn *insn;
  machine_mode mode = aarch64_reg_save_mode (regno1);

  if (regno2 == INVALID_REGNUM)
    return aarch64_pushwb_single_reg (mode, regno1, adjustment);

  rtx reg1 = gen_rtx_REG (mode, regno1);
  rtx reg2 = gen_rtx_REG (mode, regno2);

  insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
					      reg2, adjustment));
  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
  RTX_FRAME_RELATED_P (insn) = 1;
}
/* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
   adjusting it by ADJUSTMENT afterwards.  */
static rtx
aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
			 HOST_WIDE_INT adjustment)
{
  switch (mode)
    {
    case E_DImode:
      return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
				   GEN_INT (UNITS_PER_WORD));
    case E_DFmode:
      return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
				   GEN_INT (UNITS_PER_WORD));
    case E_TFmode:
      return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
				   GEN_INT (UNITS_PER_VREG));
    default:
      gcc_unreachable ();
    }
}
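
/* For illustration only: the two write-back helpers above correspond to the
   frame-record push and pop.  With DImode, x29/x30 and an adjustment of 32
   they roughly produce "stp x29, x30, [sp, -32]!" on the store side and
   "ldp x29, x30, [sp], 32" on the load side.  */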
/* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
   afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
   into CFI_OPS.  */
static void
aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
		  rtx *cfi_ops)
{
  machine_mode mode = aarch64_reg_save_mode (regno1);
  rtx reg1 = gen_rtx_REG (mode, regno1);

  *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);

  if (regno2 == INVALID_REGNUM)
    {
      rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
      mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
      emit_move_insn (reg1, gen_frame_mem (mode, mem));
    }
  else
    {
      rtx reg2 = gen_rtx_REG (mode, regno2);
      *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
      emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
					  reg2, adjustment));
    }
}
/* Generate and return a store pair instruction of mode MODE to store
   register REG1 to MEM1 and register REG2 to MEM2.  */
static rtx
aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
			rtx reg2)
{
  switch (mode)
    {
    case E_DImode:
      return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);

    case E_DFmode:
      return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);

    case E_TFmode:
      return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);

    default:
      gcc_unreachable ();
    }
}
/* Generate and return a load pair instruction of mode MODE to load register
   REG1 from MEM1 and register REG2 from MEM2.  */
static rtx
aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
		       rtx mem2)
{
  switch (mode)
    {
    case E_DImode:
      return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);

    case E_DFmode:
      return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);

    case E_TFmode:
      return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);

    default:
      gcc_unreachable ();
    }
}
/* Return TRUE if return address signing should be enabled for the current
   function, otherwise return FALSE.  */
bool
aarch64_return_address_signing_enabled (void)
{
  /* This function should only be called after the frame is laid out.  */
  gcc_assert (cfun->machine->frame.laid_out);

  /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
     if its LR is pushed onto the stack.  */
  return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
	  || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
	      && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
}
/* Return TRUE if the Branch Target Identification Mechanism is enabled.  */
bool
aarch64_bti_enabled (void)
{
  return (aarch64_enable_bti == 1);
}
/* The caller is going to use ST1D or LD1D to save or restore an SVE
   register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
   the range [1, 16] * GET_MODE_SIZE (MODE).  Prepare for this by:

     (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
	 or LD1D address

     (2) setting PRED to a valid predicate register for the ST1D or LD1D,
	 if the variable isn't already nonnull

   (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
   Handle this case using a temporary base register that is suitable for
   all offsets in that range.  Use ANCHOR_REG as this base register if it
   is nonnull, otherwise create a new register and store it in ANCHOR_REG.  */
static inline void
aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
				     rtx &anchor_reg, poly_int64 &offset,
				     rtx &ptrue)
{
  if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
    {
      /* This is the maximum valid offset of the anchor from the base.
	 Lower values would be valid too.  */
      poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
      if (!anchor_reg)
	{
	  anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
	  emit_insn (gen_add3_insn (anchor_reg, base_rtx,
				    gen_int_mode (anchor_offset, Pmode)));
	}
      base_rtx = anchor_reg;
      offset -= anchor_offset;
    }
  if (!ptrue)
    {
      int pred_reg = cfun->machine->frame.spare_pred_reg;
      emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
		      CONSTM1_RTX (VNx16BImode));
      ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
    }
}
/* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
   is saved at BASE + OFFSET.  */
static void
aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
			    rtx base, poly_int64 offset)
{
  rtx mem = gen_frame_mem (GET_MODE (reg),
			   plus_constant (Pmode, base, offset));
  add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
}
/* Emit code to save the callee-saved registers from register number START
   to LIMIT to the stack at the location starting at offset START_OFFSET,
   skipping any write-back candidates if SKIP_WB is true.  HARD_FP_VALID_P
   is true if the hard frame pointer has been set up.  */
static void
aarch64_save_callee_saves (poly_int64 start_offset,
			   unsigned start, unsigned limit, bool skip_wb,
			   bool hard_fp_valid_p)
{
  rtx_insn *insn;
  unsigned regno;
  unsigned regno2;
  rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;

  for (regno = aarch64_next_callee_save (start, limit);
       regno <= limit;
       regno = aarch64_next_callee_save (regno + 1, limit))
    {
      rtx reg, mem;
      poly_int64 offset;
      bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);

      if (skip_wb
	  && (regno == cfun->machine->frame.wb_candidate1
	      || regno == cfun->machine->frame.wb_candidate2))
	continue;

      if (cfun->machine->reg_is_wrapped_separately[regno])
	continue;

      machine_mode mode = aarch64_reg_save_mode (regno);
      reg = gen_rtx_REG (mode, regno);
      offset = start_offset + cfun->machine->frame.reg_offset[regno];
      rtx base_rtx = stack_pointer_rtx;
      poly_int64 sp_offset = offset;

      HOST_WIDE_INT const_offset;
      if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
	aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
					     offset, ptrue);
      else if (GP_REGNUM_P (regno)
	       && (!offset.is_constant (&const_offset) || const_offset >= 512))
	{
	  gcc_assert (known_eq (start_offset, 0));
	  poly_int64 fp_offset
	    = cfun->machine->frame.below_hard_fp_saved_regs_size;
	  if (hard_fp_valid_p)
	    base_rtx = hard_frame_pointer_rtx;
	  else
	    {
	      if (!anchor_reg)
		{
		  anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
		  emit_insn (gen_add3_insn (anchor_reg, base_rtx,
					    gen_int_mode (fp_offset, Pmode)));
		}
	      base_rtx = anchor_reg;
	    }
	  offset -= fp_offset;
	}
      mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
      bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);

      if (!aarch64_sve_mode_p (mode)
	  && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
	  && !cfun->machine->reg_is_wrapped_separately[regno2]
	  && known_eq (GET_MODE_SIZE (mode),
		       cfun->machine->frame.reg_offset[regno2]
		       - cfun->machine->frame.reg_offset[regno]))
	{
	  rtx reg2 = gen_rtx_REG (mode, regno2);
	  rtx mem2;

	  offset += GET_MODE_SIZE (mode);
	  mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
	  insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
						    reg2));

	  /* The first part of a frame-related parallel insn is
	     always assumed to be relevant to the frame
	     calculations; subsequent parts are only
	     frame-related if explicitly marked.  */
	  if (aarch64_emit_cfi_for_reg_p (regno2))
	    {
	      if (need_cfa_note_p)
		aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
					    sp_offset + GET_MODE_SIZE (mode));
	      else
		RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
	    }

	  regno = regno2;
	}
      else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
	{
	  insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
	  need_cfa_note_p = true;
	}
      else if (aarch64_sve_mode_p (mode))
	insn = emit_insn (gen_rtx_SET (mem, reg));
      else
	insn = emit_move_insn (mem, reg);

      RTX_FRAME_RELATED_P (insn) = frame_related_p;
      if (frame_related_p && need_cfa_note_p)
	aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
    }
}
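
/* Note for readers (an observation about the code above, not a new
   invariant): two consecutive callee-save slots of equal-sized registers
   are combined into a single store-pair by the pairing check, which is why
   aarch64_layout_frame assigns offsets in ascending register order.  */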
/* Emit code to restore the callee registers from register number START
   up to and including LIMIT.  Restore from the stack offset START_OFFSET,
   skipping any write-back candidates if SKIP_WB is true.  Write the
   appropriate REG_CFA_RESTORE notes into CFI_OPS.  */
static void
aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
			      unsigned limit, bool skip_wb, rtx *cfi_ops)
{
  unsigned regno;
  unsigned regno2;
  poly_int64 offset;
  rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;

  for (regno = aarch64_next_callee_save (start, limit);
       regno <= limit;
       regno = aarch64_next_callee_save (regno + 1, limit))
    {
      bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
      if (cfun->machine->reg_is_wrapped_separately[regno])
	continue;

      rtx reg, mem;

      if (skip_wb
	  && (regno == cfun->machine->frame.wb_candidate1
	      || regno == cfun->machine->frame.wb_candidate2))
	continue;

      machine_mode mode = aarch64_reg_save_mode (regno);
      reg = gen_rtx_REG (mode, regno);
      offset = start_offset + cfun->machine->frame.reg_offset[regno];
      rtx base_rtx = stack_pointer_rtx;
      if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
	aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
					     offset, ptrue);
      mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));

      if (!aarch64_sve_mode_p (mode)
	  && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
	  && !cfun->machine->reg_is_wrapped_separately[regno2]
	  && known_eq (GET_MODE_SIZE (mode),
		       cfun->machine->frame.reg_offset[regno2]
		       - cfun->machine->frame.reg_offset[regno]))
	{
	  rtx reg2 = gen_rtx_REG (mode, regno2);
	  rtx mem2;

	  offset += GET_MODE_SIZE (mode);
	  mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
	  emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));

	  *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
	  regno = regno2;
	}
      else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
	emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
      else if (aarch64_sve_mode_p (mode))
	emit_insn (gen_rtx_SET (reg, mem));
      else
	emit_move_insn (reg, mem);
      if (frame_related_p)
	*cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
    }
}
/* Return true if OFFSET is a signed 4-bit value multiplied by the size
   of MODE.  */
static inline bool
offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, -8, 7));
}

/* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
   of MODE.  */
static inline bool
offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, 0, 63));
}

/* Return true if OFFSET is a signed 7-bit value multiplied by the size
   of MODE.  */
bool
aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, -64, 63));
}

/* Return true if OFFSET is a signed 9-bit value.  */
bool
aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
				       poly_int64 offset)
{
  HOST_WIDE_INT const_offset;
  return (offset.is_constant (&const_offset)
	  && IN_RANGE (const_offset, -256, 255));
}

/* Return true if OFFSET is a signed 9-bit value multiplied by the size
   of MODE.  */
static inline bool
offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, -256, 255));
}

/* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
   of MODE.  */
static inline bool
offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, 0, 4095));
}
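
/* For example, offset_12bit_unsigned_scaled_p (DImode, 32760) is true,
   since 32760 == 4095 * 8 lies in the unsigned scaled immediate range of a
   single X-register LDR/STR, whereas 32768 (4096 * 8) is out of range and
   would need an anchor register.  */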
/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */
static sbitmap
aarch64_get_separate_components (void)
{
  sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
  bitmap_clear (components);

  /* The registers we need saved to the frame.  */
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if (aarch64_register_saved_on_entry (regno))
      {
	/* Punt on saves and restores that use ST1D and LD1D.  We could
	   try to be smarter, but it would involve making sure that the
	   spare predicate register itself is safe to use at the save
	   and restore points.  Also, when a frame pointer is being used,
	   the slots are often out of reach of ST1D and LD1D anyway.  */
	machine_mode mode = aarch64_reg_save_mode (regno);
	if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
	  continue;

	poly_int64 offset = cfun->machine->frame.reg_offset[regno];

	/* If the register is saved in the first SVE save slot, we use
	   it as a stack probe for -fstack-clash-protection.  */
	if (flag_stack_clash_protection
	    && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
	    && known_eq (offset, 0))
	  continue;

	/* Get the offset relative to the register we'll use.  */
	if (frame_pointer_needed)
	  offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
	else
	  offset += crtl->outgoing_args_size;

	/* Check that we can access the stack slot of the register with one
	   direct load with no adjustments needed.  */
	if (aarch64_sve_mode_p (mode)
	    ? offset_9bit_signed_scaled_p (mode, offset)
	    : offset_12bit_unsigned_scaled_p (mode, offset))
	  bitmap_set_bit (components, regno);
      }

  /* Don't mess with the hard frame pointer.  */
  if (frame_pointer_needed)
    bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);

  /* If the spare predicate register used by big-endian SVE code
     is call-preserved, it must be saved in the main prologue
     before any saves that use it.  */
  if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
    bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);

  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  /* If registers have been chosen to be stored/restored with
     writeback don't interfere with them to avoid having to output explicit
     stack adjustment instructions.  */
  if (reg2 != INVALID_REGNUM)
    bitmap_clear_bit (components, reg2);
  if (reg1 != INVALID_REGNUM)
    bitmap_clear_bit (components, reg1);

  bitmap_clear_bit (components, LR_REGNUM);
  bitmap_clear_bit (components, SP_REGNUM);

  return components;
}
/* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */
static sbitmap
aarch64_components_for_bb (basic_block bb)
{
  bitmap in = DF_LIVE_IN (bb);
  bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
  bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;

  sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
  bitmap_clear (components);

  /* Clobbered registers don't generate values in any meaningful sense,
     since nothing after the clobber can rely on their value.  And we can't
     say that partially-clobbered registers are unconditionally killed,
     because whether they're killed or not depends on the mode of the
     value they're holding.  Thus partially call-clobbered registers
     appear in neither the kill set nor the gen set.

     Check manually for any calls that clobber more of a register than the
     current function can.  */
  function_abi_aggregator callee_abis;
  rtx_insn *insn;
  FOR_BB_INSNS (bb, insn)
    if (CALL_P (insn))
      callee_abis.note_callee_abi (insn_callee_abi (insn));
  HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);

  /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if (!fixed_regs[regno]
	&& !crtl->abi->clobbers_full_reg_p (regno)
	&& (TEST_HARD_REG_BIT (extra_caller_saves, regno)
	    || bitmap_bit_p (in, regno)
	    || bitmap_bit_p (gen, regno)
	    || bitmap_bit_p (kill, regno)))
      {
	bitmap_set_bit (components, regno);

	/* If there is a callee-save at an adjacent offset, add it too
	   to increase the use of LDP/STP.  */
	poly_int64 offset = cfun->machine->frame.reg_offset[regno];
	unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;

	if (regno2 <= LAST_SAVED_REGNUM)
	  {
	    poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
	    if (multiple_p (offset, 16)
		? known_eq (offset + 8, offset2)
		: multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
	      bitmap_set_bit (components, regno2);
	  }
      }

  return components;
}
/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
   Nothing to do for aarch64.  */
static void
aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
{
}
/* Return the next set bit in BMP from START onwards.  Return the total number
   of bits in BMP if no set bit is found at or after START.  */
static unsigned int
aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
{
  unsigned int nbits = SBITMAP_SIZE (bmp);
  if (start == nbits)
    return start;

  gcc_assert (start < nbits);
  for (unsigned int i = start; i < nbits; i++)
    if (bitmap_bit_p (bmp, i))
      return i;

  return nbits;
}
/* Do the work for aarch64_emit_prologue_components and
   aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
   to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
   for these components or the epilogue sequence.  That is, it determines
   whether we should emit stores or loads and what kind of CFA notes to attach
   to the insns.  Otherwise the logic for the two sequences is very
   similar.  */
static void
aarch64_process_components (sbitmap components, bool prologue_p)
{
  rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
			     ? HARD_FRAME_POINTER_REGNUM
			     : STACK_POINTER_REGNUM);

  unsigned last_regno = SBITMAP_SIZE (components);
  unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
  rtx_insn *insn = NULL;

  while (regno != last_regno)
    {
      bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
      machine_mode mode = aarch64_reg_save_mode (regno);

      rtx reg = gen_rtx_REG (mode, regno);
      poly_int64 offset = cfun->machine->frame.reg_offset[regno];
      if (frame_pointer_needed)
	offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
      else
	offset += crtl->outgoing_args_size;

      rtx addr = plus_constant (Pmode, ptr_reg, offset);
      rtx mem = gen_frame_mem (mode, addr);

      rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
      unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
      /* No more registers to handle after REGNO.
	 Emit a single save/restore and exit.  */
      if (regno2 == last_regno)
	{
	  insn = emit_insn (set);
	  if (frame_related_p)
	    {
	      RTX_FRAME_RELATED_P (insn) = 1;
	      if (prologue_p)
		add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
	      else
		add_reg_note (insn, REG_CFA_RESTORE, reg);
	    }
	  break;
	}

      poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
      /* The next register is not of the same class or its offset is not
	 mergeable with the current one into a pair.  */
      if (aarch64_sve_mode_p (mode)
	  || !satisfies_constraint_Ump (mem)
	  || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
	  || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
	  || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
		       GET_MODE_SIZE (mode)))
	{
	  insn = emit_insn (set);
	  if (frame_related_p)
	    {
	      RTX_FRAME_RELATED_P (insn) = 1;
	      if (prologue_p)
		add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
	      else
		add_reg_note (insn, REG_CFA_RESTORE, reg);
	    }

	  regno = regno2;
	  continue;
	}

      bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);

      /* REGNO2 can be saved/restored in a pair with REGNO.  */
      rtx reg2 = gen_rtx_REG (mode, regno2);
      if (frame_pointer_needed)
	offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
      else
	offset2 += crtl->outgoing_args_size;
      rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
      rtx mem2 = gen_frame_mem (mode, addr2);
      rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
			    : gen_rtx_SET (reg2, mem2);

      if (prologue_p)
	insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
      else
	insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));

      if (frame_related_p || frame_related2_p)
	{
	  RTX_FRAME_RELATED_P (insn) = 1;
	  if (prologue_p)
	    {
	      if (frame_related_p)
		add_reg_note (insn, REG_CFA_OFFSET, set);
	      if (frame_related2_p)
		add_reg_note (insn, REG_CFA_OFFSET, set2);
	    }
	  else
	    {
	      if (frame_related_p)
		add_reg_note (insn, REG_CFA_RESTORE, reg);
	      if (frame_related2_p)
		add_reg_note (insn, REG_CFA_RESTORE, reg2);
	    }
	}

      regno = aarch64_get_next_set_bit (components, regno2 + 1);
    }
}
/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */
static void
aarch64_emit_prologue_components (sbitmap components)
{
  aarch64_process_components (components, true);
}

/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */
static void
aarch64_emit_epilogue_components (sbitmap components)
{
  aarch64_process_components (components, false);
}

/* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */
static void
aarch64_set_handled_components (sbitmap components)
{
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if (bitmap_bit_p (components, regno))
      cfun->machine->reg_is_wrapped_separately[regno] = true;
}
/* On AArch64 we have an ABI defined safe buffer.  This constant is used to
   determine the probe offset for alloca.  */

static HOST_WIDE_INT
aarch64_stack_clash_protection_alloca_probe_range (void)
{
  return STACK_CLASH_CALLER_GUARD;
}
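
/* STACK_CLASH_CALLER_GUARD is 1kB on AArch64, i.e. the same "ABI specified
   buffer" for outgoing arguments that the frame-layout comment further down
   describes; the mid-end uses this hook to place alloca probes within that
   buffer.  */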
/* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
   registers.  If POLY_SIZE is not large enough to require a probe this function
   will only adjust the stack.  When allocating the stack space
   FRAME_RELATED_P is then used to indicate if the allocation is frame related.
   FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
   arguments.  If we are then we ensure that any allocation larger than the ABI
   defined buffer needs a probe so that the invariant of having a 1KB buffer is
   maintained.

   We emit barriers after each stack adjustment to prevent optimizations from
   breaking the invariant that we never drop the stack more than a page.  This
   invariant is needed to make it easier to correctly handle asynchronous
   events, e.g. if we were to allow the stack to be dropped by more than a page
   and then have multiple probes up and we take a signal somewhere in between
   then the signal handler doesn't know the state of the stack and can make no
   assumptions about which pages have been probed.  */
static void
aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
					poly_int64 poly_size,
					bool frame_related_p,
					bool final_adjustment_p)
{
  HOST_WIDE_INT guard_size
    = 1 << param_stack_clash_protection_guard_size;
  HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
  HOST_WIDE_INT min_probe_threshold
    = (final_adjustment_p
       ? guard_used_by_caller
       : guard_size - guard_used_by_caller);
  /* When doing the final adjustment for the outgoing arguments, take into
     account any unprobed space there is above the current SP.  There are
     two cases:

     - When saving SVE registers below the hard frame pointer, we force
       the lowest save to take place in the prologue before doing the final
       adjustment (i.e. we don't allow the save to be shrink-wrapped).
       This acts as a probe at SP, so there is no unprobed space.

     - When there are no SVE register saves, we use the store of the link
       register as a probe.  We can't assume that LR was saved at position 0
       though, so treat any space below it as unprobed.  */
  if (final_adjustment_p
      && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
    {
      poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
      if (known_ge (lr_offset, 0))
	min_probe_threshold -= lr_offset.to_constant ();
      else
	gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
    }

  poly_int64 frame_size = cfun->machine->frame.frame_size;

  /* We should always have a positive probe threshold.  */
  gcc_assert (min_probe_threshold > 0);

  if (flag_stack_clash_protection && !final_adjustment_p)
    {
      poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
      poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
      poly_int64 final_adjust = cfun->machine->frame.final_adjust;

      if (known_eq (frame_size, 0))
	dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
      else if (known_lt (initial_adjust + sve_callee_adjust,
			 guard_size - guard_used_by_caller)
	       && known_lt (final_adjust, guard_used_by_caller))
	dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
    }

  /* If SIZE is not large enough to require probing, just adjust the stack and
     exit.  */
  if (known_lt (poly_size, min_probe_threshold)
      || !flag_stack_clash_protection)
    {
      aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
      return;
    }

  HOST_WIDE_INT size;
  /* Handle the SVE non-constant case first.  */
  if (!poly_size.is_constant (&size))
    {
      if (dump_file)
	{
	  fprintf (dump_file, "Stack clash SVE prologue: ");
	  print_dec (poly_size, dump_file);
	  fprintf (dump_file, " bytes, dynamic probing will be required.\n");
	}

      /* First calculate the amount of bytes we're actually spilling.  */
      aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
			  poly_size, temp1, temp2, false, true);

      rtx_insn *insn = get_last_insn ();

      if (frame_related_p)
	{
	  /* This is done to provide unwinding information for the stack
	     adjustments we're about to do, however to prevent the optimizers
	     from removing the R11 move and leaving the CFA note (which would be
	     very wrong) we tie the old and new stack pointer together.
	     The tie will expand to nothing but the optimizers will not touch
	     the instruction.  */
	  rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
	  emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
	  emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));

	  /* We want the CFA independent of the stack pointer for the
	     duration of the loop.  */
	  add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
	  RTX_FRAME_RELATED_P (insn) = 1;
	}

      rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
      rtx guard_const = gen_int_mode (guard_size, Pmode);

      insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
						   stack_pointer_rtx, temp1,
						   probe_const, guard_const));

      /* Now reset the CFA register if needed.  */
      if (frame_related_p)
	{
	  add_reg_note (insn, REG_CFA_DEF_CFA,
			gen_rtx_PLUS (Pmode, stack_pointer_rtx,
				      gen_int_mode (poly_size, Pmode)));
	  RTX_FRAME_RELATED_P (insn) = 1;
	}

      return;
    }

  if (dump_file)
    fprintf (dump_file,
	     "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
	     " bytes, probing will be required.\n", size);

  /* Round size to the nearest multiple of guard_size, and calculate the
     residual as the difference between the original size and the rounded
     size.  */
  HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
  HOST_WIDE_INT residual = size - rounded_size;

  /* We can handle a small number of allocations/probes inline.  Otherwise
     punt and emit a loop to probe.  */
  if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
    {
      for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
	{
	  aarch64_sub_sp (NULL, temp2, guard_size, true);
	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
					   guard_used_by_caller));
	  emit_insn (gen_blockage ());
	}
      dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
    }
  else
    {
      /* Compute the ending address.  */
      aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
			  temp1, NULL, false, true);
      rtx_insn *insn = get_last_insn ();

      /* For the initial allocation, we don't have a frame pointer
	 set up, so we always need CFI notes.  If we're doing the
	 final allocation, then we may have a frame pointer, in which
	 case it is the CFA, otherwise we need CFI notes.

	 We can determine which allocation we are doing by looking at
	 the value of FRAME_RELATED_P since the final allocations are not
	 frame related.  */
      if (frame_related_p)
	{
	  /* We want the CFA independent of the stack pointer for the
	     duration of the loop.  */
	  add_reg_note (insn, REG_CFA_DEF_CFA,
			plus_constant (Pmode, temp1, rounded_size));
	  RTX_FRAME_RELATED_P (insn) = 1;
	}

      /* This allocates and probes the stack.  Note that this re-uses some of
	 the existing Ada stack protection code.  However we are guaranteed not
	 to enter the non loop or residual branches of that code.

	 The non-loop part won't be entered because if our allocation amount
	 doesn't require a loop, the case above would handle it.

	 The residual amount won't be entered because TEMP1 is a multiple of
	 the allocation size.  The residual will always be 0.  As such, the only
	 part we are actually using from that code is the loop setup.  The
	 actual probing is done in aarch64_output_probe_stack_range.  */
      insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
					       stack_pointer_rtx, temp1));

      /* Now reset the CFA register if needed.  */
      if (frame_related_p)
	{
	  add_reg_note (insn, REG_CFA_DEF_CFA,
			plus_constant (Pmode, stack_pointer_rtx, rounded_size));
	  RTX_FRAME_RELATED_P (insn) = 1;
	}

      emit_insn (gen_blockage ());
      dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
    }

  /* Handle any residuals.  Residuals of at least MIN_PROBE_THRESHOLD have to
     be probed.  This maintains the requirement that each page is probed at
     least once.  For initial probing we probe only if the allocation is
     more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
     if the amount is larger than buffer.  GUARD_SIZE - buffer + buffer ==
     GUARD_SIZE.  This works such that for any allocation that is large enough
     to trigger a probe here, we'll have at least one, and if they're not large
     enough for this code to emit anything for them, the page would have been
     probed by the saving of FP/LR either by this function or any callees.  If
     we don't have any callees then we won't have more stack adjustments and so
     are still safe.  */
  if (residual)
    {
      HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
      /* If we're doing final adjustments, and we've done any full page
	 allocations then any residual needs to be probed.  */
      if (final_adjustment_p && rounded_size != 0)
	min_probe_threshold = 0;
      /* If doing a small final adjustment, we always probe at offset 0.
	 This is done to avoid issues when LR is not at position 0 or when
	 the final adjustment is smaller than the probing offset.  */
      else if (final_adjustment_p && rounded_size == 0)
	residual_probe_offset = 0;

      aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
      if (residual >= min_probe_threshold)
	{
	  if (dump_file)
	    fprintf (dump_file,
		     "Stack clash AArch64 prologue residuals: "
		     HOST_WIDE_INT_PRINT_DEC
		     " bytes, probing will be required.\n", residual);

	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
					   residual_probe_offset));
	  emit_insn (gen_blockage ());
	}
    }
}
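
/* Worked example (illustrative only): with a 64kB guard, a constant 150kB
   initial allocation has rounded_size = 128kB and residual = 22kB.  128kB
   is within STACK_CLASH_MAX_UNROLL_PAGES pages, so it is emitted as two
   inline 64kB adjustments, each followed by a probe 1kB above the new SP.
   The 22kB residual is below min_probe_threshold for the initial
   allocation, so it relies on the later FP/LR save to act as its probe.  */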
/* Return 1 if the register is used by the epilogue.  We need to say the
   return register is used, but only after epilogue generation is complete.
   Note that in the case of sibcalls, the values "used by the epilogue" are
   considered live at the start of the called function.

   For SIMD functions we need to return 1 for FP registers that are saved and
   restored by a function but are not zero in call_used_regs.  If we do not do
   this, optimizations may remove the restore of the register.  */
int
aarch64_epilogue_uses (int regno)
{
  if (epilogue_completed)
    {
      if (regno == LR_REGNUM)
	return 1;
    }
  return 0;
}
/* AArch64 stack frames generated by this compiler look like:

	+-------------------------------+
	|                               |
	|  incoming stack arguments     |
	|                               |
	+-------------------------------+
	|                               | <-- incoming stack pointer (aligned)
	|  callee-allocated save area   |
	|  for register varargs         |
	|                               |
	+-------------------------------+
	|  local variables              | <-- frame_pointer_rtx
	|                               |
	+-------------------------------+
	|  padding                      | \
	+-------------------------------+  |
	|  callee-saved registers       |  | frame.saved_regs_size
	+-------------------------------+  |
	|  LR'                          |  |
	+-------------------------------+  |
	|  FP'                          |  |
	+-------------------------------+  |<- hard_frame_pointer_rtx (aligned)
	|  SVE vector registers         |  | \
	+-------------------------------+  | | below_hard_fp_saved_regs_size
	|  SVE predicate registers      | /  /
	+-------------------------------+
	|  dynamic allocation           |
	+-------------------------------+
	|  padding                      |
	+-------------------------------+
	|  outgoing stack arguments     | <-- arg_pointer
	|                               |
	+-------------------------------+
	|                               | <-- stack_pointer_rtx (aligned)

   Dynamic stack allocations via alloca() decrease stack_pointer_rtx
   but leave frame_pointer_rtx and hard_frame_pointer_rtx
   unchanged.

   By default for stack-clash we assume the guard is at least 64KB, but this
   value is configurable to either 4KB or 64KB.  We also force the guard size to
   be the same as the probing interval and both values are kept in sync.

   With those assumptions the callee can allocate up to 63KB (or 3KB depending
   on the guard size) of stack space without probing.

   When probing is needed, we emit a probe at the start of the prologue
   and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.

   We have to track how much space has been allocated and the only stores
   to the stack we track as implicit probes are the FP/LR stores.

   For outgoing arguments we probe if the size is larger than 1KB, such that
   the ABI specified buffer is maintained for the next callee.

   The following registers are reserved during frame layout and should not be
   used for any other purpose:

   - r11: Used by stack clash protection when SVE is enabled, and also
	  as an anchor register when saving and restoring registers
   - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
   - r14 and r15: Used for speculation tracking.
   - r16(IP0), r17(IP1): Used by indirect tailcalls.
   - r30(LR), r29(FP): Used by standard frame layout.

   These registers must be avoided in frame layout related code unless the
   explicit intention is to interact with one of the features listed above.  */
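
/* For illustration only: a typical small non-SVE prologue generated from the
   layout above (callee_adjust case, one extra callee save, no outgoing
   arguments) looks roughly like:

	stp	x29, x30, [sp, -32]!
	mov	x29, sp
	str	x19, [sp, 16]

   Larger frames instead use the initial_adjust/final_adjust split chosen in
   aarch64_layout_frame.  */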
/* Generate the prologue instructions for entry into a function.
   Establish the stack frame by decreasing the stack pointer with a
   properly calculated size and, if necessary, create a frame record
   filled with the values of LR and previous frame pointer.  The
   current FP is also set up if it is in use.  */
void
aarch64_expand_prologue (void)
{
  poly_int64 frame_size = cfun->machine->frame.frame_size;
  poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
  poly_int64 final_adjust = cfun->machine->frame.final_adjust;
  poly_int64 callee_offset = cfun->machine->frame.callee_offset;
  poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
  poly_int64 below_hard_fp_saved_regs_size
    = cfun->machine->frame.below_hard_fp_saved_regs_size;
  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
  rtx_insn *insn;

  if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
    {
      /* Fold the SVE allocation into the initial allocation.
	 We don't do this in aarch64_layout_arg to avoid pessimizing
	 the epilogue code.  */
      initial_adjust += sve_callee_adjust;
      sve_callee_adjust = 0;
    }

  /* Sign return address for functions.  */
  if (aarch64_return_address_signing_enabled ())
    {
      switch (aarch64_ra_sign_key)
	{
	case AARCH64_KEY_A:
	  insn = emit_insn (gen_paciasp ());
	  break;
	case AARCH64_KEY_B:
	  insn = emit_insn (gen_pacibsp ());
	  break;
	default:
	  gcc_unreachable ();
	}
      add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  if (flag_stack_usage_info)
    current_function_static_stack_size = constant_lower_bound (frame_size);

  if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
    {
      if (crtl->is_leaf && !cfun->calls_alloca)
	{
	  if (maybe_gt (frame_size, PROBE_INTERVAL)
	      && maybe_gt (frame_size, get_stack_check_protect ()))
	    aarch64_emit_probe_stack_range (get_stack_check_protect (),
					    (frame_size
					     - get_stack_check_protect ()));
	}
      else if (maybe_gt (frame_size, 0))
	aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
    }

  rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
  rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);

  /* In theory we should never have both an initial adjustment
     and a callee save adjustment.  Verify that is the case since the
     code below does not handle it for -fstack-clash-protection.  */
  gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);

  /* Will only probe if the initial adjustment is larger than the guard
     less the amount of the guard reserved for use by the caller's
     outgoing args.  */
  aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
					  true, false);

  if (callee_adjust != 0)
    aarch64_push_regs (reg1, reg2, callee_adjust);

  /* The offset of the frame chain record (if any) from the current SP.  */
  poly_int64 chain_offset = (initial_adjust + callee_adjust
			     - cfun->machine->frame.hard_fp_offset);
  gcc_assert (known_ge (chain_offset, 0));

  /* The offset of the bottom of the save area from the current SP.  */
  poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;

  if (emit_frame_chain)
    {
      if (callee_adjust == 0)
	{
	  reg1 = R29_REGNUM;
	  reg2 = R30_REGNUM;
	  aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
				     false, false);
	}
      else
	gcc_assert (known_eq (chain_offset, 0));
      aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
			  stack_pointer_rtx, chain_offset,
			  tmp1_rtx, tmp0_rtx, frame_pointer_needed);
      if (frame_pointer_needed && !frame_size.is_constant ())
	{
	  /* Variable-sized frames need to describe the save slot
	     address using DW_CFA_expression rather than DW_CFA_offset.
	     This means that, without taking further action, the
	     locations of the registers that we've already saved would
	     remain based on the stack pointer even after we redefine
	     the CFA based on the frame pointer.  We therefore need new
	     DW_CFA_expressions to re-express the save slots with addresses
	     based on the frame pointer.  */
	  rtx_insn *insn = get_last_insn ();
	  gcc_assert (RTX_FRAME_RELATED_P (insn));

	  /* Add an explicit CFA definition if this was previously
	     implicit.  */
	  if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
	    {
	      rtx src = plus_constant (Pmode, stack_pointer_rtx,
				       chain_offset);
	      add_reg_note (insn, REG_CFA_ADJUST_CFA,
			    gen_rtx_SET (hard_frame_pointer_rtx, src));
	    }

	  /* Change the save slot expressions for the registers that
	     we've already saved.  */
	  aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
				      hard_frame_pointer_rtx, UNITS_PER_WORD);
	  aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
				      hard_frame_pointer_rtx, 0);
	}
      emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
    }

  aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
			     callee_adjust != 0 || emit_frame_chain,
			     emit_frame_chain);
  if (maybe_ne (sve_callee_adjust, 0))
    {
      gcc_assert (!flag_stack_clash_protection
		  || known_eq (initial_adjust, 0));
      aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
					      sve_callee_adjust,
					      !frame_pointer_needed, false);
      saved_regs_offset += sve_callee_adjust;
    }
  aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
			     false, emit_frame_chain);
  aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
			     callee_adjust != 0 || emit_frame_chain,
			     emit_frame_chain);

  /* We may need to probe the final adjustment if it is larger than the guard
     that is assumed by the callee.  */
  aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
					  !frame_pointer_needed, true);
}
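
/* Summary of the constraints documented above (not an additional rule):
   for -fstack-clash-protection the GP saves, and in particular the LR save,
   act as the implicit probe for the initial allocation, while any SVE
   allocation is probed separately before the predicate and vector saves
   that live below the hard frame pointer.  */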
/* Return TRUE if we can use a simple_return insn.

   This function checks whether the callee-saved stack is empty, which
   means no restore actions are needed.  The pro_and_epilogue pass will use
   this to check whether shrink-wrapping is feasible.  */
bool
aarch64_use_return_insn_p (void)
{
  if (!reload_completed)
    return false;

  return known_eq (cfun->machine->frame.frame_size, 0);
}
/* Generate the epilogue instructions for returning from a function.
   This is almost exactly the reverse of the prologue sequence, except
   that we need to insert barriers to avoid scheduling loads that read
   from a deallocated stack, and we optimize the unwind records by
   emitting them all together if possible.  */
void
aarch64_expand_epilogue (bool for_sibcall)
{
  poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
  poly_int64 final_adjust = cfun->machine->frame.final_adjust;
  poly_int64 callee_offset = cfun->machine->frame.callee_offset;
  poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
  poly_int64 below_hard_fp_saved_regs_size
    = cfun->machine->frame.below_hard_fp_saved_regs_size;
  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  rtx cfi_ops = NULL;
  rtx_insn *insn;

  /* A stack clash protection prologue may not have left EP0_REGNUM or
     EP1_REGNUM in a usable state.  The same is true for allocations
     with an SVE component, since we then need both temporary registers
     for each allocation.  For stack clash we are in a usable state if
     the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER.  */
  HOST_WIDE_INT guard_size
    = 1 << param_stack_clash_protection_guard_size;
  HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;

  /* We can re-use the registers when:

     (a) the deallocation amount is the same as the corresponding
	 allocation amount (which is false if we combine the initial
	 and SVE callee save allocations in the prologue); and

     (b) the allocation amount doesn't need a probe (which is false
	 if the amount is guard_size - guard_used_by_caller or greater).

     In such situations the register should remain live with the correct
     value.  */
  bool can_inherit_p = (initial_adjust.is_constant ()
			&& final_adjust.is_constant ()
			&& (!flag_stack_clash_protection
			    || (known_lt (initial_adjust,
					  guard_size - guard_used_by_caller)
				&& known_eq (sve_callee_adjust, 0))));

  /* We need to add memory barrier to prevent reads from deallocated stack.  */
  bool need_barrier_p
    = maybe_ne (get_frame_size ()
		+ cfun->machine->frame.saved_varargs_size, 0);

  /* Emit a barrier to prevent loads from a deallocated stack.  */
  if (maybe_gt (final_adjust, crtl->outgoing_args_size)
      || cfun->calls_alloca
      || crtl->calls_eh_return)
    {
      emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
      need_barrier_p = false;
    }

  /* Restore the stack pointer from the frame pointer if it may not
     be the same as the stack pointer.  */
  rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
  rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
  if (frame_pointer_needed
      && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
    /* If writeback is used when restoring callee-saves, the CFA
       is restored on the instruction doing the writeback.  */
    aarch64_add_offset (Pmode, stack_pointer_rtx,
			hard_frame_pointer_rtx,
			-callee_offset - below_hard_fp_saved_regs_size,
			tmp1_rtx, tmp0_rtx, callee_adjust == 0);
  else
    /* The case where we need to re-use the register here is very rare, so
       avoid the complicated condition and just always emit a move if the
       immediate doesn't fit.  */
    aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);

  /* Restore the vector registers before the predicate registers,
     so that we can use P4 as a temporary for big-endian SVE frames.  */
  aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
				callee_adjust != 0, &cfi_ops);
  aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
				false, &cfi_ops);
  if (maybe_ne (sve_callee_adjust, 0))
    aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
  aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
				R0_REGNUM, R30_REGNUM,
				callee_adjust != 0, &cfi_ops);

  if (need_barrier_p)
    emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));

  if (callee_adjust != 0)
    aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);

  if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
    {
      /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
      insn = get_last_insn ();
      rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
      REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
      RTX_FRAME_RELATED_P (insn) = 1;
      cfi_ops = NULL;
    }

  /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
     add a restriction on the emit_move optimization to leaf functions.  */
  aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
		  (!can_inherit_p || !crtl->is_leaf
		   || df_regs_ever_live_p (EP0_REGNUM)));

  if (cfi_ops)
    {
      /* Emit delayed restores and reset the CFA to be SP.  */
      insn = get_last_insn ();
      cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
      REG_NOTES (insn) = cfi_ops;
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  /* We prefer to emit the combined return/authenticate instruction RETAA,
     however there are three cases in which we must instead emit an explicit
     authentication instruction.

	1) Sibcalls don't return in a normal way, so if we're about to call one
	   we must authenticate.

	2) The RETAA instruction is not available before ARMv8.3-A, so if we are
	   generating code for !TARGET_ARMV8_3 we can't use it and must
	   explicitly authenticate.

	3) On an eh_return path we make extra stack adjustments to update the
	   canonical frame address to be the exception handler's CFA.  We want
	   to authenticate using the CFA of the function which calls eh_return.
	*/
  if (aarch64_return_address_signing_enabled ()
      && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
    {
      switch (aarch64_ra_sign_key)
	{
	case AARCH64_KEY_A:
	  insn = emit_insn (gen_autiasp ());
	  break;
	case AARCH64_KEY_B:
	  insn = emit_insn (gen_autibsp ());
	  break;
	default:
	  gcc_unreachable ();
	}
      add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  /* Stack adjustment for exception handler.  */
  if (crtl->calls_eh_return && !for_sibcall)
    {
      /* We need to unwind the stack by the offset computed by
	 EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
	 to be SP; letting the CFA move during this adjustment
	 is just as correct as retaining the CFA from the body
	 of the function.  Therefore, do nothing special.  */
      emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
    }

  emit_use (gen_rtx_REG (DImode, LR_REGNUM));
  if (!for_sibcall)
    emit_jump_insn (ret_rtx);
}
/* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
   normally or return to a previous frame after unwinding.

   An EH return uses a single shared return sequence.  The epilogue is
   exactly like a normal epilogue except that it has an extra input
   register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
   that must be applied after the frame has been destroyed.  An extra label
   is inserted before the epilogue which initializes this register to zero,
   and this is the entry point for a normal return.

   An actual EH return updates the return address, initializes the stack
   adjustment and jumps directly into the epilogue (bypassing the zeroing
   of the adjustment).  Since the return address is typically saved on the
   stack when a function makes a call, the saved LR must be updated outside
   the epilogue.

   This poses problems as the store is generated well before the epilogue,
   so the offset of LR is not known yet.  Also optimizations will remove the
   store as it appears dead, even after the epilogue is generated (as the
   base or offset for loading LR is different in many cases).

   To avoid these problems this implementation forces the frame pointer
   in eh_return functions so that the location of LR is fixed and known early.
   It also marks the store volatile, so no optimization is permitted to
   remove the store.  */
rtx
aarch64_eh_return_handler_rtx (void)
{
  rtx tmp = gen_frame_mem (Pmode,
    plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));

  /* Mark the store volatile, so no optimization is permitted to remove it.  */
  MEM_VOLATILE_P (tmp) = true;
  return tmp;
}
7725 /* Output code to add DELTA to the first argument, and then jump
7726 to FUNCTION. Used for C++ multiple inheritance. */
7728 aarch64_output_mi_thunk (FILE *file
, tree thunk ATTRIBUTE_UNUSED
,
7729 HOST_WIDE_INT delta
,
7730 HOST_WIDE_INT vcall_offset
,
7733 /* The this pointer is always in x0. Note that this differs from
7734 Arm where the this pointer maybe bumped to r1 if r0 is required
7735 to return a pointer to an aggregate. On AArch64 a result value
7736 pointer will be in x8. */
7737 int this_regno
= R0_REGNUM
;
7738 rtx this_rtx
, temp0
, temp1
, addr
, funexp
;
7740 const char *fnname
= IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk
));
7742 if (aarch64_bti_enabled ())
7743 emit_insn (gen_bti_c());
7745 reload_completed
= 1;
7746 emit_note (NOTE_INSN_PROLOGUE_END
);
7748 this_rtx
= gen_rtx_REG (Pmode
, this_regno
);
7749 temp0
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
7750 temp1
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
7752 if (vcall_offset
== 0)
7753 aarch64_add_offset (Pmode
, this_rtx
, this_rtx
, delta
, temp1
, temp0
, false);
7756 gcc_assert ((vcall_offset
& (POINTER_BYTES
- 1)) == 0);
7761 if (delta
>= -256 && delta
< 256)
7762 addr
= gen_rtx_PRE_MODIFY (Pmode
, this_rtx
,
7763 plus_constant (Pmode
, this_rtx
, delta
));
7765 aarch64_add_offset (Pmode
, this_rtx
, this_rtx
, delta
,
7766 temp1
, temp0
, false);
7769 if (Pmode
== ptr_mode
)
7770 aarch64_emit_move (temp0
, gen_rtx_MEM (ptr_mode
, addr
));
7772 aarch64_emit_move (temp0
,
7773 gen_rtx_ZERO_EXTEND (Pmode
,
7774 gen_rtx_MEM (ptr_mode
, addr
)));
7776 if (vcall_offset
>= -256 && vcall_offset
< 4096 * POINTER_BYTES
)
7777 addr
= plus_constant (Pmode
, temp0
, vcall_offset
);
7780 aarch64_internal_mov_immediate (temp1
, GEN_INT (vcall_offset
), true,
7782 addr
= gen_rtx_PLUS (Pmode
, temp0
, temp1
);
7785 if (Pmode
== ptr_mode
)
7786 aarch64_emit_move (temp1
, gen_rtx_MEM (ptr_mode
,addr
));
7788 aarch64_emit_move (temp1
,
7789 gen_rtx_SIGN_EXTEND (Pmode
,
7790 gen_rtx_MEM (ptr_mode
, addr
)));
7792 emit_insn (gen_add2_insn (this_rtx
, temp1
));
7795 /* Generate a tail call to the target function. */
7796 if (!TREE_USED (function
))
7798 assemble_external (function
);
7799 TREE_USED (function
) = 1;
7801 funexp
= XEXP (DECL_RTL (function
), 0);
7802 funexp
= gen_rtx_MEM (FUNCTION_MODE
, funexp
);
7803 rtx callee_abi
= gen_int_mode (fndecl_abi (function
).id (), DImode
);
7804 insn
= emit_call_insn (gen_sibcall (funexp
, const0_rtx
, callee_abi
));
7805 SIBLING_CALL_P (insn
) = 1;
7807 insn
= get_insns ();
7808 shorten_branches (insn
);
7810 assemble_start_function (thunk
, fnname
);
7811 final_start_function (insn
, file
, 1);
7812 final (insn
, file
, 1);
7813 final_end_function ();
7814 assemble_end_function (thunk
, fnname
);
7816 /* Stop pretending to be a post-reload pass. */
7817 reload_completed
= 0;
static bool
aarch64_tls_referenced_p (rtx x)
{
  if (!TARGET_HAVE_TLS)
    return false;
  subrtx_iterator::array_type array;
  FOR_EACH_SUBRTX (iter, array, x, ALL)
    {
      const_rtx x = *iter;
      if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
	return true;
      /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
	 TLS offsets, not real symbol references.  */
      if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
	iter.skip_subrtxes ();
    }
  return false;
}
/* Return true if val can be encoded as a 12-bit unsigned immediate with
   a left shift of 0 or 12 bits.  */
bool
aarch64_uimm12_shift (HOST_WIDE_INT val)
{
  return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
	  || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
	  );
}
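
/* Worked example, added for illustration (not part of the original source):
   0xabc passes with a shift of 0 and 0xabc000 passes with a shift of 12,
   but 0xabc00 fails because its set bits straddle the two 12-bit windows.  */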
/* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
   that can be created with a left shift of 0 or 12.  */
static HOST_WIDE_INT
aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
{
  /* Check to see if the value fits in 24 bits, as that is the maximum we can
     handle correctly.  */
  gcc_assert ((val & 0xffffff) == val);

  if (((val & 0xfff) << 0) == val)
    return val;

  return val & (0xfff << 12);
}
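
/* Worked example, added for illustration (not part of the original source):
   0x123456 does not fit in the low 12 bits, so the function returns
   0x123456 & (0xfff << 12) == 0x123000, which is encodable as a 12-bit
   immediate shifted left by 12.  */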
/* Return true if val is an immediate that can be loaded into a
   register by a MOVZ instruction.  */
static bool
aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
{
  if (GET_MODE_SIZE (mode) > 4)
    {
      if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
	  || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
	return 1;
    }
  else
    {
      /* Ignore sign extension.  */
      val &= (HOST_WIDE_INT) 0xffffffff;
    }
  return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
	  || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
}
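
/* Worked example, added for illustration (not part of the original source):
   for SImode, 0x000a0000 satisfies the second test (a MOVZ of #0xa shifted
   left by 16), while 0x000a0001 fails both tests and needs more than one
   instruction.  */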
/* VAL is a value with the inner mode of MODE.  Replicate it to fill a
   64-bit (DImode) integer.  */

static unsigned HOST_WIDE_INT
aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
{
  unsigned int size = GET_MODE_UNIT_PRECISION (mode);

  if (size == 64)
    return val;

  val &= (HOST_WIDE_INT_1U << size) - 1;
  while (size < 64)
    {
      val |= val << size;
      size *= 2;
    }
  return val;
}

/* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */

static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
  {
    0x0000000100000001ull,
    0x0001000100010001ull,
    0x0101010101010101ull,
    0x1111111111111111ull,
    0x5555555555555555ull,
  };
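
/* Illustrative note (not part of the original source): multiplying a value
   that fits in the bottom N bits by the entry for width N broadcasts it into
   all 64/N copies, e.g. 0x3f * 0x0101010101010101 == 0x3f3f3f3f3f3f3f3f.  */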
/* Return true if val is a valid bitmask immediate.  */

bool
aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
{
  unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
  int bits;

  /* Check for a single sequence of one bits and return quickly if so.
     The special cases of all ones and all zeroes returns false.  */
  val = aarch64_replicate_bitmask_imm (val_in, mode);
  tmp = val + (val & -val);

  if (tmp == (tmp & -tmp))
    return (val + 1) > 1;

  /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
  if (mode == SImode)
    val = (val << 32) | (val & 0xffffffff);

  /* Invert if the immediate doesn't start with a zero bit - this means we
     only need to search for sequences of one bits.  */
  if (val & 1)
    val = ~val;

  /* Find the first set bit and set tmp to val with the first sequence of one
     bits removed.  Return success if there is a single sequence of ones.  */
  first_one = val & -val;
  tmp = val & (val + first_one);

  if (tmp == 0)
    return true;

  /* Find the next set bit and compute the difference in bit position.  */
  next_one = tmp & -tmp;
  bits = clz_hwi (first_one) - clz_hwi (next_one);
  mask = val ^ tmp;

  /* Check the bit position difference is a power of 2, and that the first
     sequence of one bits fits within 'bits' bits.  */
  if ((mask >> bits) != 0 || bits != (bits & -bits))
    return false;

  /* Check the sequence of one bits is repeated 64/bits times.  */
  return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
}
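
/* Worked example, added for illustration (not part of the original source):
   for val = 0xf0f0f0f0f0f0f0f0 the first run of ones is mask = 0xf0, the
   distance between run starts is bits = 8, and mask * bitmask_imm_mul[2]
   reproduces val, so the value is accepted.  0xf0f0f0f0f0f0f070 is rejected
   because replicating its first run (0x70) does not reproduce the value.  */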
/* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
   Assumed precondition: VAL_IN Is not zero.  */

unsigned HOST_WIDE_INT
aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
{
  int lowest_bit_set = ctz_hwi (val_in);
  int highest_bit_set = floor_log2 (val_in);
  gcc_assert (val_in != 0);

  return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
	  (HOST_WIDE_INT_1U << lowest_bit_set));
}

/* Create constant where bits outside of lowest bit set to highest bit set
   are set to 1.  */

unsigned HOST_WIDE_INT
aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
{
  return val_in | ~aarch64_and_split_imm1 (val_in);
}
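
/* Worked example, added for illustration (not part of the original source):
   for val_in = 0x00ff0f00, aarch64_and_split_imm1 returns 0x00ffff00 (a mask
   covering bits 8..23) and aarch64_and_split_imm2 returns 0xffffffffffff0fff;
   ANDing with both constants in turn is equivalent to ANDing with val_in.  */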
/* Return true if VAL_IN is a valid 'and' bitmask immediate.  */

bool
aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
{
  scalar_int_mode int_mode;
  if (!is_a <scalar_int_mode> (mode, &int_mode))
    return false;

  if (aarch64_bitmask_imm (val_in, int_mode))
    return false;

  if (aarch64_move_imm (val_in, int_mode))
    return false;

  unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);

  return aarch64_bitmask_imm (imm2, int_mode);
}
/* Return true if val is an immediate that can be loaded into a
   register in a single instruction.  */
bool
aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
{
  scalar_int_mode int_mode;
  if (!is_a <scalar_int_mode> (mode, &int_mode))
    return false;

  if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
    return 1;
  return aarch64_bitmask_imm (val, int_mode);
}
static bool
aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
{
  rtx base, offset;

  if (GET_CODE (x) == HIGH)
    return true;

  /* There's no way to calculate VL-based values using relocations.  */
  subrtx_iterator::array_type array;
  FOR_EACH_SUBRTX (iter, array, x, ALL)
    if (GET_CODE (*iter) == CONST_POLY_INT)
      return true;

  split_const (x, &base, &offset);
  if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
    {
      if (aarch64_classify_symbol (base, INTVAL (offset))
	  != SYMBOL_FORCE_TO_MEM)
	return true;
      else
	/* Avoid generating a 64-bit relocation in ILP32; leave
	   to aarch64_expand_mov_immediate to handle it properly.  */
	return mode != ptr_mode;
    }

  return aarch64_tls_referenced_p (x);
}
/* Implement TARGET_CASE_VALUES_THRESHOLD.
   The expansion for a table switch is quite expensive due to the number
   of instructions, the table lookup and hard to predict indirect jump.
   When optimizing for speed, and -O3 enabled, use the per-core tuning if
   set, otherwise use tables for > 16 cases as a tradeoff between size and
   performance.  When optimizing for size, use the default setting.  */

static unsigned int
aarch64_case_values_threshold (void)
{
  /* Use the specified limit for the number of cases before using jump
     tables at higher optimization levels.  */
  if (optimize > 2
      && selected_cpu->tune->max_case_values != 0)
    return selected_cpu->tune->max_case_values;
  else
    return optimize_size ? default_case_values_threshold () : 17;
}
/* Return true if register REGNO is a valid index register.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

bool
aarch64_regno_ok_for_index_p (int regno, bool strict_p)
{
  if (!HARD_REGISTER_NUM_P (regno))
    {
      if (!strict_p)
	return true;

      if (!reg_renumber)
	return false;

      regno = reg_renumber[regno];
    }
  return GP_REGNUM_P (regno);
}
/* Return true if register REGNO is a valid base register for mode MODE.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

bool
aarch64_regno_ok_for_base_p (int regno, bool strict_p)
{
  if (!HARD_REGISTER_NUM_P (regno))
    {
      if (!strict_p)
	return true;

      if (!reg_renumber)
	return false;

      regno = reg_renumber[regno];
    }

  /* The fake registers will be eliminated to either the stack or
     hard frame pointer, both of which are usually valid base registers.
     Reload deals with the cases where the eliminated form isn't valid.  */
  return (GP_REGNUM_P (regno)
	  || regno == SP_REGNUM
	  || regno == FRAME_POINTER_REGNUM
	  || regno == ARG_POINTER_REGNUM);
}
/* Return true if X is a valid base register for mode MODE.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

static bool
aarch64_base_register_rtx_p (rtx x, bool strict_p)
{
  if (!strict_p
      && GET_CODE (x) == SUBREG
      && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
    x = SUBREG_REG (x);

  return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
}
8124 /* Return true if address offset is a valid index. If it is, fill in INFO
8125 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
8128 aarch64_classify_index (struct aarch64_address_info
*info
, rtx x
,
8129 machine_mode mode
, bool strict_p
)
8131 enum aarch64_address_type type
;
8136 if ((REG_P (x
) || GET_CODE (x
) == SUBREG
)
8137 && GET_MODE (x
) == Pmode
)
8139 type
= ADDRESS_REG_REG
;
8143 /* (sign_extend:DI (reg:SI)) */
8144 else if ((GET_CODE (x
) == SIGN_EXTEND
8145 || GET_CODE (x
) == ZERO_EXTEND
)
8146 && GET_MODE (x
) == DImode
8147 && GET_MODE (XEXP (x
, 0)) == SImode
)
8149 type
= (GET_CODE (x
) == SIGN_EXTEND
)
8150 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
8151 index
= XEXP (x
, 0);
8154 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
8155 else if (GET_CODE (x
) == MULT
8156 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
8157 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
8158 && GET_MODE (XEXP (x
, 0)) == DImode
8159 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
8160 && CONST_INT_P (XEXP (x
, 1)))
8162 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
8163 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
8164 index
= XEXP (XEXP (x
, 0), 0);
8165 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
8167 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
8168 else if (GET_CODE (x
) == ASHIFT
8169 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
8170 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
8171 && GET_MODE (XEXP (x
, 0)) == DImode
8172 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
8173 && CONST_INT_P (XEXP (x
, 1)))
8175 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
8176 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
8177 index
= XEXP (XEXP (x
, 0), 0);
8178 shift
= INTVAL (XEXP (x
, 1));
8180 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
8181 else if ((GET_CODE (x
) == SIGN_EXTRACT
8182 || GET_CODE (x
) == ZERO_EXTRACT
)
8183 && GET_MODE (x
) == DImode
8184 && GET_CODE (XEXP (x
, 0)) == MULT
8185 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
8186 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
8188 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
8189 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
8190 index
= XEXP (XEXP (x
, 0), 0);
8191 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
8192 if (INTVAL (XEXP (x
, 1)) != 32 + shift
8193 || INTVAL (XEXP (x
, 2)) != 0)
8196 /* (and:DI (mult:DI (reg:DI) (const_int scale))
8197 (const_int 0xffffffff<<shift)) */
8198 else if (GET_CODE (x
) == AND
8199 && GET_MODE (x
) == DImode
8200 && GET_CODE (XEXP (x
, 0)) == MULT
8201 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
8202 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
8203 && CONST_INT_P (XEXP (x
, 1)))
8205 type
= ADDRESS_REG_UXTW
;
8206 index
= XEXP (XEXP (x
, 0), 0);
8207 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
8208 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
8211 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
8212 else if ((GET_CODE (x
) == SIGN_EXTRACT
8213 || GET_CODE (x
) == ZERO_EXTRACT
)
8214 && GET_MODE (x
) == DImode
8215 && GET_CODE (XEXP (x
, 0)) == ASHIFT
8216 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
8217 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
8219 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
8220 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
8221 index
= XEXP (XEXP (x
, 0), 0);
8222 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
8223 if (INTVAL (XEXP (x
, 1)) != 32 + shift
8224 || INTVAL (XEXP (x
, 2)) != 0)
8227 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
8228 (const_int 0xffffffff<<shift)) */
8229 else if (GET_CODE (x
) == AND
8230 && GET_MODE (x
) == DImode
8231 && GET_CODE (XEXP (x
, 0)) == ASHIFT
8232 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
8233 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
8234 && CONST_INT_P (XEXP (x
, 1)))
8236 type
= ADDRESS_REG_UXTW
;
8237 index
= XEXP (XEXP (x
, 0), 0);
8238 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
8239 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
8242 /* (mult:P (reg:P) (const_int scale)) */
8243 else if (GET_CODE (x
) == MULT
8244 && GET_MODE (x
) == Pmode
8245 && GET_MODE (XEXP (x
, 0)) == Pmode
8246 && CONST_INT_P (XEXP (x
, 1)))
8248 type
= ADDRESS_REG_REG
;
8249 index
= XEXP (x
, 0);
8250 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
8252 /* (ashift:P (reg:P) (const_int shift)) */
8253 else if (GET_CODE (x
) == ASHIFT
8254 && GET_MODE (x
) == Pmode
8255 && GET_MODE (XEXP (x
, 0)) == Pmode
8256 && CONST_INT_P (XEXP (x
, 1)))
8258 type
= ADDRESS_REG_REG
;
8259 index
= XEXP (x
, 0);
8260 shift
= INTVAL (XEXP (x
, 1));
8266 && GET_CODE (index
) == SUBREG
8267 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (index
))])
8268 index
= SUBREG_REG (index
);
8270 if (aarch64_sve_data_mode_p (mode
))
8272 if (type
!= ADDRESS_REG_REG
8273 || (1 << shift
) != GET_MODE_UNIT_SIZE (mode
))
8279 && !(IN_RANGE (shift
, 1, 3)
8280 && known_eq (1 << shift
, GET_MODE_SIZE (mode
))))
8285 && aarch64_regno_ok_for_index_p (REGNO (index
), strict_p
))
8288 info
->offset
= index
;
8289 info
->shift
= shift
;
/* Return true if MODE is one of the modes for which we
   support LDP/STP operations.  */

static bool
aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
{
  return mode == SImode || mode == DImode
	 || mode == SFmode || mode == DFmode
	 || (aarch64_vector_mode_supported_p (mode)
	     && (known_eq (GET_MODE_SIZE (mode), 8)
		 || (known_eq (GET_MODE_SIZE (mode), 16)
		     && (aarch64_tune_params.extra_tuning_flags
			 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
}
/* Return true if REGNO is a virtual pointer register, or an eliminable
   "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
   include stack_pointer or hard_frame_pointer.  */
static bool
virt_or_elim_regno_p (unsigned regno)
{
  return ((regno >= FIRST_VIRTUAL_REGISTER
	   && regno <= LAST_VIRTUAL_POINTER_REGISTER)
	  || regno == FRAME_POINTER_REGNUM
	  || regno == ARG_POINTER_REGNUM);
}
8323 /* Return true if X is a valid address of type TYPE for machine mode MODE.
8324 If it is, fill in INFO appropriately. STRICT_P is true if
8325 REG_OK_STRICT is in effect. */
8328 aarch64_classify_address (struct aarch64_address_info
*info
,
8329 rtx x
, machine_mode mode
, bool strict_p
,
8330 aarch64_addr_query_type type
)
8332 enum rtx_code code
= GET_CODE (x
);
8336 HOST_WIDE_INT const_size
;
8338 /* Whether a vector mode is partial doesn't affect address legitimacy.
8339 Partial vectors like VNx8QImode allow the same indexed addressing
8340 mode and MUL VL addressing mode as full vectors like VNx16QImode;
8341 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
8342 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
8343 vec_flags
&= ~VEC_PARTIAL
;
8345 /* On BE, we use load/store pair for all large int mode load/stores.
8346 TI/TFmode may also use a load/store pair. */
8347 bool advsimd_struct_p
= (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
));
8348 bool load_store_pair_p
= (type
== ADDR_QUERY_LDP_STP
8349 || type
== ADDR_QUERY_LDP_STP_N
8352 || (BYTES_BIG_ENDIAN
&& advsimd_struct_p
));
8354 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
8355 corresponds to the actual size of the memory being loaded/stored and the
8356 mode of the corresponding addressing mode is half of that. */
8357 if (type
== ADDR_QUERY_LDP_STP_N
8358 && known_eq (GET_MODE_SIZE (mode
), 16))
8361 bool allow_reg_index_p
= (!load_store_pair_p
8362 && (known_lt (GET_MODE_SIZE (mode
), 16)
8363 || vec_flags
== VEC_ADVSIMD
8364 || vec_flags
& VEC_SVE_DATA
));
8366 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
8367 [Rn, #offset, MUL VL]. */
8368 if ((vec_flags
& (VEC_SVE_DATA
| VEC_SVE_PRED
)) != 0
8369 && (code
!= REG
&& code
!= PLUS
))
8372 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
8374 if (advsimd_struct_p
8375 && !BYTES_BIG_ENDIAN
8376 && (code
!= POST_INC
&& code
!= REG
))
8379 gcc_checking_assert (GET_MODE (x
) == VOIDmode
8380 || SCALAR_INT_MODE_P (GET_MODE (x
)));
8386 info
->type
= ADDRESS_REG_IMM
;
8388 info
->offset
= const0_rtx
;
8389 info
->const_offset
= 0;
8390 return aarch64_base_register_rtx_p (x
, strict_p
);
8398 && virt_or_elim_regno_p (REGNO (op0
))
8399 && poly_int_rtx_p (op1
, &offset
))
8401 info
->type
= ADDRESS_REG_IMM
;
8404 info
->const_offset
= offset
;
8409 if (maybe_ne (GET_MODE_SIZE (mode
), 0)
8410 && aarch64_base_register_rtx_p (op0
, strict_p
)
8411 && poly_int_rtx_p (op1
, &offset
))
8413 info
->type
= ADDRESS_REG_IMM
;
8416 info
->const_offset
= offset
;
8418 /* TImode and TFmode values are allowed in both pairs of X
8419 registers and individual Q registers. The available
8421 X,X: 7-bit signed scaled offset
8422 Q: 9-bit signed offset
8423 We conservatively require an offset representable in either mode.
8424 When performing the check for pairs of X registers i.e. LDP/STP
8425 pass down DImode since that is the natural size of the LDP/STP
8426 instruction memory accesses. */
8427 if (mode
== TImode
|| mode
== TFmode
)
8428 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
8429 && (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
8430 || offset_12bit_unsigned_scaled_p (mode
, offset
)));
8432 /* A 7bit offset check because OImode will emit a ldp/stp
8433 instruction (only big endian will get here).
8434 For ldp/stp instructions, the offset is scaled for the size of a
8435 single element of the pair. */
8437 return aarch64_offset_7bit_signed_scaled_p (TImode
, offset
);
8439 /* Three 9/12 bit offsets checks because CImode will emit three
8440 ldr/str instructions (only big endian will get here). */
8442 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
8443 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode
,
8445 || offset_12bit_unsigned_scaled_p (V16QImode
,
8448 /* Two 7bit offsets checks because XImode will emit two ldp/stp
8449 instructions (only big endian will get here). */
8451 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
8452 && aarch64_offset_7bit_signed_scaled_p (TImode
,
8455 /* Make "m" use the LD1 offset range for SVE data modes, so
8456 that pre-RTL optimizers like ivopts will work to that
8457 instead of the wider LDR/STR range. */
8458 if (vec_flags
== VEC_SVE_DATA
)
8459 return (type
== ADDR_QUERY_M
8460 ? offset_4bit_signed_scaled_p (mode
, offset
)
8461 : offset_9bit_signed_scaled_p (mode
, offset
));
8463 if (vec_flags
== (VEC_SVE_DATA
| VEC_STRUCT
))
8465 poly_int64 end_offset
= (offset
8466 + GET_MODE_SIZE (mode
)
8467 - BYTES_PER_SVE_VECTOR
);
8468 return (type
== ADDR_QUERY_M
8469 ? offset_4bit_signed_scaled_p (mode
, offset
)
8470 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE
, offset
)
8471 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE
,
8475 if (vec_flags
== VEC_SVE_PRED
)
8476 return offset_9bit_signed_scaled_p (mode
, offset
);
8478 if (load_store_pair_p
)
8479 return ((known_eq (GET_MODE_SIZE (mode
), 4)
8480 || known_eq (GET_MODE_SIZE (mode
), 8)
8481 || known_eq (GET_MODE_SIZE (mode
), 16))
8482 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
8484 return (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
8485 || offset_12bit_unsigned_scaled_p (mode
, offset
));
8488 if (allow_reg_index_p
)
8490 /* Look for base + (scaled/extended) index register. */
8491 if (aarch64_base_register_rtx_p (op0
, strict_p
)
8492 && aarch64_classify_index (info
, op1
, mode
, strict_p
))
8497 if (aarch64_base_register_rtx_p (op1
, strict_p
)
8498 && aarch64_classify_index (info
, op0
, mode
, strict_p
))
8511 info
->type
= ADDRESS_REG_WB
;
8512 info
->base
= XEXP (x
, 0);
8513 info
->offset
= NULL_RTX
;
8514 return aarch64_base_register_rtx_p (info
->base
, strict_p
);
8518 info
->type
= ADDRESS_REG_WB
;
8519 info
->base
= XEXP (x
, 0);
8520 if (GET_CODE (XEXP (x
, 1)) == PLUS
8521 && poly_int_rtx_p (XEXP (XEXP (x
, 1), 1), &offset
)
8522 && rtx_equal_p (XEXP (XEXP (x
, 1), 0), info
->base
)
8523 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
8525 info
->offset
= XEXP (XEXP (x
, 1), 1);
8526 info
->const_offset
= offset
;
8528 /* TImode and TFmode values are allowed in both pairs of X
8529 registers and individual Q registers. The available
8531 X,X: 7-bit signed scaled offset
8532 Q: 9-bit signed offset
8533 We conservatively require an offset representable in either mode.
8535 if (mode
== TImode
|| mode
== TFmode
)
8536 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
8537 && aarch64_offset_9bit_signed_unscaled_p (mode
, offset
));
8539 if (load_store_pair_p
)
8540 return ((known_eq (GET_MODE_SIZE (mode
), 4)
8541 || known_eq (GET_MODE_SIZE (mode
), 8)
8542 || known_eq (GET_MODE_SIZE (mode
), 16))
8543 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
8545 return aarch64_offset_9bit_signed_unscaled_p (mode
, offset
);
8552 /* load literal: pc-relative constant pool entry. Only supported
8553 for SI mode or larger. */
8554 info
->type
= ADDRESS_SYMBOLIC
;
8556 if (!load_store_pair_p
8557 && GET_MODE_SIZE (mode
).is_constant (&const_size
)
8562 split_const (x
, &sym
, &addend
);
8563 return ((GET_CODE (sym
) == LABEL_REF
8564 || (GET_CODE (sym
) == SYMBOL_REF
8565 && CONSTANT_POOL_ADDRESS_P (sym
)
8566 && aarch64_pcrelative_literal_loads
)));
8571 info
->type
= ADDRESS_LO_SUM
;
8572 info
->base
= XEXP (x
, 0);
8573 info
->offset
= XEXP (x
, 1);
8574 if (allow_reg_index_p
8575 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
8578 split_const (info
->offset
, &sym
, &offs
);
8579 if (GET_CODE (sym
) == SYMBOL_REF
8580 && (aarch64_classify_symbol (sym
, INTVAL (offs
))
8581 == SYMBOL_SMALL_ABSOLUTE
))
8583 /* The symbol and offset must be aligned to the access size. */
8586 if (CONSTANT_POOL_ADDRESS_P (sym
))
8587 align
= GET_MODE_ALIGNMENT (get_pool_mode (sym
));
8588 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym
))
8590 tree exp
= SYMBOL_REF_DECL (sym
);
8591 align
= TYPE_ALIGN (TREE_TYPE (exp
));
8592 align
= aarch64_constant_alignment (exp
, align
);
8594 else if (SYMBOL_REF_DECL (sym
))
8595 align
= DECL_ALIGN (SYMBOL_REF_DECL (sym
));
8596 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym
)
8597 && SYMBOL_REF_BLOCK (sym
) != NULL
)
8598 align
= SYMBOL_REF_BLOCK (sym
)->alignment
;
8600 align
= BITS_PER_UNIT
;
8602 poly_int64 ref_size
= GET_MODE_SIZE (mode
);
8603 if (known_eq (ref_size
, 0))
8604 ref_size
= GET_MODE_SIZE (DImode
);
8606 return (multiple_p (INTVAL (offs
), ref_size
)
8607 && multiple_p (align
/ BITS_PER_UNIT
, ref_size
));
/* Return true if the address X is valid for a PRFM instruction.
   STRICT_P is true if we should do strict checking with
   aarch64_classify_address.  */

bool
aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
{
  struct aarch64_address_info addr;

  /* PRFM accepts the same addresses as DImode...  */
  bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
  if (!res)
    return false;

  /* ... except writeback forms.  */
  return addr.type != ADDRESS_REG_WB;
}
bool
aarch64_symbolic_address_p (rtx x)
{
  rtx offset;

  split_const (x, &x, &offset);
  return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
}
/* Classify the base of symbolic expression X.  */

enum aarch64_symbol_type
aarch64_classify_symbolic_expression (rtx x)
{
  rtx offset;

  split_const (x, &x, &offset);
  return aarch64_classify_symbol (x, INTVAL (offset));
}
/* Return TRUE if X is a legitimate address for accessing memory in
   mode MODE.  */
static bool
aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
{
  struct aarch64_address_info addr;

  return aarch64_classify_address (&addr, x, mode, strict_p);
}

/* Return TRUE if X is a legitimate address of type TYPE for accessing
   memory in mode MODE.  STRICT_P is true if REG_OK_STRICT is in effect.  */
bool
aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
			      aarch64_addr_query_type type)
{
  struct aarch64_address_info addr;

  return aarch64_classify_address (&addr, x, mode, strict_p, type);
}
8677 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
8680 aarch64_legitimize_address_displacement (rtx
*offset1
, rtx
*offset2
,
8681 poly_int64 orig_offset
,
8685 if (GET_MODE_SIZE (mode
).is_constant (&size
))
8687 HOST_WIDE_INT const_offset
, second_offset
;
8689 /* A general SVE offset is A * VQ + B. Remove the A component from
8690 coefficient 0 in order to get the constant B. */
8691 const_offset
= orig_offset
.coeffs
[0] - orig_offset
.coeffs
[1];
8693 /* Split an out-of-range address displacement into a base and
8694 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
8695 range otherwise to increase opportunities for sharing the base
8696 address of different sizes. Unaligned accesses use the signed
8697 9-bit range, TImode/TFmode use the intersection of signed
8698 scaled 7-bit and signed 9-bit offset. */
8699 if (mode
== TImode
|| mode
== TFmode
)
8700 second_offset
= ((const_offset
+ 0x100) & 0x1f8) - 0x100;
8701 else if ((const_offset
& (size
- 1)) != 0)
8702 second_offset
= ((const_offset
+ 0x100) & 0x1ff) - 0x100;
8704 second_offset
= const_offset
& (size
< 4 ? 0xfff : 0x3ffc);
8706 if (second_offset
== 0 || known_eq (orig_offset
, second_offset
))
8709 /* Split the offset into second_offset and the rest. */
8710 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
8711 *offset2
= gen_int_mode (second_offset
, Pmode
);
8716 /* Get the mode we should use as the basis of the range. For structure
8717 modes this is the mode of one vector. */
8718 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
8719 machine_mode step_mode
8720 = (vec_flags
& VEC_STRUCT
) != 0 ? SVE_BYTE_MODE
: mode
;
8722 /* Get the "mul vl" multiplier we'd like to use. */
8723 HOST_WIDE_INT factor
= GET_MODE_SIZE (step_mode
).coeffs
[1];
8724 HOST_WIDE_INT vnum
= orig_offset
.coeffs
[1] / factor
;
8725 if (vec_flags
& VEC_SVE_DATA
)
8726 /* LDR supports a 9-bit range, but the move patterns for
8727 structure modes require all vectors to be in range of the
8728 same base. The simplest way of accomodating that while still
8729 promoting reuse of anchor points between different modes is
8730 to use an 8-bit range unconditionally. */
8731 vnum
= ((vnum
+ 128) & 255) - 128;
8733 /* Predicates are only handled singly, so we might as well use
8735 vnum
= ((vnum
+ 256) & 511) - 256;
8739 /* Convert the "mul vl" multiplier into a byte offset. */
8740 poly_int64 second_offset
= GET_MODE_SIZE (step_mode
) * vnum
;
8741 if (known_eq (second_offset
, orig_offset
))
8744 /* Split the offset into second_offset and the rest. */
8745 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
8746 *offset2
= gen_int_mode (second_offset
, Pmode
);
/* Return the binary representation of floating point constant VALUE in INTVAL.
   If the value cannot be converted, return false without setting INTVAL.
   The conversion is done in the given MODE.  */
bool
aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
{

  /* We make a general exception for 0.  */
  if (aarch64_float_const_zero_rtx_p (value))
    {
      *intval = 0;
      return true;
    }

  scalar_float_mode mode;
  if (GET_CODE (value) != CONST_DOUBLE
      || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
      || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
      /* Only support up to DF mode.  */
      || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
    return false;

  unsigned HOST_WIDE_INT ival = 0;

  long res[2];
  real_to_target (res,
		  CONST_DOUBLE_REAL_VALUE (value),
		  REAL_MODE_FORMAT (mode));

  if (mode == DFmode)
    {
      int order = BYTES_BIG_ENDIAN ? 1 : 0;
      ival = zext_hwi (res[order], 32);
      ival |= (zext_hwi (res[1 - order], 32) << 32);
    }
  else
    ival = zext_hwi (res[0], 32);

  *intval = ival;
  return true;
}
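
/* Illustrative values (added for exposition, not part of the original
   source): 1.0 in SFmode is returned as 0x3f800000 and 1.0 in DFmode as
   0x3ff0000000000000, i.e. the raw IEEE bit patterns of the constants.  */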
/* Return TRUE if rtx X is an immediate constant that can be moved using a
   single MOV(+MOVK) followed by an FMOV.  */
bool
aarch64_float_const_rtx_p (rtx x)
{
  machine_mode mode = GET_MODE (x);
  if (mode == VOIDmode)
    return false;

  /* Determine whether it's cheaper to write float constants as
     mov/movk pairs over ldr/adrp pairs.  */
  unsigned HOST_WIDE_INT ival;

  if (GET_CODE (x) == CONST_DOUBLE
      && SCALAR_FLOAT_MODE_P (mode)
      && aarch64_reinterpret_float_as_int (x, &ival))
    {
      scalar_int_mode imode = (mode == HFmode
			       ? SImode
			       : int_mode_for_mode (mode).require ());
      int num_instr = aarch64_internal_mov_immediate
			(NULL_RTX, gen_int_mode (ival, imode), false, imode);
      return num_instr < 3;
    }

  return false;
}
/* Return TRUE if rtx X is immediate constant 0.0.  */
bool
aarch64_float_const_zero_rtx_p (rtx x)
{
  if (GET_MODE (x) == VOIDmode)
    return false;

  if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
    return !HONOR_SIGNED_ZEROS (GET_MODE (x));
  return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
}
/* Return TRUE if rtx X is an immediate constant that fits in a single
   MOVI immediate operation.  */
bool
aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
{
  machine_mode vmode;
  scalar_int_mode imode;
  unsigned HOST_WIDE_INT ival;

  if (GET_CODE (x) == CONST_DOUBLE
      && SCALAR_FLOAT_MODE_P (mode))
    {
      if (!aarch64_reinterpret_float_as_int (x, &ival))
	return false;

      /* We make a general exception for 0.  */
      if (aarch64_float_const_zero_rtx_p (x))
	return true;

      imode = int_mode_for_mode (mode).require ();
    }
  else if (GET_CODE (x) == CONST_INT
	   && is_a <scalar_int_mode> (mode, &imode))
    ival = INTVAL (x);
  else
    return false;

  /* Use a 64-bit mode for everything except for DI/DF mode, where we use
     a 128-bit vector mode.  */
  int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;

  vmode = aarch64_simd_container_mode (imode, width);
  rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);

  return aarch64_simd_valid_immediate (v_op, NULL);
}
/* Return the fixed registers used for condition codes.  */

static bool
aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
{
  *p1 = CC_REGNUM;
  *p2 = INVALID_REGNUM;
  return true;
}
/* This function is used by the call expanders of the machine description.
   RESULT is the register in which the result is returned.  It's NULL for
   "call" and "sibcall".
   MEM is the location of the function call.
   CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
   SIBCALL indicates whether this function call is a normal call or a sibling
   call.  It will generate a different pattern accordingly.  */

void
aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
{
  rtx call, callee, tmp;
  rtvec vec;
  machine_mode mode;

  gcc_assert (MEM_P (mem));
  callee = XEXP (mem, 0);
  mode = GET_MODE (callee);
  gcc_assert (mode == Pmode);

  /* Decide if we should generate indirect calls by loading the
     address of the callee into a register before performing
     the branch-and-link.  */
  if (SYMBOL_REF_P (callee)
      ? (aarch64_is_long_call_p (callee)
	 || aarch64_is_noplt_call_p (callee))
      : !REG_P (callee))
    XEXP (mem, 0) = force_reg (mode, callee);

  call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);

  if (result != NULL_RTX)
    call = gen_rtx_SET (result, call);

  if (sibcall)
    tmp = ret_rtx;
  else
    tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));

  gcc_assert (CONST_INT_P (callee_abi));
  callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
			       UNSPEC_CALLEE_ABI);

  vec = gen_rtvec (3, call, callee_abi, tmp);
  call = gen_rtx_PARALLEL (VOIDmode, vec);

  aarch64_emit_call_insn (call);
}
/* Emit call insn with PAT and do aarch64-specific handling.  */

void
aarch64_emit_call_insn (rtx pat)
{
  rtx insn = emit_call_insn (pat);

  rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
  clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
  clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
}
8946 aarch64_select_cc_mode (RTX_CODE code
, rtx x
, rtx y
)
8948 machine_mode mode_x
= GET_MODE (x
);
8949 rtx_code code_x
= GET_CODE (x
);
8951 /* All floating point compares return CCFP if it is an equality
8952 comparison, and CCFPE otherwise. */
8953 if (GET_MODE_CLASS (mode_x
) == MODE_FLOAT
)
8980 /* Equality comparisons of short modes against zero can be performed
8981 using the TST instruction with the appropriate bitmask. */
8982 if (y
== const0_rtx
&& (REG_P (x
) || SUBREG_P (x
))
8983 && (code
== EQ
|| code
== NE
)
8984 && (mode_x
== HImode
|| mode_x
== QImode
))
8987 /* Similarly, comparisons of zero_extends from shorter modes can
8988 be performed using an ANDS with an immediate mask. */
8989 if (y
== const0_rtx
&& code_x
== ZERO_EXTEND
8990 && (mode_x
== SImode
|| mode_x
== DImode
)
8991 && (GET_MODE (XEXP (x
, 0)) == HImode
|| GET_MODE (XEXP (x
, 0)) == QImode
)
8992 && (code
== EQ
|| code
== NE
))
8995 if ((mode_x
== SImode
|| mode_x
== DImode
)
8997 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
)
8998 && (code_x
== PLUS
|| code_x
== MINUS
|| code_x
== AND
9000 || (code_x
== ZERO_EXTRACT
&& CONST_INT_P (XEXP (x
, 1))
9001 && CONST_INT_P (XEXP (x
, 2)))))
9004 /* A compare with a shifted operand. Because of canonicalization,
9005 the comparison will have to be swapped when we emit the assembly
9007 if ((mode_x
== SImode
|| mode_x
== DImode
)
9008 && (REG_P (y
) || GET_CODE (y
) == SUBREG
|| y
== const0_rtx
)
9009 && (code_x
== ASHIFT
|| code_x
== ASHIFTRT
9010 || code_x
== LSHIFTRT
9011 || code_x
== ZERO_EXTEND
|| code_x
== SIGN_EXTEND
))
9014 /* Similarly for a negated operand, but we can only do this for
9016 if ((mode_x
== SImode
|| mode_x
== DImode
)
9017 && (REG_P (y
) || GET_CODE (y
) == SUBREG
)
9018 && (code
== EQ
|| code
== NE
)
9022 /* A test for unsigned overflow from an addition. */
9023 if ((mode_x
== DImode
|| mode_x
== TImode
)
9024 && (code
== LTU
|| code
== GEU
)
9026 && rtx_equal_p (XEXP (x
, 0), y
))
9029 /* A test for unsigned overflow from an add with carry. */
9030 if ((mode_x
== DImode
|| mode_x
== TImode
)
9031 && (code
== LTU
|| code
== GEU
)
9033 && CONST_SCALAR_INT_P (y
)
9034 && (rtx_mode_t (y
, mode_x
)
9035 == (wi::shwi (1, mode_x
)
9036 << (GET_MODE_BITSIZE (mode_x
).to_constant () / 2))))
9039 /* A test for signed overflow. */
9040 if ((mode_x
== DImode
|| mode_x
== TImode
)
9043 && GET_CODE (y
) == SIGN_EXTEND
)
9046 /* For everything else, return CCmode. */
9051 aarch64_get_condition_code_1 (machine_mode
, enum rtx_code
);
9054 aarch64_get_condition_code (rtx x
)
9056 machine_mode mode
= GET_MODE (XEXP (x
, 0));
9057 enum rtx_code comp_code
= GET_CODE (x
);
9059 if (GET_MODE_CLASS (mode
) != MODE_CC
)
9060 mode
= SELECT_CC_MODE (comp_code
, XEXP (x
, 0), XEXP (x
, 1));
9061 return aarch64_get_condition_code_1 (mode
, comp_code
);
9065 aarch64_get_condition_code_1 (machine_mode mode
, enum rtx_code comp_code
)
9073 case GE
: return AARCH64_GE
;
9074 case GT
: return AARCH64_GT
;
9075 case LE
: return AARCH64_LS
;
9076 case LT
: return AARCH64_MI
;
9077 case NE
: return AARCH64_NE
;
9078 case EQ
: return AARCH64_EQ
;
9079 case ORDERED
: return AARCH64_VC
;
9080 case UNORDERED
: return AARCH64_VS
;
9081 case UNLT
: return AARCH64_LT
;
9082 case UNLE
: return AARCH64_LE
;
9083 case UNGT
: return AARCH64_HI
;
9084 case UNGE
: return AARCH64_PL
;
9092 case NE
: return AARCH64_NE
;
9093 case EQ
: return AARCH64_EQ
;
9094 case GE
: return AARCH64_GE
;
9095 case GT
: return AARCH64_GT
;
9096 case LE
: return AARCH64_LE
;
9097 case LT
: return AARCH64_LT
;
9098 case GEU
: return AARCH64_CS
;
9099 case GTU
: return AARCH64_HI
;
9100 case LEU
: return AARCH64_LS
;
9101 case LTU
: return AARCH64_CC
;
9109 case NE
: return AARCH64_NE
;
9110 case EQ
: return AARCH64_EQ
;
9111 case GE
: return AARCH64_LE
;
9112 case GT
: return AARCH64_LT
;
9113 case LE
: return AARCH64_GE
;
9114 case LT
: return AARCH64_GT
;
9115 case GEU
: return AARCH64_LS
;
9116 case GTU
: return AARCH64_CC
;
9117 case LEU
: return AARCH64_CS
;
9118 case LTU
: return AARCH64_HI
;
9126 case NE
: return AARCH64_NE
; /* = any */
9127 case EQ
: return AARCH64_EQ
; /* = none */
9128 case GE
: return AARCH64_PL
; /* = nfrst */
9129 case LT
: return AARCH64_MI
; /* = first */
9130 case GEU
: return AARCH64_CS
; /* = nlast */
9131 case GTU
: return AARCH64_HI
; /* = pmore */
9132 case LEU
: return AARCH64_LS
; /* = plast */
9133 case LTU
: return AARCH64_CC
; /* = last */
9141 case NE
: return AARCH64_NE
;
9142 case EQ
: return AARCH64_EQ
;
9143 case GE
: return AARCH64_PL
;
9144 case LT
: return AARCH64_MI
;
9152 case NE
: return AARCH64_NE
;
9153 case EQ
: return AARCH64_EQ
;
9161 case LTU
: return AARCH64_CS
;
9162 case GEU
: return AARCH64_CC
;
9170 case GEU
: return AARCH64_CS
;
9171 case LTU
: return AARCH64_CC
;
9179 case NE
: return AARCH64_VS
;
9180 case EQ
: return AARCH64_VC
;
9193 aarch64_const_vec_all_same_in_range_p (rtx x
,
9194 HOST_WIDE_INT minval
,
9195 HOST_WIDE_INT maxval
)
9198 return (const_vec_duplicate_p (x
, &elt
)
9199 && CONST_INT_P (elt
)
9200 && IN_RANGE (INTVAL (elt
), minval
, maxval
));
9204 aarch64_const_vec_all_same_int_p (rtx x
, HOST_WIDE_INT val
)
9206 return aarch64_const_vec_all_same_in_range_p (x
, val
, val
);
9209 /* Return true if VEC is a constant in which every element is in the range
9210 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
9213 aarch64_const_vec_all_in_range_p (rtx vec
,
9214 HOST_WIDE_INT minval
,
9215 HOST_WIDE_INT maxval
)
9217 if (GET_CODE (vec
) != CONST_VECTOR
9218 || GET_MODE_CLASS (GET_MODE (vec
)) != MODE_VECTOR_INT
)
9222 if (!CONST_VECTOR_STEPPED_P (vec
))
9223 nunits
= const_vector_encoded_nelts (vec
);
9224 else if (!CONST_VECTOR_NUNITS (vec
).is_constant (&nunits
))
9227 for (int i
= 0; i
< nunits
; i
++)
9229 rtx vec_elem
= CONST_VECTOR_ELT (vec
, i
);
9230 if (!CONST_INT_P (vec_elem
)
9231 || !IN_RANGE (INTVAL (vec_elem
), minval
, maxval
))
#define AARCH64_CC_V 1
#define AARCH64_CC_C (1 << 1)
#define AARCH64_CC_Z (1 << 2)
#define AARCH64_CC_N (1 << 3)

/* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
static const int aarch64_nzcv_codes[] =
{
  0,		/* EQ, Z == 1.  */
  AARCH64_CC_Z,	/* NE, Z == 0.  */
  0,		/* CS, C == 1.  */
  AARCH64_CC_C,	/* CC, C == 0.  */
  0,		/* MI, N == 1.  */
  AARCH64_CC_N, /* PL, N == 0.  */
  0,		/* VS, V == 1.  */
  AARCH64_CC_V, /* VC, V == 0.  */
  0,		/* HI, C == 1 && Z == 0.  */
  AARCH64_CC_C,	/* LS, !(C == 1 && Z == 0).  */
  AARCH64_CC_V,	/* GE, N == V.  */
  0,		/* LT, N != V.  */
  AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
  0,		/* LE, !(Z == 0 && N == V).  */
  0,		/* AL, Any.  */
  0		/* NV, Any.  */
};
9264 /* Print floating-point vector immediate operand X to F, negating it
9265 first if NEGATE is true. Return true on success, false if it isn't
9266 a constant we can handle. */
9269 aarch64_print_vector_float_operand (FILE *f
, rtx x
, bool negate
)
9273 if (!const_vec_duplicate_p (x
, &elt
))
9276 REAL_VALUE_TYPE r
= *CONST_DOUBLE_REAL_VALUE (elt
);
9278 r
= real_value_negate (&r
);
9280 /* Handle the SVE single-bit immediates specially, since they have a
9281 fixed form in the assembly syntax. */
9282 if (real_equal (&r
, &dconst0
))
9283 asm_fprintf (f
, "0.0");
9284 else if (real_equal (&r
, &dconst2
))
9285 asm_fprintf (f
, "2.0");
9286 else if (real_equal (&r
, &dconst1
))
9287 asm_fprintf (f
, "1.0");
9288 else if (real_equal (&r
, &dconsthalf
))
9289 asm_fprintf (f
, "0.5");
9292 const int buf_size
= 20;
9293 char float_buf
[buf_size
] = {'\0'};
9294 real_to_decimal_for_mode (float_buf
, &r
, buf_size
, buf_size
,
9296 asm_fprintf (f
, "%s", float_buf
);
/* Return the equivalent letter for size.  */
static char
sizetochar (int size)
{
  switch (size)
    {
    case 64: return 'd';
    case 32: return 's';
    case 16: return 'h';
    case 8 : return 'b';
    default: gcc_unreachable ();
    }
}
9316 /* Print operand X to file F in a target specific manner according to CODE.
9317 The acceptable formatting commands given by CODE are:
9318 'c': An integer or symbol address without a preceding #
9320 'C': Take the duplicated element in a vector constant
9321 and print it in hex.
9322 'D': Take the duplicated element in a vector constant
9323 and print it as an unsigned integer, in decimal.
9324 'e': Print the sign/zero-extend size as a character 8->b,
9325 16->h, 32->w. Can also be used for masks:
9326 0xff->b, 0xffff->h, 0xffffffff->w.
9327 'I': If the operand is a duplicated vector constant,
9328 replace it with the duplicated scalar. If the
9329 operand is then a floating-point constant, replace
9330 it with the integer bit representation. Print the
9331 transformed constant as a signed decimal number.
9332 'p': Prints N such that 2^N == X (X must be power of 2 and
9334 'P': Print the number of non-zero bits in X (a const_int).
9335 'H': Print the higher numbered register of a pair (TImode)
9337 'm': Print a condition (eq, ne, etc).
9338 'M': Same as 'm', but invert condition.
9339 'N': Take the duplicated element in a vector constant
9340 and print the negative of it in decimal.
9341 'b/h/s/d/q': Print a scalar FP/SIMD register name.
9342 'S/T/U/V': Print a FP/SIMD register name for a register list.
9343 The register printed is the FP/SIMD register name
9344 of X + 0/1/2/3 for S/T/U/V.
9345 'R': Print a scalar Integer/FP/SIMD register name + 1.
9346 'X': Print bottom 16 bits of integer constant in hex.
9347 'w/x': Print a general register name or the zero register
9349 '0': Print a normal operand, if it's a general register,
9350 then we assume DImode.
9351 'k': Print NZCV for conditional compare instructions.
9352 'A': Output address constant representing the first
9353 argument of X, specifying a relocation offset
9355 'L': Output constant address specified by X
9356 with a relocation offset if appropriate.
9357 'G': Prints address of X, specifying a PC relative
9358 relocation mode if appropriate.
9359 'y': Output address of LDP or STP - this is used for
9360 some LDP/STPs which don't use a PARALLEL in their
9361 pattern (so the mode needs to be adjusted).
9362 'z': Output address of a typical LDP or STP. */
9365 aarch64_print_operand (FILE *f
, rtx x
, int code
)
9371 switch (GET_CODE (x
))
9374 fprintf (f
, HOST_WIDE_INT_PRINT_DEC
, INTVAL (x
));
9378 output_addr_const (f
, x
);
9382 if (GET_CODE (XEXP (x
, 0)) == PLUS
9383 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
)
9385 output_addr_const (f
, x
);
9391 output_operand_lossage ("unsupported operand for code '%c'", code
);
9397 x
= unwrap_const_vec_duplicate (x
);
9398 if (!CONST_INT_P (x
))
9400 output_operand_lossage ("invalid operand for '%%%c'", code
);
9404 HOST_WIDE_INT val
= INTVAL (x
);
9405 if ((val
& ~7) == 8 || val
== 0xff)
9407 else if ((val
& ~7) == 16 || val
== 0xffff)
9409 else if ((val
& ~7) == 32 || val
== 0xffffffff)
9413 output_operand_lossage ("invalid operand for '%%%c'", code
);
9423 if (!CONST_INT_P (x
) || (n
= exact_log2 (INTVAL (x
))) < 0)
9425 output_operand_lossage ("invalid operand for '%%%c'", code
);
9429 asm_fprintf (f
, "%d", n
);
9434 if (!CONST_INT_P (x
))
9436 output_operand_lossage ("invalid operand for '%%%c'", code
);
9440 asm_fprintf (f
, "%u", popcount_hwi (INTVAL (x
)));
9444 if (x
== const0_rtx
)
9446 asm_fprintf (f
, "xzr");
9450 if (!REG_P (x
) || !GP_REGNUM_P (REGNO (x
) + 1))
9452 output_operand_lossage ("invalid operand for '%%%c'", code
);
9456 asm_fprintf (f
, "%s", reg_names
[REGNO (x
) + 1]);
9461 x
= aarch64_bit_representation (unwrap_const_vec_duplicate (x
));
9462 if (CONST_INT_P (x
))
9463 asm_fprintf (f
, "%wd", INTVAL (x
));
9466 output_operand_lossage ("invalid operand for '%%%c'", code
);
9476 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
9477 if (x
== const_true_rtx
)
9484 if (!COMPARISON_P (x
))
9486 output_operand_lossage ("invalid operand for '%%%c'", code
);
9490 cond_code
= aarch64_get_condition_code (x
);
9491 gcc_assert (cond_code
>= 0);
9493 cond_code
= AARCH64_INVERSE_CONDITION_CODE (cond_code
);
9494 if (GET_MODE (XEXP (x
, 0)) == CC_NZCmode
)
9495 fputs (aarch64_sve_condition_codes
[cond_code
], f
);
9497 fputs (aarch64_condition_codes
[cond_code
], f
);
9502 if (!const_vec_duplicate_p (x
, &elt
))
9504 output_operand_lossage ("invalid vector constant");
9508 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
9509 asm_fprintf (f
, "%wd", -INTVAL (elt
));
9510 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
9511 && aarch64_print_vector_float_operand (f
, x
, true))
9515 output_operand_lossage ("invalid vector constant");
9525 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
9527 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
9530 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - V0_REGNUM
);
9537 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
9539 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
9542 asm_fprintf (f
, "%c%d",
9543 aarch64_sve_data_mode_p (GET_MODE (x
)) ? 'z' : 'v',
9544 REGNO (x
) - V0_REGNUM
+ (code
- 'S'));
9548 if (REG_P (x
) && FP_REGNUM_P (REGNO (x
)))
9549 asm_fprintf (f
, "q%d", REGNO (x
) - V0_REGNUM
+ 1);
9550 else if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
9551 asm_fprintf (f
, "x%d", REGNO (x
) - R0_REGNUM
+ 1);
9553 output_operand_lossage ("incompatible register operand for '%%%c'",
9558 if (!CONST_INT_P (x
))
9560 output_operand_lossage ("invalid operand for '%%%c'", code
);
9563 asm_fprintf (f
, "0x%wx", UINTVAL (x
) & 0xffff);
9568 /* Print a replicated constant in hex. */
9569 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
9571 output_operand_lossage ("invalid operand for '%%%c'", code
);
9574 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
9575 asm_fprintf (f
, "0x%wx", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
9581 /* Print a replicated constant in decimal, treating it as
9583 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
9585 output_operand_lossage ("invalid operand for '%%%c'", code
);
9588 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
9589 asm_fprintf (f
, "%wd", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
9596 || (CONST_DOUBLE_P (x
) && aarch64_float_const_zero_rtx_p (x
)))
9598 asm_fprintf (f
, "%czr", code
);
9602 if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
9604 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - R0_REGNUM
);
9608 if (REG_P (x
) && REGNO (x
) == SP_REGNUM
)
9610 asm_fprintf (f
, "%ssp", code
== 'w' ? "w" : "");
9619 output_operand_lossage ("missing operand");
9623 switch (GET_CODE (x
))
9626 if (aarch64_sve_data_mode_p (GET_MODE (x
)))
9628 if (REG_NREGS (x
) == 1)
9629 asm_fprintf (f
, "z%d", REGNO (x
) - V0_REGNUM
);
9633 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x
)));
9634 asm_fprintf (f
, "{z%d.%c - z%d.%c}",
9635 REGNO (x
) - V0_REGNUM
, suffix
,
9636 END_REGNO (x
) - V0_REGNUM
- 1, suffix
);
9640 asm_fprintf (f
, "%s", reg_names
[REGNO (x
)]);
9644 output_address (GET_MODE (x
), XEXP (x
, 0));
9649 output_addr_const (asm_out_file
, x
);
9653 asm_fprintf (f
, "%wd", INTVAL (x
));
9657 if (!VECTOR_MODE_P (GET_MODE (x
)))
9659 output_addr_const (asm_out_file
, x
);
9665 if (!const_vec_duplicate_p (x
, &elt
))
9667 output_operand_lossage ("invalid vector constant");
9671 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
9672 asm_fprintf (f
, "%wd", INTVAL (elt
));
9673 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
9674 && aarch64_print_vector_float_operand (f
, x
, false))
9678 output_operand_lossage ("invalid vector constant");
9684 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
9685 be getting CONST_DOUBLEs holding integers. */
9686 gcc_assert (GET_MODE (x
) != VOIDmode
);
9687 if (aarch64_float_const_zero_rtx_p (x
))
9692 else if (aarch64_float_const_representable_p (x
))
9695 char float_buf
[buf_size
] = {'\0'};
9696 real_to_decimal_for_mode (float_buf
,
9697 CONST_DOUBLE_REAL_VALUE (x
),
9700 asm_fprintf (asm_out_file
, "%s", float_buf
);
9704 output_operand_lossage ("invalid constant");
9707 output_operand_lossage ("invalid operand");
9713 if (GET_CODE (x
) == HIGH
)
9716 switch (aarch64_classify_symbolic_expression (x
))
9718 case SYMBOL_SMALL_GOT_4G
:
9719 asm_fprintf (asm_out_file
, ":got:");
9722 case SYMBOL_SMALL_TLSGD
:
9723 asm_fprintf (asm_out_file
, ":tlsgd:");
9726 case SYMBOL_SMALL_TLSDESC
:
9727 asm_fprintf (asm_out_file
, ":tlsdesc:");
9730 case SYMBOL_SMALL_TLSIE
:
9731 asm_fprintf (asm_out_file
, ":gottprel:");
9734 case SYMBOL_TLSLE24
:
9735 asm_fprintf (asm_out_file
, ":tprel:");
9738 case SYMBOL_TINY_GOT
:
9745 output_addr_const (asm_out_file
, x
);
9749 switch (aarch64_classify_symbolic_expression (x
))
9751 case SYMBOL_SMALL_GOT_4G
:
9752 asm_fprintf (asm_out_file
, ":lo12:");
9755 case SYMBOL_SMALL_TLSGD
:
9756 asm_fprintf (asm_out_file
, ":tlsgd_lo12:");
9759 case SYMBOL_SMALL_TLSDESC
:
9760 asm_fprintf (asm_out_file
, ":tlsdesc_lo12:");
9763 case SYMBOL_SMALL_TLSIE
:
9764 asm_fprintf (asm_out_file
, ":gottprel_lo12:");
9767 case SYMBOL_TLSLE12
:
9768 asm_fprintf (asm_out_file
, ":tprel_lo12:");
9771 case SYMBOL_TLSLE24
:
9772 asm_fprintf (asm_out_file
, ":tprel_lo12_nc:");
9775 case SYMBOL_TINY_GOT
:
9776 asm_fprintf (asm_out_file
, ":got:");
9779 case SYMBOL_TINY_TLSIE
:
9780 asm_fprintf (asm_out_file
, ":gottprel:");
9786 output_addr_const (asm_out_file
, x
);
9790 switch (aarch64_classify_symbolic_expression (x
))
9792 case SYMBOL_TLSLE24
:
9793 asm_fprintf (asm_out_file
, ":tprel_hi12:");
9798 output_addr_const (asm_out_file
, x
);
9803 HOST_WIDE_INT cond_code
;
9805 if (!CONST_INT_P (x
))
9807 output_operand_lossage ("invalid operand for '%%%c'", code
);
9811 cond_code
= INTVAL (x
);
9812 gcc_assert (cond_code
>= 0 && cond_code
<= AARCH64_NV
);
9813 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
]);
9820 machine_mode mode
= GET_MODE (x
);
9822 if (GET_CODE (x
) != MEM
9823 || (code
== 'y' && maybe_ne (GET_MODE_SIZE (mode
), 16)))
9825 output_operand_lossage ("invalid operand for '%%%c'", code
);
9829 if (!aarch64_print_address_internal (f
, mode
, XEXP (x
, 0),
9831 ? ADDR_QUERY_LDP_STP_N
9832 : ADDR_QUERY_LDP_STP
))
9833 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
9838 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
9843 /* Print address 'x' of a memory access with mode 'mode'.
9844 'op' is the context required by aarch64_classify_address. It can either be
9845 MEM for a normal memory access or PARALLEL for LDP/STP. */
9847 aarch64_print_address_internal (FILE *f
, machine_mode mode
, rtx x
,
9848 aarch64_addr_query_type type
)
9850 struct aarch64_address_info addr
;
9851 unsigned int size
, vec_flags
;
9853 /* Check all addresses are Pmode - including ILP32. */
9854 if (GET_MODE (x
) != Pmode
9855 && (!CONST_INT_P (x
)
9856 || trunc_int_for_mode (INTVAL (x
), Pmode
) != INTVAL (x
)))
9858 output_operand_lossage ("invalid address mode");
9862 if (aarch64_classify_address (&addr
, x
, mode
, true, type
))
9865 case ADDRESS_REG_IMM
:
9866 if (known_eq (addr
.const_offset
, 0))
9868 asm_fprintf (f
, "[%s]", reg_names
[REGNO (addr
.base
)]);
9872 vec_flags
= aarch64_classify_vector_mode (mode
);
9873 if (vec_flags
& VEC_ANY_SVE
)
9876 = exact_div (addr
.const_offset
,
9877 aarch64_vl_bytes (mode
, vec_flags
)).to_constant ();
9878 asm_fprintf (f
, "[%s, #%wd, mul vl]",
9879 reg_names
[REGNO (addr
.base
)], vnum
);
9883 asm_fprintf (f
, "[%s, %wd]", reg_names
[REGNO (addr
.base
)],
9884 INTVAL (addr
.offset
));
9887 case ADDRESS_REG_REG
:
9888 if (addr
.shift
== 0)
9889 asm_fprintf (f
, "[%s, %s]", reg_names
[REGNO (addr
.base
)],
9890 reg_names
[REGNO (addr
.offset
)]);
9892 asm_fprintf (f
, "[%s, %s, lsl %u]", reg_names
[REGNO (addr
.base
)],
9893 reg_names
[REGNO (addr
.offset
)], addr
.shift
);
9896 case ADDRESS_REG_UXTW
:
9897 if (addr
.shift
== 0)
9898 asm_fprintf (f
, "[%s, w%d, uxtw]", reg_names
[REGNO (addr
.base
)],
9899 REGNO (addr
.offset
) - R0_REGNUM
);
9901 asm_fprintf (f
, "[%s, w%d, uxtw %u]", reg_names
[REGNO (addr
.base
)],
9902 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
9905 case ADDRESS_REG_SXTW
:
9906 if (addr
.shift
== 0)
9907 asm_fprintf (f
, "[%s, w%d, sxtw]", reg_names
[REGNO (addr
.base
)],
9908 REGNO (addr
.offset
) - R0_REGNUM
);
9910 asm_fprintf (f
, "[%s, w%d, sxtw %u]", reg_names
[REGNO (addr
.base
)],
9911 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
9914 case ADDRESS_REG_WB
:
9915 /* Writeback is only supported for fixed-width modes. */
9916 size
= GET_MODE_SIZE (mode
).to_constant ();
9917 switch (GET_CODE (x
))
9920 asm_fprintf (f
, "[%s, %d]!", reg_names
[REGNO (addr
.base
)], size
);
9923 asm_fprintf (f
, "[%s], %d", reg_names
[REGNO (addr
.base
)], size
);
9926 asm_fprintf (f
, "[%s, -%d]!", reg_names
[REGNO (addr
.base
)], size
);
9929 asm_fprintf (f
, "[%s], -%d", reg_names
[REGNO (addr
.base
)], size
);
9932 asm_fprintf (f
, "[%s, %wd]!", reg_names
[REGNO (addr
.base
)],
9933 INTVAL (addr
.offset
));
9936 asm_fprintf (f
, "[%s], %wd", reg_names
[REGNO (addr
.base
)],
9937 INTVAL (addr
.offset
));
9944 case ADDRESS_LO_SUM
:
9945 asm_fprintf (f
, "[%s, #:lo12:", reg_names
[REGNO (addr
.base
)]);
9946 output_addr_const (f
, addr
.offset
);
9947 asm_fprintf (f
, "]");
9950 case ADDRESS_SYMBOLIC
:
9951 output_addr_const (f
, x
);
9958 /* Print address 'x' of a memory access with mode 'mode'. */
9960 aarch64_print_operand_address (FILE *f
, machine_mode mode
, rtx x
)
9962 if (!aarch64_print_address_internal (f
, mode
, x
, ADDR_QUERY_ANY
))
9963 output_addr_const (f
, x
);
int
aarch64_label_mentioned_p (rtx x)
{
  const char *fmt;
  int i;

  if (GET_CODE (x) == LABEL_REF)
    return 1;

  /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
     referencing instruction, but they are constant offsets, not
     symbols.  */
  if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
    return 0;

  fmt = GET_RTX_FORMAT (GET_CODE (x));
  for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
    {
      if (fmt[i] == 'E')
        {
          int j;

          for (j = XVECLEN (x, i) - 1; j >= 0; j--)
            if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
              return 1;
        }
      else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
        return 1;
    }

  return 0;
}
/* Implement REGNO_REG_CLASS.  */

enum reg_class
aarch64_regno_regclass (unsigned regno)
{
  if (GP_REGNUM_P (regno))
    return GENERAL_REGS;

  if (regno == SP_REGNUM)
    return STACK_REG;

  if (regno == FRAME_POINTER_REGNUM
      || regno == ARG_POINTER_REGNUM)
    return POINTER_REGS;

  if (FP_REGNUM_P (regno))
    return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
            : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);

  if (PR_REGNUM_P (regno))
    return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;

  if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
    return FFR_REGS;

  return NO_REGS;
}
/* OFFSET is an address offset for mode MODE, which has SIZE bytes.
   If OFFSET is out of range, return an offset of an anchor point
   that is in range.  Return 0 otherwise.  */

static HOST_WIDE_INT
aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
                       machine_mode mode)
{
  /* Does it look like we'll need a 16-byte load/store-pair operation?  */
  if (size > 16)
    return (offset + 0x400) & ~0x7f0;

  /* For offsets that aren't a multiple of the access size, the limit is
     -256...255.  */
  if (offset & (size - 1))
    {
      /* BLKmode typically uses LDP of X-registers.  */
      if (mode == BLKmode)
        return (offset + 512) & ~0x3ff;
      return (offset + 0x100) & ~0x1ff;
    }

  /* Small negative offsets are supported.  */
  if (IN_RANGE (offset, -256, 0))
    return 0;

  if (mode == TImode || mode == TFmode)
    return (offset + 0x100) & ~0x1ff;

  /* Use 12-bit offset by access size.  */
  return offset & (~0xfff * size);
}
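/* Illustrative worked example (added for clarity; not part of the upstream
   source): for a DImode access (SIZE == 8) at OFFSET == 0x10010, the offset
   is 8-byte aligned, is not a small negative value, and the mode is not
   TImode/TFmode, so the final case applies and the function returns
   0x10010 & (~0xfff * 8) == 0x10000.  The caller can then anchor the address
   at base + 0x10000 and reach the remaining +0x10 with the scaled 12-bit
   unsigned immediate form of LDR/STR.  */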
static rtx
aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
{
  /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
     where mask is selected by alignment and size of the offset.
     We try to pick as large a range for the offset as possible to
     maximize the chance of a CSE.  However, for aligned addresses
     we limit the range to 4k so that structures with different sized
     elements are likely to use the same base.  We need to be careful
     not to split a CONST for some forms of address expression, otherwise
     it will generate sub-optimal code.  */

  if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
    {
      rtx base = XEXP (x, 0);
      rtx offset_rtx = XEXP (x, 1);
      HOST_WIDE_INT offset = INTVAL (offset_rtx);

      if (GET_CODE (base) == PLUS)
        {
          rtx op0 = XEXP (base, 0);
          rtx op1 = XEXP (base, 1);

          /* Force any scaling into a temp for CSE.  */
          op0 = force_reg (Pmode, op0);
          op1 = force_reg (Pmode, op1);

          /* Let the pointer register be in op0.  */
          if (REG_POINTER (op1))
            std::swap (op0, op1);

          /* If the pointer is virtual or frame related, then we know that
             virtual register instantiation or register elimination is going
             to apply a second constant.  We want the two constants folded
             together easily.  Therefore, emit as (OP0 + CONST) + OP1.  */
          if (virt_or_elim_regno_p (REGNO (op0)))
            {
              base = expand_binop (Pmode, add_optab, op0, offset_rtx,
                                   NULL_RTX, true, OPTAB_DIRECT);
              return gen_rtx_PLUS (Pmode, base, op1);
            }

          /* Otherwise, in order to encourage CSE (and thence loop strength
             reduce) scaled addresses, emit as (OP0 + OP1) + CONST.  */
          base = expand_binop (Pmode, add_optab, op0, op1,
                               NULL_RTX, true, OPTAB_DIRECT);
          x = gen_rtx_PLUS (Pmode, base, offset_rtx);
        }

      HOST_WIDE_INT size;
      if (GET_MODE_SIZE (mode).is_constant (&size))
        {
          HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
                                                             mode);
          if (base_offset != 0)
            {
              base = plus_constant (Pmode, base, base_offset);
              base = force_operand (base, NULL_RTX);
              return plus_constant (Pmode, base, offset - base_offset);
            }
        }
    }

  return x;
}
10127 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED
, rtx x
,
10128 reg_class_t rclass
,
10130 secondary_reload_info
*sri
)
10132 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
10133 LDR and STR. See the comment at the head of aarch64-sve.md for
10134 more details about the big-endian handling. */
10135 if (reg_class_subset_p (rclass
, FP_REGS
)
10136 && !((REG_P (x
) && HARD_REGISTER_P (x
))
10137 || aarch64_simd_valid_immediate (x
, NULL
))
10138 && mode
!= VNx16QImode
)
10140 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
10141 if ((vec_flags
& VEC_SVE_DATA
)
10142 && ((vec_flags
& VEC_PARTIAL
) || BYTES_BIG_ENDIAN
))
10144 sri
->icode
= CODE_FOR_aarch64_sve_reload_mem
;
10149 /* If we have to disable direct literal pool loads and stores because the
10150 function is too big, then we need a scratch register. */
10151 if (MEM_P (x
) && GET_CODE (x
) == SYMBOL_REF
&& CONSTANT_POOL_ADDRESS_P (x
)
10152 && (SCALAR_FLOAT_MODE_P (GET_MODE (x
))
10153 || targetm
.vector_mode_supported_p (GET_MODE (x
)))
10154 && !aarch64_pcrelative_literal_loads
)
10156 sri
->icode
= code_for_aarch64_reload_movcp (mode
, DImode
);
10160 /* Without the TARGET_SIMD instructions we cannot move a Q register
10161 to a Q register directly. We need a scratch. */
10162 if (REG_P (x
) && (mode
== TFmode
|| mode
== TImode
) && mode
== GET_MODE (x
)
10163 && FP_REGNUM_P (REGNO (x
)) && !TARGET_SIMD
10164 && reg_class_subset_p (rclass
, FP_REGS
))
10166 sri
->icode
= code_for_aarch64_reload_mov (mode
);
10170 /* A TFmode or TImode memory access should be handled via an FP_REGS
10171 because AArch64 has richer addressing modes for LDR/STR instructions
10172 than LDP/STP instructions. */
10173 if (TARGET_FLOAT
&& rclass
== GENERAL_REGS
10174 && known_eq (GET_MODE_SIZE (mode
), 16) && MEM_P (x
))
10177 if (rclass
== FP_REGS
&& (mode
== TImode
|| mode
== TFmode
) && CONSTANT_P(x
))
10178 return GENERAL_REGS
;
bool
aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
{
  gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);

  /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
     can only eliminate to HARD_FRAME_POINTER_REGNUM.  */
  if (frame_pointer_needed)
    return to == HARD_FRAME_POINTER_REGNUM;
  return true;
}

poly_int64
aarch64_initial_elimination_offset (unsigned from, unsigned to)
{
  if (to == HARD_FRAME_POINTER_REGNUM)
    {
      if (from == ARG_POINTER_REGNUM)
        return cfun->machine->frame.hard_fp_offset;

      if (from == FRAME_POINTER_REGNUM)
        return cfun->machine->frame.hard_fp_offset
               - cfun->machine->frame.locals_offset;
    }

  if (to == STACK_POINTER_REGNUM)
    {
      if (from == FRAME_POINTER_REGNUM)
        return cfun->machine->frame.frame_size
               - cfun->machine->frame.locals_offset;
    }

  return cfun->machine->frame.frame_size;
}

/* Implement RETURN_ADDR_RTX.  We do not support moving back to a
   previous frame.  */

rtx
aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
{
  if (count != 0)
    return const0_rtx;
  return get_hard_reg_initial_val (Pmode, LR_REGNUM);
}
10231 aarch64_asm_trampoline_template (FILE *f
)
10236 if (aarch64_bti_enabled ())
10238 asm_fprintf (f
, "\thint\t34 // bti c\n");
10245 asm_fprintf (f
, "\tldr\tw%d, .+%d\n", IP1_REGNUM
- R0_REGNUM
, offset1
);
10246 asm_fprintf (f
, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM
- R0_REGNUM
,
10251 asm_fprintf (f
, "\tldr\t%s, .+%d\n", reg_names
[IP1_REGNUM
], offset1
);
10252 asm_fprintf (f
, "\tldr\t%s, .+%d\n", reg_names
[STATIC_CHAIN_REGNUM
],
10255 asm_fprintf (f
, "\tbr\t%s\n", reg_names
[IP1_REGNUM
]);
  /* The trampoline needs an extra padding instruction.  If BTI is
     enabled, the padding instruction is replaced by the BTI instruction
     at the beginning.  */
10260 if (!aarch64_bti_enabled ())
10261 assemble_aligned_integer (4, const0_rtx
);
10263 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
10264 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
10268 aarch64_trampoline_init (rtx m_tramp
, tree fndecl
, rtx chain_value
)
10270 rtx fnaddr
, mem
, a_tramp
;
10271 const int tramp_code_sz
= 16;
10273 /* Don't need to copy the trailing D-words, we fill those in below. */
10274 emit_block_move (m_tramp
, assemble_trampoline_template (),
10275 GEN_INT (tramp_code_sz
), BLOCK_OP_NORMAL
);
10276 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
);
10277 fnaddr
= XEXP (DECL_RTL (fndecl
), 0);
10278 if (GET_MODE (fnaddr
) != ptr_mode
)
10279 fnaddr
= convert_memory_address (ptr_mode
, fnaddr
);
10280 emit_move_insn (mem
, fnaddr
);
10282 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
+ POINTER_BYTES
);
10283 emit_move_insn (mem
, chain_value
);
10285 /* XXX We should really define a "clear_cache" pattern and use
10286 gen_clear_cache(). */
10287 a_tramp
= XEXP (m_tramp
, 0);
10288 emit_library_call (gen_rtx_SYMBOL_REF (Pmode
, "__clear_cache"),
10289 LCT_NORMAL
, VOIDmode
, a_tramp
, ptr_mode
,
10290 plus_constant (ptr_mode
, a_tramp
, TRAMPOLINE_SIZE
),
10294 static unsigned char
10295 aarch64_class_max_nregs (reg_class_t regclass
, machine_mode mode
)
10297 /* ??? Logically we should only need to provide a value when
10298 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
10299 can hold MODE, but at the moment we need to handle all modes.
10300 Just ignore any runtime parts for registers that can't store them. */
10301 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
10302 unsigned int nregs
, vec_flags
;
10305 case TAILCALL_ADDR_REGS
:
10309 case POINTER_AND_FP_REGS
:
10313 vec_flags
= aarch64_classify_vector_mode (mode
);
10314 if ((vec_flags
& VEC_SVE_DATA
)
10315 && constant_multiple_p (GET_MODE_SIZE (mode
),
10316 aarch64_vl_bytes (mode
, vec_flags
), &nregs
))
10318 return (vec_flags
& VEC_ADVSIMD
10319 ? CEIL (lowest_size
, UNITS_PER_VREG
)
10320 : CEIL (lowest_size
, UNITS_PER_WORD
));
10326 case PR_AND_FFR_REGS
:
10335 gcc_unreachable ();
static reg_class_t
aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
{
  if (regclass == POINTER_REGS)
    return GENERAL_REGS;

  if (regclass == STACK_REG)
    {
      if (REG_P (x)
          && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
        return regclass;

      return NO_REGS;
    }

  /* Register elimination can result in a request for
     SP+constant->FP_REGS.  We cannot support such operations which
     use SP as source and an FP_REG as destination, so reject such
     requests outright.  */
  if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
    {
      rtx lhs = XEXP (x, 0);

      /* Look through a possible SUBREG introduced by ILP32.  */
      if (GET_CODE (lhs) == SUBREG)
        lhs = SUBREG_REG (lhs);

      gcc_assert (REG_P (lhs));
      gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
                                      POINTER_REGS));
      return NO_REGS;
    }

  return regclass;
}
10375 aarch64_asm_output_labelref (FILE* f
, const char *name
)
10377 asm_fprintf (f
, "%U%s", name
);
10381 aarch64_elf_asm_constructor (rtx symbol
, int priority
)
10383 if (priority
== DEFAULT_INIT_PRIORITY
)
10384 default_ctor_section_asm_out_constructor (symbol
, priority
);
10388 /* While priority is known to be in range [0, 65535], so 18 bytes
10389 would be enough, the compiler might not know that. To avoid
10390 -Wformat-truncation false positive, use a larger size. */
10392 snprintf (buf
, sizeof (buf
), ".init_array.%.5u", priority
);
10393 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
10394 switch_to_section (s
);
10395 assemble_align (POINTER_SIZE
);
10396 assemble_aligned_integer (POINTER_BYTES
, symbol
);
10401 aarch64_elf_asm_destructor (rtx symbol
, int priority
)
10403 if (priority
== DEFAULT_INIT_PRIORITY
)
10404 default_dtor_section_asm_out_destructor (symbol
, priority
);
10408 /* While priority is known to be in range [0, 65535], so 18 bytes
10409 would be enough, the compiler might not know that. To avoid
10410 -Wformat-truncation false positive, use a larger size. */
10412 snprintf (buf
, sizeof (buf
), ".fini_array.%.5u", priority
);
10413 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
10414 switch_to_section (s
);
10415 assemble_align (POINTER_SIZE
);
10416 assemble_aligned_integer (POINTER_BYTES
, symbol
);
10421 aarch64_output_casesi (rtx
*operands
)
10425 rtx diff_vec
= PATTERN (NEXT_INSN (as_a
<rtx_insn
*> (operands
[2])));
10427 static const char *const patterns
[4][2] =
10430 "ldrb\t%w3, [%0,%w1,uxtw]",
10431 "add\t%3, %4, %w3, sxtb #2"
10434 "ldrh\t%w3, [%0,%w1,uxtw #1]",
10435 "add\t%3, %4, %w3, sxth #2"
10438 "ldr\t%w3, [%0,%w1,uxtw #2]",
10439 "add\t%3, %4, %w3, sxtw #2"
10441 /* We assume that DImode is only generated when not optimizing and
10442 that we don't really need 64-bit address offsets. That would
10443 imply an object file with 8GB of code in a single function! */
10445 "ldr\t%w3, [%0,%w1,uxtw #2]",
10446 "add\t%3, %4, %w3, sxtw #2"
10450 gcc_assert (GET_CODE (diff_vec
) == ADDR_DIFF_VEC
);
10452 scalar_int_mode mode
= as_a
<scalar_int_mode
> (GET_MODE (diff_vec
));
10453 index
= exact_log2 (GET_MODE_SIZE (mode
));
10455 gcc_assert (index
>= 0 && index
<= 3);
  /* Need to implement table size reduction, by changing the code below.  */
10458 output_asm_insn (patterns
[index
][0], operands
);
10459 ASM_GENERATE_INTERNAL_LABEL (label
, "Lrtx", CODE_LABEL_NUMBER (operands
[2]));
10460 snprintf (buf
, sizeof (buf
),
10461 "adr\t%%4, %s", targetm
.strip_name_encoding (label
));
10462 output_asm_insn (buf
, operands
);
10463 output_asm_insn (patterns
[index
][1], operands
);
10464 output_asm_insn ("br\t%3", operands
);
10465 assemble_label (asm_out_file
, label
);
/* Return size in bits of an arithmetic operand which is shifted/scaled and
   masked such that it is suitable for a UXTB, UXTH, or UXTW extend
   operator.  */

int
aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
{
  if (shift >= 0 && shift <= 3)
    {
      int size;
      for (size = 8; size <= 32; size *= 2)
        {
          HOST_WIDE_INT bits = ((HOST_WIDE_INT) 1U << size) - 1;
          if (mask == bits << shift)
            return size;
        }
    }
  return 0;
}
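/* Illustrative worked example (added for clarity; not part of the upstream
   source): an operand of the form (and (ashift x 2) 0x3fc) gives SHIFT == 2
   and MASK == 0x3fc == 0xff << 2, so the loop matches at SIZE == 8 and the
   function returns 8, i.e. the operand can be emitted as a UXTB extend
   combined with an LSL #2.  */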
/* Constant pools are per function only when PC relative
   literal loads are true or we are in the large memory
   model.  */

static inline bool
aarch64_can_use_per_function_literal_pools_p (void)
{
  return (aarch64_pcrelative_literal_loads
          || aarch64_cmodel == AARCH64_CMODEL_LARGE);
}

static bool
aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
{
  /* We can't use blocks for constants when we're using a per-function
     constant pool.  */
  return !aarch64_can_use_per_function_literal_pools_p ();
}

/* Select appropriate section for constants depending
   on where we place literal pools.  */

static section *
aarch64_select_rtx_section (machine_mode mode,
                            rtx x,
                            unsigned HOST_WIDE_INT align)
{
  if (aarch64_can_use_per_function_literal_pools_p ())
    return function_section (current_function_decl);

  return default_elf_select_rtx_section (mode, x, align);
}

/* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
void
aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
                                  HOST_WIDE_INT offset)
{
  /* When using per-function literal pools, we must ensure that any code
     section is aligned to the minimal instruction length, lest we get
     errors from the assembler re "unaligned instructions".  */
  if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
    ASM_OUTPUT_ALIGN (f, 2);
}
/* Helper function for rtx cost calculation.  Strip a shift expression
   from X.  Returns the inner operand if successful, or the original
   expression on failure.  */
static rtx
aarch64_strip_shift (rtx x)
{
  rtx op = x;

  /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
     we can convert both to ROR during final output.  */
  if ((GET_CODE (op) == ASHIFT
       || GET_CODE (op) == ASHIFTRT
       || GET_CODE (op) == LSHIFTRT
       || GET_CODE (op) == ROTATERT
       || GET_CODE (op) == ROTATE)
      && CONST_INT_P (XEXP (op, 1)))
    return XEXP (op, 0);

  if (GET_CODE (op) == MULT
      && CONST_INT_P (XEXP (op, 1))
      && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
    return XEXP (op, 0);

  return x;
}
10563 /* Helper function for rtx cost calculation. Strip an extend
10564 expression from X. Returns the inner operand if successful, or the
10565 original expression on failure. We deal with a number of possible
10566 canonicalization variations here. If STRIP_SHIFT is true, then
10567 we can strip off a shift also. */
10569 aarch64_strip_extend (rtx x
, bool strip_shift
)
10571 scalar_int_mode mode
;
10574 if (!is_a
<scalar_int_mode
> (GET_MODE (op
), &mode
))
10577 /* Zero and sign extraction of a widened value. */
10578 if ((GET_CODE (op
) == ZERO_EXTRACT
|| GET_CODE (op
) == SIGN_EXTRACT
)
10579 && XEXP (op
, 2) == const0_rtx
10580 && GET_CODE (XEXP (op
, 0)) == MULT
10581 && aarch64_is_extend_from_extract (mode
, XEXP (XEXP (op
, 0), 1),
10583 return XEXP (XEXP (op
, 0), 0);
10585 /* It can also be represented (for zero-extend) as an AND with an
10587 if (GET_CODE (op
) == AND
10588 && GET_CODE (XEXP (op
, 0)) == MULT
10589 && CONST_INT_P (XEXP (XEXP (op
, 0), 1))
10590 && CONST_INT_P (XEXP (op
, 1))
10591 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op
, 0), 1))),
10592 INTVAL (XEXP (op
, 1))) != 0)
10593 return XEXP (XEXP (op
, 0), 0);
10595 /* Now handle extended register, as this may also have an optional
10596 left shift by 1..4. */
10598 && GET_CODE (op
) == ASHIFT
10599 && CONST_INT_P (XEXP (op
, 1))
10600 && ((unsigned HOST_WIDE_INT
) INTVAL (XEXP (op
, 1))) <= 4)
10603 if (GET_CODE (op
) == ZERO_EXTEND
10604 || GET_CODE (op
) == SIGN_EXTEND
)
/* Return true iff CODE is a shift supported in combination
   with arithmetic instructions.  */

static bool
aarch64_shift_p (enum rtx_code code)
{
  return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
}

/* Return true iff X is a cheap shift without a sign extend.  */

static bool
aarch64_cheap_mult_shift_p (rtx x)
{
  rtx op0, op1;

  op0 = XEXP (x, 0);
  op1 = XEXP (x, 1);

  if (!(aarch64_tune_params.extra_tuning_flags
        & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
    return false;

  if (GET_CODE (op0) == SIGN_EXTEND)
    return false;

  if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
      && UINTVAL (op1) <= 4)
    return true;

  if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
    return false;

  HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));

  if (l2 > 0 && l2 <= 4)
    return true;

  return false;
}
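/* Illustrative worked example (added for clarity; not part of the upstream
   source): with a tuning that sets AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND,
   an operand such as (mult (reg x) (const_int 4)) has exact_log2 (4) == 2,
   which lies in the 1..4 range, so when the multiply feeds a PLUS or MINUS
   aarch64_rtx_mult_cost treats the shift as cheap and adds no extra cost
   for it; (mult (reg x) (const_int 64)) would not qualify, since
   log2 (64) == 6 exceeds 4.  */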
10655 /* Helper function for rtx cost calculation. Calculate the cost of
10656 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
10657 Return the calculated cost of the expression, recursing manually in to
10658 operands where needed. */
10661 aarch64_rtx_mult_cost (rtx x
, enum rtx_code code
, int outer
, bool speed
)
10664 const struct cpu_cost_table
*extra_cost
10665 = aarch64_tune_params
.insn_extra_cost
;
10667 bool compound_p
= (outer
== PLUS
|| outer
== MINUS
);
10668 machine_mode mode
= GET_MODE (x
);
10670 gcc_checking_assert (code
== MULT
);
10675 if (VECTOR_MODE_P (mode
))
10676 mode
= GET_MODE_INNER (mode
);
10678 /* Integer multiply/fma. */
10679 if (GET_MODE_CLASS (mode
) == MODE_INT
)
10681 /* The multiply will be canonicalized as a shift, cost it as such. */
10682 if (aarch64_shift_p (GET_CODE (x
))
10683 || (CONST_INT_P (op1
)
10684 && exact_log2 (INTVAL (op1
)) > 0))
10686 bool is_extend
= GET_CODE (op0
) == ZERO_EXTEND
10687 || GET_CODE (op0
) == SIGN_EXTEND
;
10692 /* If the shift is considered cheap,
10693 then don't add any cost. */
10694 if (aarch64_cheap_mult_shift_p (x
))
10696 else if (REG_P (op1
))
10697 /* ARITH + shift-by-register. */
10698 cost
+= extra_cost
->alu
.arith_shift_reg
;
10699 else if (is_extend
)
10700 /* ARITH + extended register. We don't have a cost field
10701 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
10702 cost
+= extra_cost
->alu
.extend_arith
;
10704 /* ARITH + shift-by-immediate. */
10705 cost
+= extra_cost
->alu
.arith_shift
;
10708 /* LSL (immediate). */
10709 cost
+= extra_cost
->alu
.shift
;
10712 /* Strip extends as we will have costed them in the case above. */
10714 op0
= aarch64_strip_extend (op0
, true);
10716 cost
+= rtx_cost (op0
, VOIDmode
, code
, 0, speed
);
10721 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
10722 compound and let the below cases handle it. After all, MNEG is a
10723 special-case alias of MSUB. */
10724 if (GET_CODE (op0
) == NEG
)
10726 op0
= XEXP (op0
, 0);
10730 /* Integer multiplies or FMAs have zero/sign extending variants. */
10731 if ((GET_CODE (op0
) == ZERO_EXTEND
10732 && GET_CODE (op1
) == ZERO_EXTEND
)
10733 || (GET_CODE (op0
) == SIGN_EXTEND
10734 && GET_CODE (op1
) == SIGN_EXTEND
))
10736 cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, MULT
, 0, speed
);
10737 cost
+= rtx_cost (XEXP (op1
, 0), VOIDmode
, MULT
, 1, speed
);
10742 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
10743 cost
+= extra_cost
->mult
[0].extend_add
;
10745 /* MUL/SMULL/UMULL. */
10746 cost
+= extra_cost
->mult
[0].extend
;
10752 /* This is either an integer multiply or a MADD. In both cases
10753 we want to recurse and cost the operands. */
10754 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
10755 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
10761 cost
+= extra_cost
->mult
[mode
== DImode
].add
;
10764 cost
+= extra_cost
->mult
[mode
== DImode
].simple
;
10773 /* Floating-point FMA/FMUL can also support negations of the
10774 operands, unless the rounding mode is upward or downward in
   which case FNMUL is different from FMUL with operand negation.  */
10776 bool neg0
= GET_CODE (op0
) == NEG
;
10777 bool neg1
= GET_CODE (op1
) == NEG
;
10778 if (compound_p
|| !flag_rounding_math
|| (neg0
&& neg1
))
10781 op0
= XEXP (op0
, 0);
10783 op1
= XEXP (op1
, 0);
10787 /* FMADD/FNMADD/FNMSUB/FMSUB. */
10788 cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
10791 cost
+= extra_cost
->fp
[mode
== DFmode
].mult
;
10794 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
10795 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
10801 aarch64_address_cost (rtx x
,
10803 addr_space_t as ATTRIBUTE_UNUSED
,
10806 enum rtx_code c
= GET_CODE (x
);
10807 const struct cpu_addrcost_table
*addr_cost
= aarch64_tune_params
.addr_cost
;
10808 struct aarch64_address_info info
;
10812 if (!aarch64_classify_address (&info
, x
, mode
, false))
10814 if (GET_CODE (x
) == CONST
|| GET_CODE (x
) == SYMBOL_REF
)
10816 /* This is a CONST or SYMBOL ref which will be split
10817 in a different way depending on the code model in use.
10818 Cost it through the generic infrastructure. */
10819 int cost_symbol_ref
= rtx_cost (x
, Pmode
, MEM
, 1, speed
);
10820 /* Divide through by the cost of one instruction to
10821 bring it to the same units as the address costs. */
10822 cost_symbol_ref
/= COSTS_N_INSNS (1);
10823 /* The cost is then the cost of preparing the address,
10824 followed by an immediate (possibly 0) offset. */
10825 return cost_symbol_ref
+ addr_cost
->imm_offset
;
      /* This is most likely a jump table from a case statement.  */
10831 return addr_cost
->register_offset
;
10837 case ADDRESS_LO_SUM
:
10838 case ADDRESS_SYMBOLIC
:
10839 case ADDRESS_REG_IMM
:
10840 cost
+= addr_cost
->imm_offset
;
10843 case ADDRESS_REG_WB
:
10844 if (c
== PRE_INC
|| c
== PRE_DEC
|| c
== PRE_MODIFY
)
10845 cost
+= addr_cost
->pre_modify
;
10846 else if (c
== POST_INC
|| c
== POST_DEC
|| c
== POST_MODIFY
)
10847 cost
+= addr_cost
->post_modify
;
10849 gcc_unreachable ();
10853 case ADDRESS_REG_REG
:
10854 cost
+= addr_cost
->register_offset
;
10857 case ADDRESS_REG_SXTW
:
10858 cost
+= addr_cost
->register_sextend
;
10861 case ADDRESS_REG_UXTW
:
10862 cost
+= addr_cost
->register_zextend
;
10866 gcc_unreachable ();
10870 if (info
.shift
> 0)
10872 /* For the sake of calculating the cost of the shifted register
10873 component, we can treat same sized modes in the same way. */
10874 if (known_eq (GET_MODE_BITSIZE (mode
), 16))
10875 cost
+= addr_cost
->addr_scale_costs
.hi
;
10876 else if (known_eq (GET_MODE_BITSIZE (mode
), 32))
10877 cost
+= addr_cost
->addr_scale_costs
.si
;
10878 else if (known_eq (GET_MODE_BITSIZE (mode
), 64))
10879 cost
+= addr_cost
->addr_scale_costs
.di
;
10881 /* We can't tell, or this is a 128-bit vector. */
10882 cost
+= addr_cost
->addr_scale_costs
.ti
;
/* Return the cost of a branch.  If SPEED_P is true then the compiler is
   optimizing for speed.  If PREDICTABLE_P is true then the branch is
   predicted to be taken.  */

int
aarch64_branch_cost (bool speed_p, bool predictable_p)
{
  /* When optimizing for speed, use the cost of unpredictable branches.  */
  const struct cpu_branch_cost *branch_costs =
    aarch64_tune_params.branch_costs;

  if (!speed_p || predictable_p)
    return branch_costs->predictable;
  else
    return branch_costs->unpredictable;
}
10905 /* Return true if the RTX X in mode MODE is a zero or sign extract
10906 usable in an ADD or SUB (extended register) instruction. */
10908 aarch64_rtx_arith_op_extract_p (rtx x
, scalar_int_mode mode
)
10910 /* Catch add with a sign extract.
10911 This is add_<optab><mode>_multp2. */
10912 if (GET_CODE (x
) == SIGN_EXTRACT
10913 || GET_CODE (x
) == ZERO_EXTRACT
)
10915 rtx op0
= XEXP (x
, 0);
10916 rtx op1
= XEXP (x
, 1);
10917 rtx op2
= XEXP (x
, 2);
10919 if (GET_CODE (op0
) == MULT
10920 && CONST_INT_P (op1
)
10921 && op2
== const0_rtx
10922 && CONST_INT_P (XEXP (op0
, 1))
10923 && aarch64_is_extend_from_extract (mode
,
10930 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
10932 else if (GET_CODE (x
) == SIGN_EXTEND
10933 || GET_CODE (x
) == ZERO_EXTEND
)
10934 return REG_P (XEXP (x
, 0));
static bool
aarch64_frint_unspec_p (unsigned int u)
{
  switch (u)
    {
      case UNSPEC_FRINTZ:
      case UNSPEC_FRINTP:
      case UNSPEC_FRINTM:
      case UNSPEC_FRINTA:
      case UNSPEC_FRINTN:
      case UNSPEC_FRINTX:
      case UNSPEC_FRINTI:
        return true;

      default:
        return false;
    }
}
/* Return true iff X is an rtx that will match an extr instruction
   i.e. as described in the *extr<mode>5_insn family of patterns.
   OP0 and OP1 will be set to the operands of the shifts involved
   on success and will be NULL_RTX otherwise.  */

static bool
aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
{
  rtx op0, op1;
  scalar_int_mode mode;
  if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
    return false;

  *res_op0 = NULL_RTX;
  *res_op1 = NULL_RTX;

  if (GET_CODE (x) != IOR)
    return false;

  op0 = XEXP (x, 0);
  op1 = XEXP (x, 1);

  if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
      || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
    {
      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
      if (GET_CODE (op1) == ASHIFT)
        std::swap (op0, op1);

      if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
        return false;

      unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
      unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));

      if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
          && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
        {
          *res_op0 = XEXP (op0, 0);
          *res_op1 = XEXP (op1, 0);
          return true;
        }
    }

  return false;
}
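/* Illustrative worked example (added for clarity; not part of the upstream
   source): in DImode, (ior (ashift x 48) (lshiftrt y 16)) has shift amounts
   48 and 16, which sum to GET_MODE_BITSIZE (DImode) == 64, so *RES_OP0 and
   *RES_OP1 are set to x and y and the function returns true; the combination
   corresponds to a single EXTR Xd, Xn, Xm, #16.  */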
11005 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
11006 storing it in *COST. Result is true if the total cost of the operation
11007 has now been calculated. */
11009 aarch64_if_then_else_costs (rtx op0
, rtx op1
, rtx op2
, int *cost
, bool speed
)
11013 enum rtx_code cmpcode
;
11015 if (COMPARISON_P (op0
))
11017 inner
= XEXP (op0
, 0);
11018 comparator
= XEXP (op0
, 1);
11019 cmpcode
= GET_CODE (op0
);
11024 comparator
= const0_rtx
;
11028 if (GET_CODE (op1
) == PC
|| GET_CODE (op2
) == PC
)
11030 /* Conditional branch. */
11031 if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
11035 if (cmpcode
== NE
|| cmpcode
== EQ
)
11037 if (comparator
== const0_rtx
)
11039 /* TBZ/TBNZ/CBZ/CBNZ. */
11040 if (GET_CODE (inner
) == ZERO_EXTRACT
)
11042 *cost
+= rtx_cost (XEXP (inner
, 0), VOIDmode
,
11043 ZERO_EXTRACT
, 0, speed
);
11046 *cost
+= rtx_cost (inner
, VOIDmode
, cmpcode
, 0, speed
);
11051 else if (cmpcode
== LT
|| cmpcode
== GE
)
11054 if (comparator
== const0_rtx
)
11059 else if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
11062 if (GET_CODE (op1
) == COMPARE
)
11064 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
11065 if (XEXP (op1
, 1) == const0_rtx
)
11069 machine_mode mode
= GET_MODE (XEXP (op1
, 0));
11070 const struct cpu_cost_table
*extra_cost
11071 = aarch64_tune_params
.insn_extra_cost
;
11073 if (GET_MODE_CLASS (mode
) == MODE_INT
)
11074 *cost
+= extra_cost
->alu
.arith
;
11076 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
11081 /* It's a conditional operation based on the status flags,
11082 so it must be some flavor of CSEL. */
11084 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
11085 if (GET_CODE (op1
) == NEG
11086 || GET_CODE (op1
) == NOT
11087 || (GET_CODE (op1
) == PLUS
&& XEXP (op1
, 1) == const1_rtx
))
11088 op1
= XEXP (op1
, 0);
11089 else if (GET_CODE (op1
) == ZERO_EXTEND
&& GET_CODE (op2
) == ZERO_EXTEND
)
11091 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
11092 op1
= XEXP (op1
, 0);
11093 op2
= XEXP (op2
, 0);
11096 *cost
+= rtx_cost (op1
, VOIDmode
, IF_THEN_ELSE
, 1, speed
);
11097 *cost
+= rtx_cost (op2
, VOIDmode
, IF_THEN_ELSE
, 2, speed
);
11101 /* We don't know what this is, cost all operands. */
11105 /* Check whether X is a bitfield operation of the form shift + extend that
11106 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
11107 operand to which the bitfield operation is applied. Otherwise return
11111 aarch64_extend_bitfield_pattern_p (rtx x
)
11113 rtx_code outer_code
= GET_CODE (x
);
11114 machine_mode outer_mode
= GET_MODE (x
);
11116 if (outer_code
!= ZERO_EXTEND
&& outer_code
!= SIGN_EXTEND
11117 && outer_mode
!= SImode
&& outer_mode
!= DImode
)
11120 rtx inner
= XEXP (x
, 0);
11121 rtx_code inner_code
= GET_CODE (inner
);
11122 machine_mode inner_mode
= GET_MODE (inner
);
11125 switch (inner_code
)
11128 if (CONST_INT_P (XEXP (inner
, 1))
11129 && (inner_mode
== QImode
|| inner_mode
== HImode
))
11130 op
= XEXP (inner
, 0);
11133 if (outer_code
== ZERO_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
11134 && (inner_mode
== QImode
|| inner_mode
== HImode
))
11135 op
= XEXP (inner
, 0);
11138 if (outer_code
== SIGN_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
11139 && (inner_mode
== QImode
|| inner_mode
== HImode
))
11140 op
= XEXP (inner
, 0);
/* Return true if the mask and a shift amount from an RTX of the form
   (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
   mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */

bool
aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
                                    rtx shft_amnt)
{
  return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
         && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
         && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
         && (INTVAL (mask)
             & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
}
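/* Illustrative worked example (added for clarity; not part of the upstream
   source): MASK == 0xff0 with SHFT_AMNT == 4 in SImode is accepted: 4 < 32,
   (0xff0 >> 4) + 1 == 0x100 is a power of two, and 0xff0 has no bits set
   below bit 4, so (x << 4) & 0xff0 can become a single UBFIZ of width 8 at
   position 4.  MASK == 0xff1 would be rejected by the final check.  */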
/* Return true if the masks and a shift amount from an RTX of the form
   ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
   a BFI instruction of mode MODE.  See *arch64_bfi patterns.  */

bool
aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
                                   unsigned HOST_WIDE_INT mask1,
                                   unsigned HOST_WIDE_INT shft_amnt,
                                   unsigned HOST_WIDE_INT mask2)
{
  unsigned HOST_WIDE_INT t;

  /* Verify that there is no overlap in what bits are set in the two masks.  */
  if (mask1 != ~mask2)
    return false;

  /* Verify that mask2 is not all zeros or ones.  */
  if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
    return false;

  /* The shift amount should always be less than the mode size.  */
  gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));

  /* Verify that the mask being shifted is contiguous and would be in the
     least significant bits after shifting by shft_amnt.  */
  t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
  return (t == (t & -t));
}
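/* Illustrative worked example (added for clarity; not part of the upstream
   source): for SImode with SHFT_AMNT == 16, MASK2 == 0x00ff0000 and MASK1
   equal to ~MASK2, the masks do not overlap and MASK2 is neither all-zero
   nor all-one; T == 0x00ff0000 + 0x10000 == 0x01000000 is a power of two,
   so the inserted field is contiguous and sits at bit 16 after the shift,
   allowing a single BFI of y into bits 16..23 of x.  MASK2 == 0x00f0f000
   would fail the final power-of-two test.  */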
11193 /* Calculate the cost of calculating X, storing it in *COST. Result
11194 is true if the total cost of the operation has now been calculated. */
11196 aarch64_rtx_costs (rtx x
, machine_mode mode
, int outer ATTRIBUTE_UNUSED
,
11197 int param ATTRIBUTE_UNUSED
, int *cost
, bool speed
)
11200 const struct cpu_cost_table
*extra_cost
11201 = aarch64_tune_params
.insn_extra_cost
;
11202 int code
= GET_CODE (x
);
11203 scalar_int_mode int_mode
;
11205 /* By default, assume that everything has equivalent cost to the
11206 cheapest instruction. Any additional costs are applied as a delta
11207 above this default. */
11208 *cost
= COSTS_N_INSNS (1);
11213 /* The cost depends entirely on the operands to SET. */
11215 op0
= SET_DEST (x
);
11218 switch (GET_CODE (op0
))
11223 rtx address
= XEXP (op0
, 0);
11224 if (VECTOR_MODE_P (mode
))
11225 *cost
+= extra_cost
->ldst
.storev
;
11226 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
11227 *cost
+= extra_cost
->ldst
.store
;
11228 else if (mode
== SFmode
)
11229 *cost
+= extra_cost
->ldst
.storef
;
11230 else if (mode
== DFmode
)
11231 *cost
+= extra_cost
->ldst
.stored
;
11234 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
11238 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
11242 if (! REG_P (SUBREG_REG (op0
)))
11243 *cost
+= rtx_cost (SUBREG_REG (op0
), VOIDmode
, SET
, 0, speed
);
11245 /* Fall through. */
11247 /* The cost is one per vector-register copied. */
11248 if (VECTOR_MODE_P (GET_MODE (op0
)) && REG_P (op1
))
11250 int nregs
= aarch64_hard_regno_nregs (V0_REGNUM
, GET_MODE (op0
));
11251 *cost
= COSTS_N_INSNS (nregs
);
11253 /* const0_rtx is in general free, but we will use an
11254 instruction to set a register to 0. */
11255 else if (REG_P (op1
) || op1
== const0_rtx
)
11257 /* The cost is 1 per register copied. */
11258 int nregs
= aarch64_hard_regno_nregs (R0_REGNUM
, GET_MODE (op0
));
11259 *cost
= COSTS_N_INSNS (nregs
);
11262 /* Cost is just the cost of the RHS of the set. */
11263 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
11268 /* Bit-field insertion. Strip any redundant widening of
11269 the RHS to meet the width of the target. */
11270 if (GET_CODE (op1
) == SUBREG
)
11271 op1
= SUBREG_REG (op1
);
11272 if ((GET_CODE (op1
) == ZERO_EXTEND
11273 || GET_CODE (op1
) == SIGN_EXTEND
)
11274 && CONST_INT_P (XEXP (op0
, 1))
11275 && is_a
<scalar_int_mode
> (GET_MODE (XEXP (op1
, 0)), &int_mode
)
11276 && GET_MODE_BITSIZE (int_mode
) >= INTVAL (XEXP (op0
, 1)))
11277 op1
= XEXP (op1
, 0);
11279 if (CONST_INT_P (op1
))
11281 /* MOV immediate is assumed to always be cheap. */
11282 *cost
= COSTS_N_INSNS (1);
11288 *cost
+= extra_cost
->alu
.bfi
;
11289 *cost
+= rtx_cost (op1
, VOIDmode
, (enum rtx_code
) code
, 1, speed
);
11295 /* We can't make sense of this, assume default cost. */
11296 *cost
= COSTS_N_INSNS (1);
11302 /* If an instruction can incorporate a constant within the
11303 instruction, the instruction's expression avoids calling
11304 rtx_cost() on the constant. If rtx_cost() is called on a
11305 constant, then it is usually because the constant must be
11306 moved into a register by one or more instructions.
11308 The exception is constant 0, which can be expressed
11309 as XZR/WZR and is therefore free. The exception to this is
11310 if we have (set (reg) (const0_rtx)) in which case we must cost
11311 the move. However, we can catch that when we cost the SET, so
11312 we don't need to consider that here. */
11313 if (x
== const0_rtx
)
11317 /* To an approximation, building any other constant is
11318 proportionally expensive to the number of instructions
11319 required to build that constant. This is true whether we
11320 are compiling for SPEED or otherwise. */
11321 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
11322 int_mode
= word_mode
;
11323 *cost
= COSTS_N_INSNS (aarch64_internal_mov_immediate
11324 (NULL_RTX
, x
, false, int_mode
));
11330 /* First determine number of instructions to do the move
11331 as an integer constant. */
11332 if (!aarch64_float_const_representable_p (x
)
11333 && !aarch64_can_const_movi_rtx_p (x
, mode
)
11334 && aarch64_float_const_rtx_p (x
))
11336 unsigned HOST_WIDE_INT ival
;
11337 bool succeed
= aarch64_reinterpret_float_as_int (x
, &ival
);
11338 gcc_assert (succeed
);
11340 scalar_int_mode imode
= (mode
== HFmode
11342 : int_mode_for_mode (mode
).require ());
11343 int ncost
= aarch64_internal_mov_immediate
11344 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
11345 *cost
+= COSTS_N_INSNS (ncost
);
11351 /* mov[df,sf]_aarch64. */
11352 if (aarch64_float_const_representable_p (x
))
11353 /* FMOV (scalar immediate). */
11354 *cost
+= extra_cost
->fp
[mode
== DFmode
].fpconst
;
11355 else if (!aarch64_float_const_zero_rtx_p (x
))
11357 /* This will be a load from memory. */
11358 if (mode
== DFmode
)
11359 *cost
+= extra_cost
->ldst
.loadd
;
11361 *cost
+= extra_cost
->ldst
.loadf
;
11364 /* Otherwise this is +0.0. We get this using MOVI d0, #0
11365 or MOV v0.s[0], wzr - neither of which are modeled by the
11366 cost tables. Just use the default cost. */
11376 /* For loads we want the base cost of a load, plus an
11377 approximation for the additional cost of the addressing
11379 rtx address
= XEXP (x
, 0);
11380 if (VECTOR_MODE_P (mode
))
11381 *cost
+= extra_cost
->ldst
.loadv
;
11382 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
11383 *cost
+= extra_cost
->ldst
.load
;
11384 else if (mode
== SFmode
)
11385 *cost
+= extra_cost
->ldst
.loadf
;
11386 else if (mode
== DFmode
)
11387 *cost
+= extra_cost
->ldst
.loadd
;
11390 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
11399 if (VECTOR_MODE_P (mode
))
11404 *cost
+= extra_cost
->vect
.alu
;
11409 if (GET_MODE_CLASS (mode
) == MODE_INT
)
11411 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
11412 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
11415 *cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, NEG
, 0, speed
);
11419 /* Cost this as SUB wzr, X. */
11420 op0
= CONST0_RTX (mode
);
11425 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
11427 /* Support (neg(fma...)) as a single instruction only if
11428 sign of zeros is unimportant. This matches the decision
11429 making in aarch64.md. */
11430 if (GET_CODE (op0
) == FMA
&& !HONOR_SIGNED_ZEROS (GET_MODE (op0
)))
11433 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
11436 if (GET_CODE (op0
) == MULT
)
11439 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
11444 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
11454 if (VECTOR_MODE_P (mode
))
11455 *cost
+= extra_cost
->vect
.alu
;
11457 *cost
+= extra_cost
->alu
.clz
;
11466 if (op1
== const0_rtx
11467 && GET_CODE (op0
) == AND
)
11470 mode
= GET_MODE (op0
);
11474 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
)
11476 /* TODO: A write to the CC flags possibly costs extra, this
11477 needs encoding in the cost tables. */
11479 mode
= GET_MODE (op0
);
11481 if (GET_CODE (op0
) == AND
)
11487 if (GET_CODE (op0
) == PLUS
)
11489 /* ADDS (and CMN alias). */
11494 if (GET_CODE (op0
) == MINUS
)
11501 if (GET_CODE (op0
) == ZERO_EXTRACT
&& op1
== const0_rtx
11502 && GET_MODE (x
) == CC_NZmode
&& CONST_INT_P (XEXP (op0
, 1))
11503 && CONST_INT_P (XEXP (op0
, 2)))
11505 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
11506 Handle it here directly rather than going to cost_logic
11507 since we know the immediate generated for the TST is valid
11508 so we can avoid creating an intermediate rtx for it only
11509 for costing purposes. */
11511 *cost
+= extra_cost
->alu
.logical
;
11513 *cost
+= rtx_cost (XEXP (op0
, 0), GET_MODE (op0
),
11514 ZERO_EXTRACT
, 0, speed
);
11518 if (GET_CODE (op1
) == NEG
)
11522 *cost
+= extra_cost
->alu
.arith
;
11524 *cost
+= rtx_cost (op0
, mode
, COMPARE
, 0, speed
);
11525 *cost
+= rtx_cost (XEXP (op1
, 0), mode
, NEG
, 1, speed
);
11531 Compare can freely swap the order of operands, and
11532 canonicalization puts the more complex operation first.
11533 But the integer MINUS logic expects the shift/extend
11534 operation in op1. */
11536 || (GET_CODE (op0
) == SUBREG
&& REG_P (SUBREG_REG (op0
)))))
11544 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_FLOAT
)
11548 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
11550 if (CONST_DOUBLE_P (op1
) && aarch64_float_const_zero_rtx_p (op1
))
11552 *cost
+= rtx_cost (op0
, VOIDmode
, COMPARE
, 0, speed
);
11553 /* FCMP supports constant 0.0 for no extra cost. */
11559 if (VECTOR_MODE_P (mode
))
11561 /* Vector compare. */
11563 *cost
+= extra_cost
->vect
.alu
;
11565 if (aarch64_float_const_zero_rtx_p (op1
))
11567 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
11581 *cost
+= rtx_cost (op0
, mode
, MINUS
, 0, speed
);
11583 /* Detect valid immediates. */
11584 if ((GET_MODE_CLASS (mode
) == MODE_INT
11585 || (GET_MODE_CLASS (mode
) == MODE_CC
11586 && GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
))
11587 && CONST_INT_P (op1
)
11588 && aarch64_uimm12_shift (INTVAL (op1
)))
11591 /* SUB(S) (immediate). */
11592 *cost
+= extra_cost
->alu
.arith
;
11596 /* Look for SUB (extended register). */
11597 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
11598 && aarch64_rtx_arith_op_extract_p (op1
, int_mode
))
11601 *cost
+= extra_cost
->alu
.extend_arith
;
11603 op1
= aarch64_strip_extend (op1
, true);
11604 *cost
+= rtx_cost (op1
, VOIDmode
,
11605 (enum rtx_code
) GET_CODE (op1
), 0, speed
);
11609 rtx new_op1
= aarch64_strip_extend (op1
, false);
11611 /* Cost this as an FMA-alike operation. */
11612 if ((GET_CODE (new_op1
) == MULT
11613 || aarch64_shift_p (GET_CODE (new_op1
)))
11614 && code
!= COMPARE
)
11616 *cost
+= aarch64_rtx_mult_cost (new_op1
, MULT
,
11617 (enum rtx_code
) code
,
11622 *cost
+= rtx_cost (new_op1
, VOIDmode
, MINUS
, 1, speed
);
11626 if (VECTOR_MODE_P (mode
))
11629 *cost
+= extra_cost
->vect
.alu
;
11631 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
11634 *cost
+= extra_cost
->alu
.arith
;
11636 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
11639 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
11653 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
11654 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
11657 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, PLUS
, 0, speed
);
11658 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
11662 if (GET_MODE_CLASS (mode
) == MODE_INT
11663 && (aarch64_plus_immediate (op1
, mode
)
11664 || aarch64_sve_addvl_addpl_immediate (op1
, mode
)))
11666 *cost
+= rtx_cost (op0
, mode
, PLUS
, 0, speed
);
11669 /* ADD (immediate). */
11670 *cost
+= extra_cost
->alu
.arith
;
11674 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
11676 /* Look for ADD (extended register). */
11677 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
11678 && aarch64_rtx_arith_op_extract_p (op0
, int_mode
))
11681 *cost
+= extra_cost
->alu
.extend_arith
;
11683 op0
= aarch64_strip_extend (op0
, true);
11684 *cost
+= rtx_cost (op0
, VOIDmode
,
11685 (enum rtx_code
) GET_CODE (op0
), 0, speed
);
11689 /* Strip any extend, leave shifts behind as we will
11690 cost them through mult_cost. */
11691 new_op0
= aarch64_strip_extend (op0
, false);
11693 if (GET_CODE (new_op0
) == MULT
11694 || aarch64_shift_p (GET_CODE (new_op0
)))
11696 *cost
+= aarch64_rtx_mult_cost (new_op0
, MULT
, PLUS
,
11701 *cost
+= rtx_cost (new_op0
, VOIDmode
, PLUS
, 0, speed
);
11705 if (VECTOR_MODE_P (mode
))
11708 *cost
+= extra_cost
->vect
.alu
;
11710 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
11713 *cost
+= extra_cost
->alu
.arith
;
11715 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
11718 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
11725 *cost
= COSTS_N_INSNS (1);
11729 if (VECTOR_MODE_P (mode
))
11730 *cost
+= extra_cost
->vect
.alu
;
11732 *cost
+= extra_cost
->alu
.rev
;
11737 if (aarch_rev16_p (x
))
11739 *cost
= COSTS_N_INSNS (1);
11743 if (VECTOR_MODE_P (mode
))
11744 *cost
+= extra_cost
->vect
.alu
;
11746 *cost
+= extra_cost
->alu
.rev
;
11751 if (aarch64_extr_rtx_p (x
, &op0
, &op1
))
11753 *cost
+= rtx_cost (op0
, mode
, IOR
, 0, speed
);
11754 *cost
+= rtx_cost (op1
, mode
, IOR
, 1, speed
);
11756 *cost
+= extra_cost
->alu
.shift
;
11760 /* Fall through. */
11767 if (VECTOR_MODE_P (mode
))
11770 *cost
+= extra_cost
->vect
.alu
;
11775 && GET_CODE (op0
) == MULT
11776 && CONST_INT_P (XEXP (op0
, 1))
11777 && CONST_INT_P (op1
)
11778 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0
, 1))),
11779 INTVAL (op1
)) != 0)
11781 /* This is a UBFM/SBFM. */
11782 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, ZERO_EXTRACT
, 0, speed
);
11784 *cost
+= extra_cost
->alu
.bfx
;
11788 if (is_int_mode (mode
, &int_mode
))
11790 if (CONST_INT_P (op1
))
11792 /* We have a mask + shift version of a UBFIZ
11793 i.e. the *andim_ashift<mode>_bfiz pattern. */
11794 if (GET_CODE (op0
) == ASHIFT
11795 && aarch64_mask_and_shift_for_ubfiz_p (int_mode
, op1
,
11798 *cost
+= rtx_cost (XEXP (op0
, 0), int_mode
,
11799 (enum rtx_code
) code
, 0, speed
);
11801 *cost
+= extra_cost
->alu
.bfx
;
11805 else if (aarch64_bitmask_imm (INTVAL (op1
), int_mode
))
11807 /* We possibly get the immediate for free, this is not
11809 *cost
+= rtx_cost (op0
, int_mode
,
11810 (enum rtx_code
) code
, 0, speed
);
11812 *cost
+= extra_cost
->alu
.logical
;
11821 /* Handle ORN, EON, or BIC. */
11822 if (GET_CODE (op0
) == NOT
)
11823 op0
= XEXP (op0
, 0);
11825 new_op0
= aarch64_strip_shift (op0
);
11827 /* If we had a shift on op0 then this is a logical-shift-
11828 by-register/immediate operation. Otherwise, this is just
11829 a logical operation. */
11832 if (new_op0
!= op0
)
11834 /* Shift by immediate. */
11835 if (CONST_INT_P (XEXP (op0
, 1)))
11836 *cost
+= extra_cost
->alu
.log_shift
;
11838 *cost
+= extra_cost
->alu
.log_shift_reg
;
11841 *cost
+= extra_cost
->alu
.logical
;
11844 /* In both cases we want to cost both operands. */
11845 *cost
+= rtx_cost (new_op0
, int_mode
, (enum rtx_code
) code
,
11847 *cost
+= rtx_cost (op1
, int_mode
, (enum rtx_code
) code
,
11857 op0
= aarch64_strip_shift (x
);
11859 if (VECTOR_MODE_P (mode
))
11862 *cost
+= extra_cost
->vect
.alu
;
11866 /* MVN-shifted-reg. */
11869 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
11872 *cost
+= extra_cost
->alu
.log_shift
;
11876 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
11877 Handle the second form here taking care that 'a' in the above can
11879 else if (GET_CODE (op0
) == XOR
)
11881 rtx newop0
= XEXP (op0
, 0);
11882 rtx newop1
= XEXP (op0
, 1);
11883 rtx op0_stripped
= aarch64_strip_shift (newop0
);
11885 *cost
+= rtx_cost (newop1
, mode
, (enum rtx_code
) code
, 1, speed
);
11886 *cost
+= rtx_cost (op0_stripped
, mode
, XOR
, 0, speed
);
11890 if (op0_stripped
!= newop0
)
11891 *cost
+= extra_cost
->alu
.log_shift
;
11893 *cost
+= extra_cost
->alu
.logical
;
11900 *cost
+= extra_cost
->alu
.logical
;
11907 /* If a value is written in SI mode, then zero extended to DI
11908 mode, the operation will in general be free as a write to
11909 a 'w' register implicitly zeroes the upper bits of an 'x'
11910 register. However, if this is
11912 (set (reg) (zero_extend (reg)))
11914 we must cost the explicit register move. */
11916 && GET_MODE (op0
) == SImode
11919 int op_cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, 0, speed
);
11921 /* If OP_COST is non-zero, then the cost of the zero extend
11922 is effectively the cost of the inner operation. Otherwise
11923 we have a MOV instruction and we take the cost from the MOV
11924 itself. This is true independently of whether we are
11925 optimizing for space or time. */
11931 else if (MEM_P (op0
))
11933 /* All loads can zero extend to any size for free. */
11934 *cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, param
, speed
);
11938 op0
= aarch64_extend_bitfield_pattern_p (x
);
11941 *cost
+= rtx_cost (op0
, mode
, ZERO_EXTEND
, 0, speed
);
11943 *cost
+= extra_cost
->alu
.bfx
;
11949 if (VECTOR_MODE_P (mode
))
11952 *cost
+= extra_cost
->vect
.alu
;
11956 /* We generate an AND instead of UXTB/UXTH. */
11957 *cost
+= extra_cost
->alu
.logical
;
11963 if (MEM_P (XEXP (x
, 0)))
11968 rtx address
= XEXP (XEXP (x
, 0), 0);
11969 *cost
+= extra_cost
->ldst
.load_sign_extend
;
11972 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
11978 op0
= aarch64_extend_bitfield_pattern_p (x
);
11981 *cost
+= rtx_cost (op0
, mode
, SIGN_EXTEND
, 0, speed
);
11983 *cost
+= extra_cost
->alu
.bfx
;
11989 if (VECTOR_MODE_P (mode
))
11990 *cost
+= extra_cost
->vect
.alu
;
11992 *cost
+= extra_cost
->alu
.extend
;
12000 if (CONST_INT_P (op1
))
12004 if (VECTOR_MODE_P (mode
))
12006 /* Vector shift (immediate). */
12007 *cost
+= extra_cost
->vect
.alu
;
12011 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
12013 *cost
+= extra_cost
->alu
.shift
;
12017 /* We can incorporate zero/sign extend for free. */
12018 if (GET_CODE (op0
) == ZERO_EXTEND
12019 || GET_CODE (op0
) == SIGN_EXTEND
)
12020 op0
= XEXP (op0
, 0);
12022 *cost
+= rtx_cost (op0
, VOIDmode
, ASHIFT
, 0, speed
);
12027 if (VECTOR_MODE_P (mode
))
12030 /* Vector shift (register). */
12031 *cost
+= extra_cost
->vect
.alu
;
12037 *cost
+= extra_cost
->alu
.shift_reg
;
12039 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
12040 && CONST_INT_P (XEXP (op1
, 1))
12041 && known_eq (INTVAL (XEXP (op1
, 1)),
12042 GET_MODE_BITSIZE (mode
) - 1))
12044 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
12045 /* We already demanded XEXP (op1, 0) to be REG_P, so
12046 don't recurse into it. */
12050 return false; /* All arguments need to be in registers. */
12060 if (CONST_INT_P (op1
))
12062 /* ASR (immediate) and friends. */
12065 if (VECTOR_MODE_P (mode
))
12066 *cost
+= extra_cost
->vect
.alu
;
12068 *cost
+= extra_cost
->alu
.shift
;
12071 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
12076 if (VECTOR_MODE_P (mode
))
12079 /* Vector shift (register). */
12080 *cost
+= extra_cost
->vect
.alu
;
12085 /* ASR (register) and friends. */
12086 *cost
+= extra_cost
->alu
.shift_reg
;
12088 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
12089 && CONST_INT_P (XEXP (op1
, 1))
12090 && known_eq (INTVAL (XEXP (op1
, 1)),
12091 GET_MODE_BITSIZE (mode
) - 1))
12093 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
12094 /* We already demanded XEXP (op1, 0) to be REG_P, so
12095 don't recurse into it. */
12099 return false; /* All arguments need to be in registers. */
12104 if (aarch64_cmodel
== AARCH64_CMODEL_LARGE
12105 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
)
12109 *cost
+= extra_cost
->ldst
.load
;
12111 else if (aarch64_cmodel
== AARCH64_CMODEL_SMALL
12112 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_PIC
)
12114 /* ADRP, followed by ADD. */
12115 *cost
+= COSTS_N_INSNS (1);
12117 *cost
+= 2 * extra_cost
->alu
.arith
;
12119 else if (aarch64_cmodel
== AARCH64_CMODEL_TINY
12120 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
12124 *cost
+= extra_cost
->alu
.arith
;
12129 /* One extra load instruction, after accessing the GOT. */
12130 *cost
+= COSTS_N_INSNS (1);
12132 *cost
+= extra_cost
->ldst
.load
;
12138 /* ADRP/ADD (immediate). */
12140 *cost
+= extra_cost
->alu
.arith
;
12148 if (VECTOR_MODE_P (mode
))
12149 *cost
+= extra_cost
->vect
.alu
;
12151 *cost
+= extra_cost
->alu
.bfx
;
12154 /* We can trust that the immediates used will be correct (there
12155 are no by-register forms), so we need only cost op0. */
12156 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (enum rtx_code
) code
, 0, speed
);
      *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
      /* aarch64_rtx_mult_cost always handles recursion to its ...  */

    /* We can expand signed mod by power of 2 using a NEGS, two parallel
       ANDs and a CSNEG.  Assume here that CSNEG costs the same as an
       unconditional negate.  This case should only ever be reached through
       the set_smod_pow2_cheap check in expmed.c.  */
      if (CONST_INT_P (XEXP (x, 1))
	  && exact_log2 (INTVAL (XEXP (x, 1))) > 0
	  && (mode == SImode || mode == DImode))
	  /* We expand to 4 instructions.  Reset the baseline.  */
	  *cost = COSTS_N_INSNS (4);

	    *cost += 2 * extra_cost->alu.logical
		     + 2 * extra_cost->alu.arith;
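      /* As an illustration (a sketch only, modulo register allocation and
	 the exact condition code used), for SImode "x % 4" the expansion
	 described above can look like:

	     negs  w1, w0           // w1 = -x, setting flags
	     and   w0, w0, 3        // remainder assuming x >= 0
	     and   w1, w1, 3        // remainder of -x
	     csneg w0, w0, w1, mi   // pick w0 if x > 0, else -w1

	 i.e. four instructions, which is why the baseline above is reset to
	 COSTS_N_INSNS (4).  */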
      /* Fall-through.  */

      /* Slightly prefer UMOD over SMOD.  */
      if (VECTOR_MODE_P (mode))
	*cost += extra_cost->vect.alu;
      else if (GET_MODE_CLASS (mode) == MODE_INT)
	*cost += (extra_cost->mult[mode == DImode].add
		  + extra_cost->mult[mode == DImode].idiv
		  + (code == MOD ? 1 : 0));

      return false;  /* All arguments need to be in registers.  */
      if (VECTOR_MODE_P (mode))
	*cost += extra_cost->vect.alu;
      else if (GET_MODE_CLASS (mode) == MODE_INT)
	/* There is no integer SQRT, so only DIV and UDIV can get here.  */
	*cost += (extra_cost->mult[mode == DImode].idiv
		  /* Slightly prefer UDIV over SDIV.  */
		  + (code == DIV ? 1 : 0));
	*cost += extra_cost->fp[mode == DFmode].div;

      return false;  /* All arguments need to be in registers.  */
      return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
					 XEXP (x, 2), cost, speed);

      return false; /* All arguments must be in registers.  */
      if (VECTOR_MODE_P (mode))
	*cost += extra_cost->vect.alu;
	*cost += extra_cost->fp[mode == DFmode].fma;

      /* FMSUB, FNMADD, and FNMSUB are free.  */
      if (GET_CODE (op0) == NEG)
	op0 = XEXP (op0, 0);

      if (GET_CODE (op2) == NEG)
	op2 = XEXP (op2, 0);

      /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
	 and the by-element operand as operand 0.  */
      if (GET_CODE (op1) == NEG)
	op1 = XEXP (op1, 0);

      /* Catch vector-by-element operations.  The by-element operand can
	 either be (vec_duplicate (vec_select (x))) or just
	 (vec_select (x)), depending on whether we are multiplying by
	 a vector or a scalar.

	 Canonicalization is not very good in these cases, FMA4 will put the
	 by-element operand as operand 0, FNMA4 will have it as operand 1.  */
      if (GET_CODE (op0) == VEC_DUPLICATE)
	op0 = XEXP (op0, 0);
      else if (GET_CODE (op1) == VEC_DUPLICATE)
	op1 = XEXP (op1, 0);

      if (GET_CODE (op0) == VEC_SELECT)
	op0 = XEXP (op0, 0);
      else if (GET_CODE (op1) == VEC_SELECT)
	op1 = XEXP (op1, 0);

      /* If the remaining parameters are not registers,
	 get the cost to put them into registers.  */
      *cost += rtx_cost (op0, mode, FMA, 0, speed);
      *cost += rtx_cost (op1, mode, FMA, 1, speed);
      *cost += rtx_cost (op2, mode, FMA, 2, speed);
    case UNSIGNED_FLOAT:
	*cost += extra_cost->fp[mode == DFmode].fromint;

      if (VECTOR_MODE_P (mode))
	  /* Vector truncate.  */
	  *cost += extra_cost->vect.alu;
	*cost += extra_cost->fp[mode == DFmode].widen;
    case FLOAT_TRUNCATE:
      if (VECTOR_MODE_P (mode))
	  /* Vector conversion.  */
	  *cost += extra_cost->vect.alu;
	*cost += extra_cost->fp[mode == DFmode].narrow;
      /* Strip the rounding part.  They will all be implemented
	 by the fcvt* family of instructions anyway.  */
      if (GET_CODE (x) == UNSPEC)
	  unsigned int uns_code = XINT (x, 1);

	  if (uns_code == UNSPEC_FRINTA
	      || uns_code == UNSPEC_FRINTM
	      || uns_code == UNSPEC_FRINTN
	      || uns_code == UNSPEC_FRINTP
	      || uns_code == UNSPEC_FRINTZ)
	    x = XVECEXP (x, 0, 0);

	  if (VECTOR_MODE_P (mode))
	    *cost += extra_cost->vect.alu;
	    *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;

      /* We can combine fmul by a power of 2 followed by a fcvt into a single
	 fixed-point fcvt.  */
      if (GET_CODE (x) == MULT
	  && ((VECTOR_MODE_P (mode)
	       && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
	      || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
	  *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,

      *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
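      /* For example, a scalar conversion such as "(int) (f * 16.0f)" can,
	 in principle, be emitted as a single fixed-point convert
	 (e.g. fcvtzs w0, s0, #4) instead of an fmul followed by a plain
	 fcvtzs, which is why the multiplication by a power of 2 is not
	 costed separately here.  */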
      if (VECTOR_MODE_P (mode))
	  /* ABS (vector).  */
	  *cost += extra_cost->vect.alu;
      else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
	  /* FABD, which is analogous to FADD.  */
	  if (GET_CODE (op0) == MINUS)
	      *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
	      *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
		*cost += extra_cost->fp[mode == DFmode].addsub;
	  /* Simple FABS is analogous to FNEG.  */
	    *cost += extra_cost->fp[mode == DFmode].neg;
	  /* Integer ABS will either be split to
	     two arithmetic instructions, or will be an ABS
	     (scalar), which we don't model.  */
	  *cost = COSTS_N_INSNS (2);
	    *cost += 2 * extra_cost->alu.arith;
      if (VECTOR_MODE_P (mode))
	*cost += extra_cost->vect.alu;
	  /* FMAXNM/FMINNM/FMAX/FMIN.
	     TODO: This may not be accurate for all implementations, but
	     we do not model this in the cost tables.  */
	  *cost += extra_cost->fp[mode == DFmode].addsub;
      /* The floating point round to integer frint* instructions.  */
      if (aarch64_frint_unspec_p (XINT (x, 1)))
	  *cost += extra_cost->fp[mode == DFmode].roundint;

      if (XINT (x, 1) == UNSPEC_RBIT)
	  *cost += extra_cost->alu.rev;
      /* Decompose <su>muldi3_highpart.  */
      if (/* (truncate:DI  */
	  && GET_MODE (XEXP (x, 0)) == TImode
	  && GET_CODE (XEXP (x, 0)) == LSHIFTRT
	  && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
	  /* (ANY_EXTEND:TI (reg:DI))
	     (ANY_EXTEND:TI (reg:DI)))  */
	  && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
	       && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
	      || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
		  && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
	  && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
	  && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
	  /* (const_int 64)  */
	  && CONST_INT_P (XEXP (XEXP (x, 0), 1))
	  && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
	  *cost += extra_cost->mult[mode == DImode].extend;
	  *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
			     mode, MULT, 0, speed);
	  *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
			     mode, MULT, 1, speed);
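      /* For reference, the RTL matched above has the overall shape

	   (truncate:DI (lshiftrt:TI (mult:TI (any_extend:TI (reg:DI))
					      (any_extend:TI (reg:DI)))
				     (const_int 64)))

	 i.e. the high 64 bits of a 64x64->128 multiply, which the port can
	 emit as a single SMULH/UMULH plus the cost of the two operands,
	 hence the "extend" multiply cost used above.  */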
      /* Fall through.  */

	  && flag_aarch64_verbose_cost)
	fprintf (dump_file,
		 "\nFailed to cost RTX.  Assuming default cost.\n");

/* Wrapper around aarch64_rtx_costs; dumps the partial or total cost
   calculated for X.  This cost is stored in *COST.  Returns true
   if the total cost of X was calculated.  */
aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
			   int param, int *cost, bool speed)
  bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);

      && flag_aarch64_verbose_cost)
      print_rtl_single (dump_file, x);
      fprintf (dump_file, "\n%s cost: %d (%s)\n",
	       speed ? "Hot" : "Cold",
	       *cost, result ? "final" : "partial");
aarch64_register_move_cost (machine_mode mode,
			    reg_class_t from_i, reg_class_t to_i)
  enum reg_class from = (enum reg_class) from_i;
  enum reg_class to = (enum reg_class) to_i;
  const struct cpu_regmove_cost *regmove_cost
    = aarch64_tune_params.regmove_cost;

  /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
  if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
  if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
    from = GENERAL_REGS;

  /* Make RDFFR very expensive.  In particular, if we know that the FFR
     contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
     as a way of obtaining a PTRUE.  */
  if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
      && hard_reg_set_subset_p (reg_class_contents[from_i],
				reg_class_contents[FFR_REGS]))

  /* Moving between GPR and stack cost is the same as GP2GP.  */
  if ((from == GENERAL_REGS && to == STACK_REG)
      || (to == GENERAL_REGS && from == STACK_REG))
    return regmove_cost->GP2GP;

  /* To/From the stack register, we move via the gprs.  */
  if (to == STACK_REG || from == STACK_REG)
    return aarch64_register_move_cost (mode, from, GENERAL_REGS)
	   + aarch64_register_move_cost (mode, GENERAL_REGS, to);

  if (known_eq (GET_MODE_SIZE (mode), 16))
      /* 128-bit operations on general registers require 2 instructions.  */
      if (from == GENERAL_REGS && to == GENERAL_REGS)
	return regmove_cost->GP2GP * 2;
      else if (from == GENERAL_REGS)
	return regmove_cost->GP2FP * 2;
      else if (to == GENERAL_REGS)
	return regmove_cost->FP2GP * 2;

      /* When AdvSIMD instructions are disabled it is not possible to move
	 a 128-bit value directly between Q registers.  This is handled in
	 secondary reload.  A general register is used as a scratch to move
	 the upper DI value and the lower DI value is moved directly,
	 hence the cost is the sum of three moves.  */
	return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;

      return regmove_cost->FP2FP;

  if (from == GENERAL_REGS && to == GENERAL_REGS)
    return regmove_cost->GP2GP;
  else if (from == GENERAL_REGS)
    return regmove_cost->GP2FP;
  else if (to == GENERAL_REGS)
    return regmove_cost->FP2GP;

  return regmove_cost->FP2FP;
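/* For example, under the scheme above a 128-bit value (say V4SImode) moved
   from GENERAL_REGS to FP_REGS is costed as 2 * GP2FP, since it needs two
   64-bit transfers, whereas an ordinary DImode move between general
   registers is just GP2GP.  */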
aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
			  reg_class_t rclass ATTRIBUTE_UNUSED,
			  bool in ATTRIBUTE_UNUSED)
  return aarch64_tune_params.memmov_cost;
/* Implement TARGET_INIT_BUILTINS.  */
aarch64_init_builtins ()
  aarch64_general_init_builtins ();
  aarch64_sve::init_builtins ();

/* Implement TARGET_FOLD_BUILTIN.  */
aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  tree type = TREE_TYPE (TREE_TYPE (fndecl));
  switch (code & AARCH64_BUILTIN_CLASS)
    case AARCH64_BUILTIN_GENERAL:
      return aarch64_general_fold_builtin (subcode, type, nargs, args);

    case AARCH64_BUILTIN_SVE:
  gcc_unreachable ();

/* Implement TARGET_GIMPLE_FOLD_BUILTIN.  */
aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
  gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
  tree fndecl = gimple_call_fndecl (stmt);
  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  gimple *new_stmt = NULL;
  switch (code & AARCH64_BUILTIN_CLASS)
    case AARCH64_BUILTIN_GENERAL:
      new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt);

    case AARCH64_BUILTIN_SVE:
      new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);

  gsi_replace (gsi, new_stmt, true);

/* Implement TARGET_EXPAND_BUILTIN.  */
aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  switch (code & AARCH64_BUILTIN_CLASS)
    case AARCH64_BUILTIN_GENERAL:
      return aarch64_general_expand_builtin (subcode, exp, target, ignore);

    case AARCH64_BUILTIN_SVE:
      return aarch64_sve::expand_builtin (subcode, exp, target);
  gcc_unreachable ();

/* Implement TARGET_BUILTIN_DECL.  */
aarch64_builtin_decl (unsigned int code, bool initialize_p)
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  switch (code & AARCH64_BUILTIN_CLASS)
    case AARCH64_BUILTIN_GENERAL:
      return aarch64_general_builtin_decl (subcode, initialize_p);

    case AARCH64_BUILTIN_SVE:
      return aarch64_sve::builtin_decl (subcode, initialize_p);
  gcc_unreachable ();
/* Return true if it is safe and beneficial to use the approximate rsqrt optabs
   to optimize 1.0/sqrt.  */
use_rsqrt_p (machine_mode mode)
  return (!flag_trapping_math
	  && flag_unsafe_math_optimizations
	  && ((aarch64_tune_params.approx_modes->recip_sqrt
	       & AARCH64_APPROX_MODE (mode))
	      || flag_mrecip_low_precision_sqrt));

/* Function to decide when to use the approximate reciprocal square root ...  */
aarch64_builtin_reciprocal (tree fndecl)
  machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));

  if (!use_rsqrt_p (mode))
  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  switch (code & AARCH64_BUILTIN_CLASS)
    case AARCH64_BUILTIN_GENERAL:
      return aarch64_general_builtin_rsqrt (subcode);

    case AARCH64_BUILTIN_SVE:
  gcc_unreachable ();
/* Emit instruction sequence to compute either the approximate square root
   or its approximate reciprocal, depending on the flag RECP, and return
   whether the sequence was emitted or not.  */
aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
  machine_mode mode = GET_MODE (dst);

  if (GET_MODE_INNER (mode) == HFmode)
      gcc_assert (!recp);

      if (!(flag_mlow_precision_sqrt
	    || (aarch64_tune_params.approx_modes->sqrt
		& AARCH64_APPROX_MODE (mode))))

  if (flag_finite_math_only
      || flag_trapping_math
      || !flag_unsafe_math_optimizations
      || optimize_function_for_size_p (cfun))

      /* Caller assumes we cannot fail.  */
      gcc_assert (use_rsqrt_p (mode));

  machine_mode mmsk = (VECTOR_MODE_P (mode)
		       ? related_int_vector_mode (mode).require ()
		       : int_mode_for_mode (mode).require ());
  rtx xmsk = gen_reg_rtx (mmsk);
    /* When calculating the approximate square root, compare the
       argument with 0.0 and create a mask.  */
    emit_insn (gen_rtx_SET (xmsk,
			    gen_rtx_EQ (mmsk, src,
					CONST0_RTX (mode)))));

  /* Estimate the approximate reciprocal square root.  */
  rtx xdst = gen_reg_rtx (mode);
  emit_insn (gen_aarch64_rsqrte (mode, xdst, src));

  /* Iterate over the series twice for SF and thrice for DF.  */
  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;

  /* Optionally iterate over the series once less for faster performance
     while sacrificing the accuracy.  */
  if ((recp && flag_mrecip_low_precision_sqrt)
      || (!recp && flag_mlow_precision_sqrt))

  /* Iterate over the series to calculate the approximate reciprocal square ...  */
  rtx x1 = gen_reg_rtx (mode);
  while (iterations--)
      rtx x2 = gen_reg_rtx (mode);
      emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));

      emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));

      if (iterations > 0)
	emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));

      /* Qualify the approximate reciprocal square root when the argument is
	 0.0 by squashing the intermediary result to 0.0.  */
      rtx xtmp = gen_reg_rtx (mmsk);
      emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
					gen_rtx_SUBREG (mmsk, xdst, 0)));
      emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));

      /* Calculate the approximate square root.  */
      emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));

  /* Finalize the approximation.  */
  emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
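/* For reference, the refinement loop above is the usual Newton-Raphson step
   for 1/sqrt(d):  x_{n+1} = x_n * (3 - d * x_n^2) / 2.  The FRSQRTS
   instruction computes (3 - a * b) / 2, so each pass multiplies the current
   estimate by FRSQRTS (src, x_n * x_n), as emitted in the loop.  */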
/* Emit the instruction sequence to compute the approximation for the division
   of NUM by DEN in QUO and return whether the sequence was emitted or not.  */
aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
  machine_mode mode = GET_MODE (quo);

  if (GET_MODE_INNER (mode) == HFmode)

  bool use_approx_division_p = (flag_mlow_precision_div
				|| (aarch64_tune_params.approx_modes->division
				    & AARCH64_APPROX_MODE (mode)));

  if (!flag_finite_math_only
      || flag_trapping_math
      || !flag_unsafe_math_optimizations
      || optimize_function_for_size_p (cfun)
      || !use_approx_division_p)

  if (!TARGET_SIMD && VECTOR_MODE_P (mode))

  /* Estimate the approximate reciprocal.  */
  rtx xrcp = gen_reg_rtx (mode);
  emit_insn (gen_aarch64_frecpe (mode, xrcp, den));

  /* Iterate over the series twice for SF and thrice for DF.  */
  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;

  /* Optionally iterate over the series once less for faster performance,
     while sacrificing the accuracy.  */
  if (flag_mlow_precision_div)

  /* Iterate over the series to calculate the approximate reciprocal.  */
  rtx xtmp = gen_reg_rtx (mode);
  while (iterations--)
      emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));

      if (iterations > 0)
	emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));

  if (num != CONST1_RTX (mode))
      /* As the approximate reciprocal of DEN is already calculated, only
	 calculate the approximate division when NUM is not 1.0.  */
      rtx xnum = force_reg (mode, num);
      emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));

  /* Finalize the approximation.  */
  emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
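/* For reference, the loop above is the standard Newton-Raphson refinement
   for 1/d:  x_{n+1} = x_n * (2 - d * x_n).  FRECPS computes (2 - a * b), so
   each pass multiplies the current estimate xrcp by FRECPS (xrcp, den).  */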
/* Return the number of instructions that can be issued per cycle.  */
aarch64_sched_issue_rate (void)
  return aarch64_tune_params.issue_rate;

/* Implement TARGET_SCHED_VARIABLE_ISSUE.  */
aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
  if (DEBUG_INSN_P (insn))

  rtx_code code = GET_CODE (PATTERN (insn));
  if (code == USE || code == CLOBBER)

  if (get_attr_type (insn) == TYPE_NO_INSN)

aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
  int issue_rate = aarch64_sched_issue_rate ();

  return issue_rate > 1 && !sched_fusion ? issue_rate : 0;

/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
   autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
   has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */
aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
  return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
/* Vectorizer cost model target hooks.  */

/* Implement targetm.vectorize.builtin_vectorization_cost.  */
aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
				    int misalign ATTRIBUTE_UNUSED)
  const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;

  if (vectype != NULL)
    fp = FLOAT_TYPE_P (vectype);

  switch (type_of_cost)
      return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;

      return costs->scalar_load_cost;

      return costs->scalar_store_cost;

      return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;

      return costs->vec_align_load_cost;

      return costs->vec_store_cost;

    case vec_to_scalar:
      return costs->vec_to_scalar_cost;

    case scalar_to_vec:
      return costs->scalar_to_vec_cost;

    case unaligned_load:
    case vector_gather_load:
      return costs->vec_unalign_load_cost;

    case unaligned_store:
    case vector_scatter_store:
      return costs->vec_unalign_store_cost;

    case cond_branch_taken:
      return costs->cond_taken_branch_cost;

    case cond_branch_not_taken:
      return costs->cond_not_taken_branch_cost;

      return costs->vec_permute_cost;

    case vec_promote_demote:
      return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;

    case vec_construct:
      elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
      return elements / 2 + 1;

      gcc_unreachable ();
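/* As a worked example of the vec_construct estimate above: building a V4SI
   vector from four scalar elements is costed as 4 / 2 + 1 = 3 units, a
   heuristic of roughly one operation per pair of elements plus one.  */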
/* Return true if STMT_INFO extends the result of a load.  */
aarch64_extending_load_p (stmt_vec_info stmt_info)
  gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
  if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))

  tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
  tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
  tree rhs_type = TREE_TYPE (rhs);
  if (!INTEGRAL_TYPE_P (lhs_type)
      || !INTEGRAL_TYPE_P (rhs_type)
      || TYPE_PRECISION (lhs_type) <= TYPE_PRECISION (rhs_type))

  stmt_vec_info def_stmt_info = stmt_info->vinfo->lookup_def (rhs);
  return (def_stmt_info
	  && STMT_VINFO_DATA_REF (def_stmt_info)
	  && DR_IS_READ (STMT_VINFO_DATA_REF (def_stmt_info)));

/* Return true if STMT_INFO is an integer truncation.  */
aarch64_integer_truncation_p (stmt_vec_info stmt_info)
  gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
  if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))

  tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
  tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
  return (INTEGRAL_TYPE_P (lhs_type)
	  && INTEGRAL_TYPE_P (rhs_type)
	  && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type));

/* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
   for STMT_INFO, which has cost kind KIND.  Adjust the cost as necessary
   for SVE targets.  */
static unsigned int
aarch64_sve_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
			      unsigned int stmt_cost)
  /* Unlike vec_promote_demote, vector_stmt conversions do not change the
     vector register size or number of units.  Integer promotions of this
     type therefore map to SXT[BHW] or UXT[BHW].

     Most loads have extending forms that can do the sign or zero extension
     on the fly.  Optimistically assume that a load followed by an extension
     will fold to this form during combine, and that the extension therefore ...  */
  if (kind == vector_stmt && aarch64_extending_load_p (stmt_info))

  /* For similar reasons, vector_stmt integer truncations are a no-op,
     because we can just ignore the unused upper bits of the source.  */
  if (kind == vector_stmt && aarch64_integer_truncation_p (stmt_info))
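/* For instance, a vectorized widening such as loading int8_t data and sign
   extending it to a wider element type is expected to become a single
   extending load (e.g. an SVE LD1SB into wider lanes), so the separate
   extension statement is treated as free by the adjustment above.  */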
/* Implement targetm.vectorize.add_stmt_cost.  */
aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
		       struct _stmt_vec_info *stmt_info, int misalign,
		       enum vect_cost_model_location where)
  unsigned *cost = (unsigned *) data;
  unsigned retval = 0;

  if (flag_vect_cost_model)
      tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
	aarch64_builtin_vectorization_cost (kind, vectype, misalign);

      if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
	stmt_cost = aarch64_sve_adjust_stmt_cost (kind, stmt_info, stmt_cost);

      /* Statements in an inner loop relative to the loop being
	 vectorized are weighted more heavily.  The value here is
	 arbitrary and could potentially be improved with analysis.  */
      if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
	count *= 50; /* FIXME  */

      retval = (unsigned) (count * stmt_cost);
      cost[where] += retval;
static void initialize_aarch64_code_model (struct gcc_options *);

/* Parse the TO_PARSE string and put the architecture struct that it
   selects into RES and the architectural features into ISA_FLAGS.
   Return an aarch64_parse_opt_result describing the parse result.
   If there is an error parsing, RES and ISA_FLAGS are left unchanged.
   When the TO_PARSE string contains an invalid extension,
   a copy of the string is created and stored to INVALID_EXTENSION.  */

static enum aarch64_parse_opt_result
aarch64_parse_arch (const char *to_parse, const struct processor **res,
		    uint64_t *isa_flags, std::string *invalid_extension)
  const struct processor *arch;

  ext = strchr (to_parse, '+');

    len = ext - to_parse;
    len = strlen (to_parse);

    return AARCH64_PARSE_MISSING_ARG;

  /* Loop through the list of supported ARCHes to find a match.  */
  for (arch = all_architectures; arch->name != NULL; arch++)
      if (strlen (arch->name) == len
	  && strncmp (arch->name, to_parse, len) == 0)
	  uint64_t isa_temp = arch->flags;

	      /* TO_PARSE string contains at least one extension.  */
	      enum aarch64_parse_opt_result ext_res
		= aarch64_parse_extension (ext, &isa_temp, invalid_extension);

	      if (ext_res != AARCH64_PARSE_OK)
	  /* Extension parsing was successful.  Confirm the result
	     arch and ISA flags.  */
	  *isa_flags = isa_temp;
	  return AARCH64_PARSE_OK;

  /* ARCH name not found in list.  */
  return AARCH64_PARSE_INVALID_ARG;
/* Parse the TO_PARSE string and put the resulting cpu in RES and the
   architecture flags in ISA_FLAGS.  Return an aarch64_parse_opt_result
   describing the parse result.  If there is an error parsing, RES and
   ISA_FLAGS are left unchanged.
   When the TO_PARSE string contains an invalid extension,
   a copy of the string is created and stored to INVALID_EXTENSION.  */

static enum aarch64_parse_opt_result
aarch64_parse_cpu (const char *to_parse, const struct processor **res,
		   uint64_t *isa_flags, std::string *invalid_extension)
  const struct processor *cpu;

  ext = strchr (to_parse, '+');

    len = ext - to_parse;
    len = strlen (to_parse);

    return AARCH64_PARSE_MISSING_ARG;

  /* Loop through the list of supported CPUs to find a match.  */
  for (cpu = all_cores; cpu->name != NULL; cpu++)
      if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
	  uint64_t isa_temp = cpu->flags;

	      /* TO_PARSE string contains at least one extension.  */
	      enum aarch64_parse_opt_result ext_res
		= aarch64_parse_extension (ext, &isa_temp, invalid_extension);

	      if (ext_res != AARCH64_PARSE_OK)
	  /* Extension parsing was successful.  Confirm the result
	     cpu and ISA flags.  */
	  *isa_flags = isa_temp;
	  return AARCH64_PARSE_OK;

  /* CPU name not found in list.  */
  return AARCH64_PARSE_INVALID_ARG;
/* Parse the TO_PARSE string and put the cpu it selects into RES.
   Return an aarch64_parse_opt_result describing the parse result.
   If the parsing fails, RES does not change.  */

static enum aarch64_parse_opt_result
aarch64_parse_tune (const char *to_parse, const struct processor **res)
  const struct processor *cpu;

  /* Loop through the list of supported CPUs to find a match.  */
  for (cpu = all_cores; cpu->name != NULL; cpu++)
      if (strcmp (cpu->name, to_parse) == 0)
	  return AARCH64_PARSE_OK;

  /* CPU name not found in list.  */
  return AARCH64_PARSE_INVALID_ARG;
/* Parse TOKEN, which has length LENGTH, to see if it is an option
   described in FLAG.  If it is, return the index bit for that fusion type.
   If not, error (printing OPTION_NAME) and return zero.  */

static unsigned int
aarch64_parse_one_option_token (const char *token,
				const struct aarch64_flag_desc *flag,
				const char *option_name)
  for (; flag->name != NULL; flag++)
      if (length == strlen (flag->name)
	  && !strncmp (flag->name, token, length))

  error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);

/* Parse OPTION, which is a list of flags to enable separated by '.'.
   FLAGS gives the list of flags we understand, INITIAL_STATE gives any
   default state we inherit from the CPU tuning structures.  OPTION_NAME
   gives the top-level option we are parsing in the -moverride string,
   for use in error messages.  */

static unsigned int
aarch64_parse_boolean_options (const char *option,
			       const struct aarch64_flag_desc *flags,
			       unsigned int initial_state,
			       const char *option_name)
  const char separator = '.';
  const char* specs = option;
  const char* ntoken = option;
  unsigned int found_flags = initial_state;

  while ((ntoken = strchr (specs, separator)))
      size_t token_length = ntoken - specs;
      unsigned token_ops = aarch64_parse_one_option_token (specs,

      /* If we find "none" (or, for simplicity's sake, an error) anywhere
	 in the token stream, reset the supported operations.  So:

	    adrp+add.cmp+branch.none.adrp+add

	 would have the result of turning on only adrp+add fusion.  */

      found_flags |= token_ops;

  /* We ended with a trailing separator, print something.  */
      error ("%s string ill-formed\n", option_name);

  /* We still have one more token to parse.  */
  size_t token_length = strlen (specs);
  unsigned token_ops = aarch64_parse_one_option_token (specs,

  found_flags |= token_ops;
  return found_flags;
/* Support for overriding instruction fusion.  */

aarch64_parse_fuse_string (const char *fuse_string,
			   struct tune_params *tune)
  tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
						     aarch64_fusible_pairs,

/* Support for overriding other tuning flags.  */

aarch64_parse_tune_string (const char *tune_string,
			   struct tune_params *tune)
  tune->extra_tuning_flags
    = aarch64_parse_boolean_options (tune_string,
				     aarch64_tuning_flags,
				     tune->extra_tuning_flags,

/* Parse the sve_width tuning -moverride string in TUNE_STRING.
   Accept the valid SVE vector widths allowed by
   aarch64_sve_vector_bits_enum and use it to override sve_width ...  */
aarch64_parse_sve_width_string (const char *tune_string,
				struct tune_params *tune)
  int n = sscanf (tune_string, "%d", &width);

    error ("invalid format for sve_width");

    error ("invalid sve_width value: %d", width);

  tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
/* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
   we understand.  If it is, extract the option string and hand off to
   the appropriate function.  */

aarch64_parse_one_override_token (const char* token,
				  struct tune_params *tune)
  const struct aarch64_tuning_override_function *fn
    = aarch64_tuning_override_functions;

  const char *option_part = strchr (token, '=');
      error ("tuning string missing in option (%s)", token);

  /* Get the length of the option name.  */
  length = option_part - token;
  /* Skip the '=' to get to the option string.  */

  for (; fn->name != NULL; fn++)
      if (!strncmp (fn->name, token, length))
	  fn->parse_override (option_part, tune);

  error ("unknown tuning option (%s)", token);
/* A checking mechanism for the implementation of the tls size.  */

initialize_aarch64_tls_size (struct gcc_options *opts)
  if (aarch64_tls_size == 0)
    aarch64_tls_size = 24;

  switch (opts->x_aarch64_cmodel_var)
    case AARCH64_CMODEL_TINY:
      /* Both the default and maximum TLS size allowed under tiny are 1M,
	 which needs two instructions to address, so we clamp the size
	 to 24.  */
      if (aarch64_tls_size > 24)
	aarch64_tls_size = 24;

    case AARCH64_CMODEL_SMALL:
      /* The maximum TLS size allowed under small is 4G.  */
      if (aarch64_tls_size > 32)
	aarch64_tls_size = 32;

    case AARCH64_CMODEL_LARGE:
      /* The maximum TLS size allowed under large is 16E.
	 FIXME: 16E should be 64bit, we only support 48bit offset now.  */
      if (aarch64_tls_size > 48)
	aarch64_tls_size = 48;

      gcc_unreachable ();
/* Parse STRING looking for options in the format:
     string	:: option:string
     option	:: name=substring
     substring	:: defined by option.  */

aarch64_parse_override_string (const char* input_string,
			       struct tune_params* tune)
  const char separator = ':';
  size_t string_length = strlen (input_string) + 1;
  char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
  char *string = string_root;
  strncpy (string, input_string, string_length);
  string[string_length - 1] = '\0';

  char* ntoken = string;

  while ((ntoken = strchr (string, separator)))
      size_t token_length = ntoken - string;
      /* Make this substring look like a string.  */
      aarch64_parse_one_override_token (string, token_length, tune);

  /* One last option to parse.  */
  aarch64_parse_one_override_token (string, strlen (string), tune);
  free (string_root);
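/* As an example of the syntax parsed above, a command line such as
   -moverride=fuse=adrp+add.cmp+branch:sve_width=256 (assuming those tuning
   option names are provided by aarch64_tuning_override_functions) is split
   on ':' here, each name=value pair is handed to its parser, and the fusion
   value is then further split on '.' by aarch64_parse_boolean_options.  */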
aarch64_override_options_after_change_1 (struct gcc_options *opts)
  if (accepted_branch_protection_string)
      opts->x_aarch64_branch_protection_string
	= xstrdup (accepted_branch_protection_string);

  /* PR 70044: We have to be careful about being called multiple times for the
     same function.  This means all changes should be repeatable.  */

  /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
     Disable the frame pointer flag so the mid-end will not use a frame
     pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
     Set x_flag_omit_frame_pointer to the special value 2 to differentiate
     between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2).  */
  aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
  if (opts->x_flag_omit_frame_pointer == 0)
    opts->x_flag_omit_frame_pointer = 2;

  /* If not optimizing for size, set the default
     alignment to what the target wants.  */
  if (!opts->x_optimize_size)
      if (opts->x_flag_align_loops && !opts->x_str_align_loops)
	opts->x_str_align_loops = aarch64_tune_params.loop_align;
      if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
	opts->x_str_align_jumps = aarch64_tune_params.jump_align;
      if (opts->x_flag_align_functions && !opts->x_str_align_functions)
	opts->x_str_align_functions = aarch64_tune_params.function_align;

  /* We default to no pc-relative literal loads.  */

  aarch64_pcrelative_literal_loads = false;

  /* If -mpc-relative-literal-loads is set on the command line, this
     implies that the user asked for PC relative literal loads.  */
  if (opts->x_pcrelative_literal_loads == 1)
    aarch64_pcrelative_literal_loads = true;

  /* In the tiny memory model it makes no sense to disallow PC relative
     literal pool loads.  */
  if (aarch64_cmodel == AARCH64_CMODEL_TINY
      || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
    aarch64_pcrelative_literal_loads = true;

  /* When enabling the lower precision Newton series for the square root, also
     enable it for the reciprocal square root, since the latter is an
     intermediary step for the former.  */
  if (flag_mlow_precision_sqrt)
    flag_mrecip_low_precision_sqrt = true;
/* 'Unpack' the internal tuning structs and update the options
    in OPTS.  The caller must have set up selected_tune and selected_arch
    as all the other target-specific codegen decisions are
    derived from them.  */

aarch64_override_options_internal (struct gcc_options *opts)
  aarch64_tune_flags = selected_tune->flags;
  aarch64_tune = selected_tune->sched_core;
  /* Make a copy of the tuning parameters attached to the core, which
     we may later overwrite.  */
  aarch64_tune_params = *(selected_tune->tune);
  aarch64_architecture_version = selected_arch->architecture_version;

  if (opts->x_aarch64_override_tune_string)
    aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
				   &aarch64_tune_params);

  /* This target defaults to strict volatile bitfields.  */
  if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
    opts->x_flag_strict_volatile_bitfields = 1;

  if (aarch64_stack_protector_guard == SSP_GLOBAL
      && opts->x_aarch64_stack_protector_guard_offset_str)
      error ("incompatible options %<-mstack-protector-guard=global%> and "
	     "%<-mstack-protector-guard-offset=%s%>",
	     aarch64_stack_protector_guard_offset_str);

  if (aarch64_stack_protector_guard == SSP_SYSREG
      && !(opts->x_aarch64_stack_protector_guard_offset_str
	   && opts->x_aarch64_stack_protector_guard_reg_str))
      error ("both %<-mstack-protector-guard-offset%> and "
	     "%<-mstack-protector-guard-reg%> must be used "
	     "with %<-mstack-protector-guard=sysreg%>");

  if (opts->x_aarch64_stack_protector_guard_reg_str)
      if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
	  error ("specify a system register with a small string length.");

  if (opts->x_aarch64_stack_protector_guard_offset_str)
      const char *str = aarch64_stack_protector_guard_offset_str;
      long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
      if (!*str || *end || errno)
	error ("%qs is not a valid offset in %qs", str,
	       "-mstack-protector-guard-offset=");
      aarch64_stack_protector_guard_offset = offs;

  initialize_aarch64_code_model (opts);
  initialize_aarch64_tls_size (opts);

  int queue_depth = 0;
  switch (aarch64_tune_params.autoprefetcher_model)
    case tune_params::AUTOPREFETCHER_OFF:
    case tune_params::AUTOPREFETCHER_WEAK:
    case tune_params::AUTOPREFETCHER_STRONG:
      queue_depth = max_insn_queue_index + 1;
      gcc_unreachable ();

  /* We don't mind passing in global_options_set here as we don't use
     the *options_set structs anyway.  */
  SET_OPTION_IF_UNSET (opts, &global_options_set,
		       param_sched_autopref_queue_depth, queue_depth);

  /* Set up parameters to be used in prefetching algorithm.  Do not
     override the defaults unless we are tuning for a core we have
     researched values for.  */
  if (aarch64_tune_params.prefetch->num_slots > 0)
    SET_OPTION_IF_UNSET (opts, &global_options_set,
			 param_simultaneous_prefetches,
			 aarch64_tune_params.prefetch->num_slots);
  if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
    SET_OPTION_IF_UNSET (opts, &global_options_set,
			 param_l1_cache_size,
			 aarch64_tune_params.prefetch->l1_cache_size);
  if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
    SET_OPTION_IF_UNSET (opts, &global_options_set,
			 param_l1_cache_line_size,
			 aarch64_tune_params.prefetch->l1_cache_line_size);
  if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
    SET_OPTION_IF_UNSET (opts, &global_options_set,
			 param_l2_cache_size,
			 aarch64_tune_params.prefetch->l2_cache_size);
  if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
    SET_OPTION_IF_UNSET (opts, &global_options_set,
			 param_prefetch_dynamic_strides, 0);
  if (aarch64_tune_params.prefetch->minimum_stride >= 0)
    SET_OPTION_IF_UNSET (opts, &global_options_set,
			 param_prefetch_minimum_stride,
			 aarch64_tune_params.prefetch->minimum_stride);

  /* Use the alternative scheduling-pressure algorithm by default.  */
  SET_OPTION_IF_UNSET (opts, &global_options_set,
		       param_sched_pressure_algorithm,
		       SCHED_PRESSURE_MODEL);

  /* Validate the guard size.  */
  int guard_size = param_stack_clash_protection_guard_size;

  if (guard_size != 12 && guard_size != 16)
    error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
	   "size.  Given value %d (%llu KB) is out of range",
	   guard_size, (1ULL << guard_size) / 1024ULL);

  /* Enforce that interval is the same size as size so the mid-end does the ...  */
  SET_OPTION_IF_UNSET (opts, &global_options_set,
		       param_stack_clash_protection_probe_interval,

  /* The maybe_set calls won't update the value if the user has explicitly set
     one, which means we need to validate that probing interval and guard size ...  */
    = param_stack_clash_protection_probe_interval;
  if (guard_size != probe_interval)
    error ("stack clash guard size %<%d%> must be equal to probing interval "
	   "%<%d%>", guard_size, probe_interval);

  /* Enable sw prefetching at specified optimization level for
     CPUS that have prefetch.  Lower optimization level threshold by 1
     when profiling is enabled.  */
  if (opts->x_flag_prefetch_loop_arrays < 0
      && !opts->x_optimize_size
      && aarch64_tune_params.prefetch->default_opt_level >= 0
      && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
    opts->x_flag_prefetch_loop_arrays = 1;

  if (opts->x_aarch64_arch_string == NULL)
    opts->x_aarch64_arch_string = selected_arch->name;
  if (opts->x_aarch64_cpu_string == NULL)
    opts->x_aarch64_cpu_string = selected_cpu->name;
  if (opts->x_aarch64_tune_string == NULL)
    opts->x_aarch64_tune_string = selected_tune->name;

  aarch64_override_options_after_change_1 (opts);
/* Print a hint with a suggestion for a core or architecture name that
   most closely resembles what the user passed in STR.  ARCH is true if
   the user is asking for an architecture name.  ARCH is false if the user
   is asking for a core name.  */

aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
  auto_vec<const char *> candidates;
  const struct processor *entry = arch ? all_architectures : all_cores;
  for (; entry->name != NULL; entry++)
    candidates.safe_push (entry->name);

#ifdef HAVE_LOCAL_CPU_DETECT
  /* Add also "native" as possible value.  */
  candidates.safe_push ("native");

  const char *hint = candidates_list_and_hint (str, s, candidates);
    inform (input_location, "valid arguments are: %s;"
	    " did you mean %qs?", s, hint);
    inform (input_location, "valid arguments are: %s", s);

/* Print a hint with a suggestion for a core name that most closely resembles
   what the user passed in STR.  */

aarch64_print_hint_for_core (const char *str)
  aarch64_print_hint_for_core_or_arch (str, false);

/* Print a hint with a suggestion for an architecture name that most closely
   resembles what the user passed in STR.  */

aarch64_print_hint_for_arch (const char *str)
  aarch64_print_hint_for_core_or_arch (str, true);

/* Print a hint with a suggestion for an extension name
   that most closely resembles what the user passed in STR.  */

aarch64_print_hint_for_extensions (const std::string &str)
  auto_vec<const char *> candidates;
  aarch64_get_all_extension_candidates (&candidates);

  const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
    inform (input_location, "valid arguments are: %s;"
	    " did you mean %qs?", s, hint);
    inform (input_location, "valid arguments are: %s;", s);
/* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
   specified in STR and throw errors if appropriate.  Put the results,
   if they are valid, in RES and ISA_FLAGS.  Return whether the option is
   valid.  */
aarch64_validate_mcpu (const char *str, const struct processor **res,
		       uint64_t *isa_flags)
  std::string invalid_extension;
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);

  if (parse_res == AARCH64_PARSE_OK)

    case AARCH64_PARSE_MISSING_ARG:
      error ("missing cpu name in %<-mcpu=%s%>", str);

    case AARCH64_PARSE_INVALID_ARG:
      error ("unknown value %qs for %<-mcpu%>", str);
      aarch64_print_hint_for_core (str);

    case AARCH64_PARSE_INVALID_FEATURE:
      error ("invalid feature modifier %qs in %<-mcpu=%s%>",
	     invalid_extension.c_str (), str);
      aarch64_print_hint_for_extensions (invalid_extension);

      gcc_unreachable ();
/* Parses CONST_STR for branch protection features specified in
   aarch64_branch_protect_types, and sets any global variables required.
   Returns the parsing result and assigns the last processed token to
   LAST_STR so that it can be used for error reporting.  */

aarch64_parse_opt_result
aarch64_parse_branch_protection (const char *const_str,
  char *str_root = xstrdup (const_str);
  char* token_save = NULL;
  char *str = strtok_r (str_root, "+", &token_save);
  enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
    res = AARCH64_PARSE_MISSING_ARG;
      char *next_str = strtok_r (NULL, "+", &token_save);
      /* Reset the branch protection features to their defaults.  */
      aarch64_handle_no_branch_protection (NULL, NULL);

  while (str && res == AARCH64_PARSE_OK)
      const aarch64_branch_protect_type *type = aarch64_branch_protect_types;
      bool found = false;
      /* Search for this type.  */
      while (type && type->name && !found && res == AARCH64_PARSE_OK)
	  if (strcmp (str, type->name) == 0)
	      res = type->handler (str, next_str);
	      next_str = strtok_r (NULL, "+", &token_save);

      if (found && res == AARCH64_PARSE_OK)
	  bool found_subtype = true;
	  /* Loop through each token until we find one that isn't a ...  */
	  while (found_subtype)
	      found_subtype = false;
	      const aarch64_branch_protect_type *subtype = type->subtypes;
	      /* Search for the subtype.  */
	      while (str && subtype && subtype->name && !found_subtype
		     && res == AARCH64_PARSE_OK)
		  if (strcmp (str, subtype->name) == 0)
		      found_subtype = true;
		      res = subtype->handler (str, next_str);
		      next_str = strtok_r (NULL, "+", &token_save);
	  res = AARCH64_PARSE_INVALID_ARG;

  /* Copy the last processed token into the argument to pass it back.
     Used by option and attribute validation to print the offending token.  */
      if (str) strcpy (*last_str, str);
      else *last_str = NULL;

  if (res == AARCH64_PARSE_OK)
      /* If needed, alloc the accepted string then copy in const_str.
	 Used by override_option_after_change_1.  */
      if (!accepted_branch_protection_string)
	accepted_branch_protection_string
	  = (char *) xmalloc (BRANCH_PROTECT_STR_MAX
      strncpy (accepted_branch_protection_string, const_str,
	       BRANCH_PROTECT_STR_MAX + 1);
      /* Forcibly null-terminate.  */
      accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';

aarch64_validate_mbranch_protection (const char *const_str)
  char *str = (char *) xmalloc (strlen (const_str));
  enum aarch64_parse_opt_result res =
    aarch64_parse_branch_protection (const_str, &str);
  if (res == AARCH64_PARSE_INVALID_ARG)
    error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
  else if (res == AARCH64_PARSE_MISSING_ARG)
    error ("missing argument for %<-mbranch-protection=%>");
  return res == AARCH64_PARSE_OK;
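/* Typical inputs accepted by the parser above include strings such as
   -mbranch-protection=standard or -mbranch-protection=pac-ret+leaf+bti,
   where "pac-ret" is a top-level type and "leaf" one of its subtypes
   (the exact set of names comes from aarch64_branch_protect_types).  */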
/* Validate a command-line -march option.  Parse the arch and extensions
   (if any) specified in STR and throw errors if appropriate.  Put the
   results, if they are valid, in RES and ISA_FLAGS.  Return whether the
   option is valid.  */

aarch64_validate_march (const char *str, const struct processor **res,
			uint64_t *isa_flags)
  std::string invalid_extension;
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);

  if (parse_res == AARCH64_PARSE_OK)

    case AARCH64_PARSE_MISSING_ARG:
      error ("missing arch name in %<-march=%s%>", str);

    case AARCH64_PARSE_INVALID_ARG:
      error ("unknown value %qs for %<-march%>", str);
      aarch64_print_hint_for_arch (str);

    case AARCH64_PARSE_INVALID_FEATURE:
      error ("invalid feature modifier %qs in %<-march=%s%>",
	     invalid_extension.c_str (), str);
      aarch64_print_hint_for_extensions (invalid_extension);

      gcc_unreachable ();

/* Validate a command-line -mtune option.  Parse the cpu
   specified in STR and throw errors if appropriate.  Put the
   result, if it is valid, in RES.  Return whether the option is
   valid.  */
aarch64_validate_mtune (const char *str, const struct processor **res)
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_tune (str, res);

  if (parse_res == AARCH64_PARSE_OK)

    case AARCH64_PARSE_MISSING_ARG:
      error ("missing cpu name in %<-mtune=%s%>", str);

    case AARCH64_PARSE_INVALID_ARG:
      error ("unknown value %qs for %<-mtune%>", str);
      aarch64_print_hint_for_core (str);

      gcc_unreachable ();
/* Return the CPU corresponding to the enum CPU.
   If it doesn't specify a cpu, return the default.  */

static const struct processor *
aarch64_get_tune_cpu (enum aarch64_processor cpu)
  if (cpu != aarch64_none)
    return &all_cores[cpu];

  /* The & 0x3f is to extract the bottom 6 bits that encode the
     default cpu as selected by the --with-cpu GCC configure option

     ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
     flags mechanism should be reworked to make it more sane.  */
  return &all_cores[TARGET_CPU_DEFAULT & 0x3f];

/* Return the architecture corresponding to the enum ARCH.
   If it doesn't specify a valid architecture, return the default.  */

static const struct processor *
aarch64_get_arch (enum aarch64_arch arch)
  if (arch != aarch64_no_arch)
    return &all_architectures[arch];

  const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];

  return &all_architectures[cpu->arch];
/* Return the VG value associated with -msve-vector-bits= value VALUE.  */

aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
  /* 128-bit SVE and Advanced SIMD modes use different register layouts
     on big-endian targets, so we would need to forbid subregs that convert
     from one to the other.  By default a reinterpret sequence would then
     involve a store to memory in one mode and a load back in the other.
     Even if we optimize that sequence using reverse instructions,
     it would still be a significant potential overhead.

     For now, it seems better to generate length-agnostic code for that ...  */
  if (value == SVE_SCALABLE
      || (value == SVE_128 && BYTES_BIG_ENDIAN))
    return poly_uint16 (2, 2);

  return (int) value / 64;
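/* For example, -msve-vector-bits=256 yields VG = 256 / 64 = 4, i.e. four
   64-bit granules per SVE vector, while -msve-vector-bits=scalable keeps the
   length-agnostic poly_uint16 (2, 2) encoding returned above.  */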
/* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
   and is used to parse the -m{cpu,tune,arch} strings and setup the initial
   tuning structs.  In particular it must set selected_tune and
   aarch64_isa_flags that define the available ISA features and tuning
   decisions.  It must also set selected_arch as this will be used to
   output the .arch asm tags for each function.  */

aarch64_override_options (void)
  uint64_t cpu_isa = 0;
  uint64_t arch_isa = 0;
  aarch64_isa_flags = 0;

  bool valid_cpu = true;
  bool valid_tune = true;
  bool valid_arch = true;

  selected_cpu = NULL;
  selected_arch = NULL;
  selected_tune = NULL;

  if (aarch64_branch_protection_string)
    aarch64_validate_mbranch_protection (aarch64_branch_protection_string);

  /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
     If either of -march or -mtune is given, they override their
     respective component of -mcpu.  */
  if (aarch64_cpu_string)
    valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,

  if (aarch64_arch_string)
    valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,

  if (aarch64_tune_string)
    valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);

#ifdef SUBTARGET_OVERRIDE_OPTIONS
  SUBTARGET_OVERRIDE_OPTIONS;

  /* If the user did not specify a processor, choose the default
     one for them.  This will be the CPU set during configuration using
     --with-cpu, otherwise it is "generic".  */

	  selected_cpu = &all_cores[selected_arch->ident];
	  aarch64_isa_flags = arch_isa;
	  explicit_arch = selected_arch->arch;

	  /* Get default configure-time CPU.  */
	  selected_cpu = aarch64_get_tune_cpu (aarch64_none);
	  aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;

	explicit_tune_core = selected_tune->ident;

  /* If both -mcpu and -march are specified check that they are architecturally
     compatible, warn if they're not and prefer the -march ISA flags.  */
  else if (selected_arch)
      if (selected_arch->arch != selected_cpu->arch)
	  warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
		   all_architectures[selected_cpu->arch].name,
		   selected_arch->name);
      aarch64_isa_flags = arch_isa;
      explicit_arch = selected_arch->arch;
      explicit_tune_core = selected_tune ? selected_tune->ident
					 : selected_cpu->ident;

      /* -mcpu but no -march.  */
      aarch64_isa_flags = cpu_isa;
      explicit_tune_core = selected_tune ? selected_tune->ident
					 : selected_cpu->ident;
      gcc_assert (selected_cpu);
      selected_arch = &all_architectures[selected_cpu->arch];
      explicit_arch = selected_arch->arch;

  /* Set the arch as well, as we will need it when outputting
     the .arch directive in assembly.  */
  if (!selected_arch)
      gcc_assert (selected_cpu);
      selected_arch = &all_architectures[selected_cpu->arch];

  if (!selected_tune)
    selected_tune = selected_cpu;

  if (aarch64_enable_bti == 2)
#ifdef TARGET_ENABLE_BTI
      aarch64_enable_bti = 1;
      aarch64_enable_bti = 0;

  /* Return address signing is currently not supported for ILP32 targets.  For
     LP64 targets use the configured option in the absence of a command-line
     option for -mbranch-protection.  */
  if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
#ifdef TARGET_ENABLE_PAC_RET
      aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
      aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;

#ifndef HAVE_AS_MABI_OPTION
  /* The compiler may have been configured with 2.23.* binutils, which does
     not have support for ILP32.  */
    error ("assembler does not support %<-mabi=ilp32%>");

  /* Convert -msve-vector-bits to a VG count.  */
  aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);

  if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
    sorry ("return address signing is only supported for %<-mabi=lp64%>");

  /* Make sure we properly set up the explicit options.  */
  if ((aarch64_cpu_string && valid_cpu)
      || (aarch64_tune_string && valid_tune))
    gcc_assert (explicit_tune_core != aarch64_none);

  if ((aarch64_cpu_string && valid_cpu)
      || (aarch64_arch_string && valid_arch))
    gcc_assert (explicit_arch != aarch64_no_arch);

  /* The pass to insert speculation tracking runs before
     shrink-wrapping and the latter does not know how to update the
     tracking status.  So disable it in this case.  */
  if (aarch64_track_speculation)
    flag_shrink_wrap = 0;

  aarch64_override_options_internal (&global_options);

  /* Save these options as the default ones in case we push and pop them later
     while processing functions with potential target attributes.  */
  target_option_default_node = target_option_current_node
    = build_target_option_node (&global_options);
/* Implement targetm.override_options_after_change.  */

aarch64_override_options_after_change (void)
  aarch64_override_options_after_change_1 (&global_options);

static struct machine_function *
aarch64_init_machine_status (void)
  struct machine_function *machine;
  machine = ggc_cleared_alloc<machine_function> ();

aarch64_init_expanders (void)
  init_machine_status = aarch64_init_machine_status;
/* A checking mechanism for the implementation of the various code models.  */

static void
initialize_aarch64_code_model (struct gcc_options *opts)
{
  if (opts->x_flag_pic)
    {
      switch (opts->x_aarch64_cmodel_var)
	{
	case AARCH64_CMODEL_TINY:
	  aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
	  break;
	case AARCH64_CMODEL_SMALL:
#ifdef HAVE_AS_SMALL_PIC_RELOCS
	  aarch64_cmodel = (flag_pic == 2
			    ? AARCH64_CMODEL_SMALL_PIC
			    : AARCH64_CMODEL_SMALL_SPIC);
#else
	  aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
#endif
	  break;
	case AARCH64_CMODEL_LARGE:
	  sorry ("code model %qs with %<-f%s%>", "large",
		 opts->x_flag_pic > 1 ? "PIC" : "pic");
	  break;
	default:
	  gcc_unreachable ();
	}
    }
  else
    aarch64_cmodel = opts->x_aarch64_cmodel_var;
}
/* Implement TARGET_OPTION_SAVE.  */

static void
aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
{
  ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
  ptr->x_aarch64_branch_protection_string
    = opts->x_aarch64_branch_protection_string;
}

/* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
   using the information saved in PTR.  */

static void
aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
{
  opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
  selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
  opts->x_explicit_arch = ptr->x_explicit_arch;
  selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
  opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
  opts->x_aarch64_branch_protection_string
    = ptr->x_aarch64_branch_protection_string;
  if (opts->x_aarch64_branch_protection_string)
    {
      aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
				       NULL);
    }

  aarch64_override_options_internal (opts);
}

/* Implement TARGET_OPTION_PRINT.  */

static void
aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
{
  const struct processor *cpu
    = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
  uint64_t isa_flags = ptr->x_aarch64_isa_flags;
  const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
  std::string extension
    = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);

  fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
  fprintf (file, "%*sselected arch = %s%s\n", indent, "",
	   arch->name, extension.c_str ());
}
static GTY(()) tree aarch64_previous_fndecl;

void
aarch64_reset_previous_fndecl (void)
{
  aarch64_previous_fndecl = NULL;
}

/* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
   Used by aarch64_set_current_function and aarch64_pragma_target_parse to
   make sure optab availability predicates are recomputed when necessary.  */

void
aarch64_save_restore_target_globals (tree new_tree)
{
  if (TREE_TARGET_GLOBALS (new_tree))
    restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
  else if (new_tree == target_option_default_node)
    restore_target_globals (&default_target_globals);
  else
    TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
}

/* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
   like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
   of the function, if such exists.  This function may be called multiple
   times on a single function so use aarch64_previous_fndecl to avoid
   setting up identical state.  */

static void
aarch64_set_current_function (tree fndecl)
{
  if (!fndecl || fndecl == aarch64_previous_fndecl)
    return;

  tree old_tree = (aarch64_previous_fndecl
		   ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
		   : NULL_TREE);

  tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);

  /* If the current function has no attributes but the previous one did,
     use the default node.  */
  if (!new_tree && old_tree)
    new_tree = target_option_default_node;

  /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
     the default have been handled by aarch64_save_restore_target_globals from
     aarch64_pragma_target_parse.  */
  if (old_tree == new_tree)
    return;

  aarch64_previous_fndecl = fndecl;

  /* First set the target options.  */
  cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));

  aarch64_save_restore_target_globals (new_tree);
}
/* Enum describing the various ways we can handle attributes.
   In many cases we can reuse the generic option handling machinery.  */

enum aarch64_attr_opt_type
{
  aarch64_attr_mask,	/* Attribute should set a bit in target_flags.  */
  aarch64_attr_bool,	/* Attribute sets or unsets a boolean variable.  */
  aarch64_attr_enum,	/* Attribute sets an enum variable.  */
  aarch64_attr_custom	/* Attribute requires a custom handling function.  */
};

/* All the information needed to handle a target attribute.
   NAME is the name of the attribute.
   ATTR_TYPE specifies the type of behavior of the attribute as described
   in the definition of enum aarch64_attr_opt_type.
   ALLOW_NEG is true if the attribute supports a "no-" form.
   HANDLER is the function that takes the attribute string as an argument.
   It is needed only when the ATTR_TYPE is aarch64_attr_custom.
   OPT_NUM is the enum specifying the option that the attribute modifies.
   This is needed for attributes that mirror the behavior of a command-line
   option, that is, ones with ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool
   or aarch64_attr_enum.  */

struct aarch64_attribute_info
{
  const char *name;
  enum aarch64_attr_opt_type attr_type;
  bool allow_neg;
  bool (*handler) (const char *);
  enum opt_code opt_num;
};
/* Handle the ARCH_STR argument to the arch= target attribute.  */

static bool
aarch64_handle_attr_arch (const char *str)
{
  const struct processor *tmp_arch = NULL;
  std::string invalid_extension;
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);

  if (parse_res == AARCH64_PARSE_OK)
    {
      gcc_assert (tmp_arch);
      selected_arch = tmp_arch;
      explicit_arch = selected_arch->arch;
      return true;
    }

  switch (parse_res)
    {
    case AARCH64_PARSE_MISSING_ARG:
      error ("missing name in %<target(\"arch=\")%> pragma or attribute");
      break;
    case AARCH64_PARSE_INVALID_ARG:
      error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
      aarch64_print_hint_for_arch (str);
      break;
    case AARCH64_PARSE_INVALID_FEATURE:
      error ("invalid feature modifier %s of value (\"%s\") in "
	     "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
      aarch64_print_hint_for_extensions (invalid_extension);
      break;
    default:
      gcc_unreachable ();
    }

  return false;
}
/* Handle the argument CPU_STR to the cpu= target attribute.  */

static bool
aarch64_handle_attr_cpu (const char *str)
{
  const struct processor *tmp_cpu = NULL;
  std::string invalid_extension;
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);

  if (parse_res == AARCH64_PARSE_OK)
    {
      gcc_assert (tmp_cpu);
      selected_tune = tmp_cpu;
      explicit_tune_core = selected_tune->ident;

      selected_arch = &all_architectures[tmp_cpu->arch];
      explicit_arch = selected_arch->arch;
      return true;
    }

  switch (parse_res)
    {
    case AARCH64_PARSE_MISSING_ARG:
      error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
      break;
    case AARCH64_PARSE_INVALID_ARG:
      error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
      aarch64_print_hint_for_core (str);
      break;
    case AARCH64_PARSE_INVALID_FEATURE:
      error ("invalid feature modifier %s of value (\"%s\") in "
	     "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
      aarch64_print_hint_for_extensions (invalid_extension);
      break;
    default:
      gcc_unreachable ();
    }

  return false;
}
/* Handle the argument STR to the branch-protection= attribute.  */

static bool
aarch64_handle_attr_branch_protection (const char* str)
{
  char *err_str = (char *) xmalloc (strlen (str) + 1);
  enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
								       &err_str);
  bool success = false;
  switch (res)
    {
    case AARCH64_PARSE_MISSING_ARG:
      error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
	     " attribute");
      break;
    case AARCH64_PARSE_INVALID_ARG:
      error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
	     "=\")%> pragma or attribute", err_str);
      break;
    case AARCH64_PARSE_OK:
      success = true;
      /* Fall through.  */
    case AARCH64_PARSE_INVALID_FEATURE:
      break;
    default:
      gcc_unreachable ();
    }
  free (err_str);
  return success;
}
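/* Illustrative note (editorial addition, not part of GCC): the strings
   accepted here mirror -mbranch-protection=, for example

       __attribute__ ((target ("branch-protection=pac-ret+leaf")))
       void sign_all_frames (void);          // hypothetical

       __attribute__ ((target ("branch-protection=bti")))
       void landing_pads_only (void);        // hypothetical

   "none", "standard", "pac-ret[+leaf]" and "bti" are the documented forms.  */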
/* Handle the argument STR to the tune= target attribute.  */

static bool
aarch64_handle_attr_tune (const char *str)
{
  const struct processor *tmp_tune = NULL;
  enum aarch64_parse_opt_result parse_res
    = aarch64_parse_tune (str, &tmp_tune);

  if (parse_res == AARCH64_PARSE_OK)
    {
      gcc_assert (tmp_tune);
      selected_tune = tmp_tune;
      explicit_tune_core = selected_tune->ident;
      return true;
    }

  switch (parse_res)
    {
    case AARCH64_PARSE_INVALID_ARG:
      error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
      aarch64_print_hint_for_core (str);
      break;
    default:
      gcc_unreachable ();
    }

  return false;
}
/* Parse an architecture extensions target attribute string specified in STR.
   For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
   if successful.  Update aarch64_isa_flags to reflect the new ISA features.  */

static bool
aarch64_handle_attr_isa_flags (char *str)
{
  enum aarch64_parse_opt_result parse_res;
  uint64_t isa_flags = aarch64_isa_flags;

  /* We allow "+nothing" in the beginning to clear out all architectural
     features if the user wants to handpick specific features.  */
  if (strncmp ("+nothing", str, 8) == 0)
    {
      isa_flags = 0;
      str += 8;
    }

  std::string invalid_extension;
  parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);

  if (parse_res == AARCH64_PARSE_OK)
    {
      aarch64_isa_flags = isa_flags;
      return true;
    }

  switch (parse_res)
    {
    case AARCH64_PARSE_MISSING_ARG:
      error ("missing value in %<target()%> pragma or attribute");
      break;

    case AARCH64_PARSE_INVALID_FEATURE:
      error ("invalid feature modifier %s of value (\"%s\") in "
	     "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
      break;

    default:
      gcc_unreachable ();
    }

  return false;
}
/* The target attributes that we support.  On top of these we also support just
   ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
   handled explicitly in aarch64_process_one_target_attr.  */

static const struct aarch64_attribute_info aarch64_attributes[] =
{
  { "general-regs-only", aarch64_attr_mask, false, NULL,
     OPT_mgeneral_regs_only },
  { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
     OPT_mfix_cortex_a53_835769 },
  { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
     OPT_mfix_cortex_a53_843419 },
  { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
  { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
  { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
     OPT_momit_leaf_frame_pointer },
  { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
  { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
     OPT_march_ },
  { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
  { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
     OPT_mtune_ },
  { "branch-protection", aarch64_attr_custom, false,
     aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
  { "sign-return-address", aarch64_attr_enum, false, NULL,
     OPT_msign_return_address_ },
  { NULL, aarch64_attr_custom, false, NULL, OPT____ }
};
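/* Illustrative sketch (editorial addition, not part of GCC): the table above
   corresponds to user-level attribute strings such as

       __attribute__ ((target ("arch=armv8.2-a+crc")))
       unsigned int checksum (const unsigned char *buf, unsigned int len);

       __attribute__ ((target ("strict-align,no-omit-leaf-frame-pointer")))
       void copy_words (void *dst, const void *src, unsigned long n);

   The function names are hypothetical; each comma-separated token is handled
   by aarch64_process_one_target_attr below.  */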
14528 /* Parse ARG_STR which contains the definition of one target attribute.
14529 Show appropriate errors if any or return true if the attribute is valid. */
14532 aarch64_process_one_target_attr (char *arg_str
)
14534 bool invert
= false;
14536 size_t len
= strlen (arg_str
);
14540 error ("malformed %<target()%> pragma or attribute");
14544 char *str_to_check
= (char *) alloca (len
+ 1);
14545 strcpy (str_to_check
, arg_str
);
14547 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
14548 It is easier to detect and handle it explicitly here rather than going
14549 through the machinery for the rest of the target attributes in this
14551 if (*str_to_check
== '+')
14552 return aarch64_handle_attr_isa_flags (str_to_check
);
14554 if (len
> 3 && strncmp (str_to_check
, "no-", 3) == 0)
14559 char *arg
= strchr (str_to_check
, '=');
14561 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
14562 and point ARG to "foo". */
14568 const struct aarch64_attribute_info
*p_attr
;
14569 bool found
= false;
14570 for (p_attr
= aarch64_attributes
; p_attr
->name
; p_attr
++)
14572 /* If the names don't match up, or the user has given an argument
14573 to an attribute that doesn't accept one, or didn't give an argument
14574 to an attribute that expects one, fail to match. */
14575 if (strcmp (str_to_check
, p_attr
->name
) != 0)
14579 bool attr_need_arg_p
= p_attr
->attr_type
== aarch64_attr_custom
14580 || p_attr
->attr_type
== aarch64_attr_enum
;
14582 if (attr_need_arg_p
^ (arg
!= NULL
))
14584 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check
);
14588 /* If the name matches but the attribute does not allow "no-" versions
14589 then we can't match. */
14590 if (invert
&& !p_attr
->allow_neg
)
14592 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check
);
14596 switch (p_attr
->attr_type
)
14598 /* Has a custom handler registered.
14599 For example, cpu=, arch=, tune=. */
14600 case aarch64_attr_custom
:
14601 gcc_assert (p_attr
->handler
);
14602 if (!p_attr
->handler (arg
))
14606 /* Either set or unset a boolean option. */
14607 case aarch64_attr_bool
:
14609 struct cl_decoded_option decoded
;
14611 generate_option (p_attr
->opt_num
, NULL
, !invert
,
14612 CL_TARGET
, &decoded
);
14613 aarch64_handle_option (&global_options
, &global_options_set
,
14614 &decoded
, input_location
);
14617 /* Set or unset a bit in the target_flags. aarch64_handle_option
14618 should know what mask to apply given the option number. */
14619 case aarch64_attr_mask
:
14621 struct cl_decoded_option decoded
;
14622 /* We only need to specify the option number.
14623 aarch64_handle_option will know which mask to apply. */
14624 decoded
.opt_index
= p_attr
->opt_num
;
14625 decoded
.value
= !invert
;
14626 aarch64_handle_option (&global_options
, &global_options_set
,
14627 &decoded
, input_location
);
14630 /* Use the option setting machinery to set an option to an enum. */
14631 case aarch64_attr_enum
:
14636 valid
= opt_enum_arg_to_value (p_attr
->opt_num
, arg
,
14637 &value
, CL_TARGET
);
14640 set_option (&global_options
, NULL
, p_attr
->opt_num
, value
,
14641 NULL
, DK_UNSPECIFIED
, input_location
,
14646 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check
, arg
);
14651 gcc_unreachable ();
14655 /* If we reached here we either have found an attribute and validated
14656 it or didn't match any. If we matched an attribute but its arguments
14657 were malformed we will have returned false already. */
/* Count how many times the character C appears in
   NULL-terminated string STR.  */

static unsigned int
num_occurences_in_str (char c, char *str)
{
  unsigned int res = 0;
  while (*str != '\0')
    {
      if (*str == c)
	res++;

      str++;
    }

  return res;
}
14679 /* Parse the tree in ARGS that contains the target attribute information
14680 and update the global target options space. */
14683 aarch64_process_target_attr (tree args
)
14685 if (TREE_CODE (args
) == TREE_LIST
)
14689 tree head
= TREE_VALUE (args
);
14692 if (!aarch64_process_target_attr (head
))
14695 args
= TREE_CHAIN (args
);
14701 if (TREE_CODE (args
) != STRING_CST
)
14703 error ("attribute %<target%> argument not a string");
14707 size_t len
= strlen (TREE_STRING_POINTER (args
));
14708 char *str_to_check
= (char *) alloca (len
+ 1);
14709 strcpy (str_to_check
, TREE_STRING_POINTER (args
));
14713 error ("malformed %<target()%> pragma or attribute");
14717 /* Used to catch empty spaces between commas i.e.
14718 attribute ((target ("attr1,,attr2"))). */
14719 unsigned int num_commas
= num_occurences_in_str (',', str_to_check
);
14721 /* Handle multiple target attributes separated by ','. */
14722 char *token
= strtok_r (str_to_check
, ",", &str_to_check
);
14724 unsigned int num_attrs
= 0;
14728 if (!aarch64_process_one_target_attr (token
))
14730 error ("pragma or attribute %<target(\"%s\")%> is not valid", token
);
14734 token
= strtok_r (NULL
, ",", &str_to_check
);
14737 if (num_attrs
!= num_commas
+ 1)
14739 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args
));
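/* Illustrative sketch (editorial addition, not part of GCC): the same
   attribute strings can also arrive via a pragma, for example

       #pragma GCC push_options
       #pragma GCC target ("+simd+crc")
       int crc_update (int acc, unsigned char byte);   // hypothetical
       #pragma GCC pop_options

   and are routed through aarch64_process_target_attr just like the
   function-level attribute form.  */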
14746 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
14747 process attribute ((target ("..."))). */
14750 aarch64_option_valid_attribute_p (tree fndecl
, tree
, tree args
, int)
14752 struct cl_target_option cur_target
;
14755 tree new_target
, new_optimize
;
14756 tree existing_target
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
14758 /* If what we're processing is the current pragma string then the
14759 target option node is already stored in target_option_current_node
14760 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
14761 having to re-parse the string. This is especially useful to keep
14762 arm_neon.h compile times down since that header contains a lot
14763 of intrinsics enclosed in pragmas. */
14764 if (!existing_target
&& args
== current_target_pragma
)
14766 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = target_option_current_node
;
14769 tree func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
14771 old_optimize
= build_optimization_node (&global_options
);
14772 func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
14774 /* If the function changed the optimization levels as well as setting
14775 target options, start with the optimizations specified. */
14776 if (func_optimize
&& func_optimize
!= old_optimize
)
14777 cl_optimization_restore (&global_options
,
14778 TREE_OPTIMIZATION (func_optimize
));
14780 /* Save the current target options to restore at the end. */
14781 cl_target_option_save (&cur_target
, &global_options
);
14783 /* If fndecl already has some target attributes applied to it, unpack
14784 them so that we add this attribute on top of them, rather than
14785 overwriting them. */
14786 if (existing_target
)
14788 struct cl_target_option
*existing_options
14789 = TREE_TARGET_OPTION (existing_target
);
14791 if (existing_options
)
14792 cl_target_option_restore (&global_options
, existing_options
);
14795 cl_target_option_restore (&global_options
,
14796 TREE_TARGET_OPTION (target_option_current_node
));
14798 ret
= aarch64_process_target_attr (args
);
14800 /* Set up any additional state. */
14803 aarch64_override_options_internal (&global_options
);
14804 /* Initialize SIMD builtins if we haven't already.
14805 Set current_target_pragma to NULL for the duration so that
14806 the builtin initialization code doesn't try to tag the functions
14807 being built with the attributes specified by any current pragma, thus
14808 going into an infinite recursion. */
14811 tree saved_current_target_pragma
= current_target_pragma
;
14812 current_target_pragma
= NULL
;
14813 aarch64_init_simd_builtins ();
14814 current_target_pragma
= saved_current_target_pragma
;
14816 new_target
= build_target_option_node (&global_options
);
14821 new_optimize
= build_optimization_node (&global_options
);
14825 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = new_target
;
14827 if (old_optimize
!= new_optimize
)
14828 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
) = new_optimize
;
14831 cl_target_option_restore (&global_options
, &cur_target
);
14833 if (old_optimize
!= new_optimize
)
14834 cl_optimization_restore (&global_options
,
14835 TREE_OPTIMIZATION (old_optimize
));
/* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
   tri-bool options (yes, no, don't care) and the default value is
   DEF, determine whether to reject inlining.  */

static bool
aarch64_tribools_ok_for_inlining_p (int caller, int callee,
				    int dont_care, int def)
{
  /* If the callee doesn't care, always allow inlining.  */
  if (callee == dont_care)
    return true;

  /* If the caller doesn't care, always allow inlining.  */
  if (caller == dont_care)
    return true;

  /* Otherwise, allow inlining if either the callee and caller values
     agree, or if the callee is using the default value.  */
  return (callee == caller || callee == def);
}
14860 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
14861 to inline CALLEE into CALLER based on target-specific info.
14862 Make sure that the caller and callee have compatible architectural
14863 features. Then go through the other possible target attributes
14864 and see if they can block inlining. Try not to reject always_inline
14865 callees unless they are incompatible architecturally. */
14868 aarch64_can_inline_p (tree caller
, tree callee
)
14870 tree caller_tree
= DECL_FUNCTION_SPECIFIC_TARGET (caller
);
14871 tree callee_tree
= DECL_FUNCTION_SPECIFIC_TARGET (callee
);
14873 struct cl_target_option
*caller_opts
14874 = TREE_TARGET_OPTION (caller_tree
? caller_tree
14875 : target_option_default_node
);
14877 struct cl_target_option
*callee_opts
14878 = TREE_TARGET_OPTION (callee_tree
? callee_tree
14879 : target_option_default_node
);
14881 /* Callee's ISA flags should be a subset of the caller's. */
14882 if ((caller_opts
->x_aarch64_isa_flags
& callee_opts
->x_aarch64_isa_flags
)
14883 != callee_opts
->x_aarch64_isa_flags
)
  /* Allow non-strict aligned functions inlining into strict
     aligned ones.  */
14888 if ((TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)
14889 != TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
))
14890 && !(!TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
)
14891 && TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)))
14894 bool always_inline
= lookup_attribute ("always_inline",
14895 DECL_ATTRIBUTES (callee
));
14897 /* If the architectural features match up and the callee is always_inline
14898 then the other attributes don't matter. */
14902 if (caller_opts
->x_aarch64_cmodel_var
14903 != callee_opts
->x_aarch64_cmodel_var
)
14906 if (caller_opts
->x_aarch64_tls_dialect
14907 != callee_opts
->x_aarch64_tls_dialect
)
14910 /* Honour explicit requests to workaround errata. */
14911 if (!aarch64_tribools_ok_for_inlining_p (
14912 caller_opts
->x_aarch64_fix_a53_err835769
,
14913 callee_opts
->x_aarch64_fix_a53_err835769
,
14914 2, TARGET_FIX_ERR_A53_835769_DEFAULT
))
14917 if (!aarch64_tribools_ok_for_inlining_p (
14918 caller_opts
->x_aarch64_fix_a53_err843419
,
14919 callee_opts
->x_aarch64_fix_a53_err843419
,
14920 2, TARGET_FIX_ERR_A53_843419
))
  /* If the user explicitly specified -momit-leaf-frame-pointer for the
     caller and callee and they don't match up, reject inlining.  */
14925 if (!aarch64_tribools_ok_for_inlining_p (
14926 caller_opts
->x_flag_omit_leaf_frame_pointer
,
14927 callee_opts
->x_flag_omit_leaf_frame_pointer
,
14931 /* If the callee has specific tuning overrides, respect them. */
14932 if (callee_opts
->x_aarch64_override_tune_string
!= NULL
14933 && caller_opts
->x_aarch64_override_tune_string
== NULL
)
  /* If the user specified tuning override strings for the
     caller and callee and they don't match up, reject inlining.
     We just do a string compare here; we don't analyze the meaning
     of the string, as it would be too costly for little gain.  */
14940 if (callee_opts
->x_aarch64_override_tune_string
14941 && caller_opts
->x_aarch64_override_tune_string
14942 && (strcmp (callee_opts
->x_aarch64_override_tune_string
,
14943 caller_opts
->x_aarch64_override_tune_string
) != 0))
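/* Illustrative sketch (editorial addition, not part of GCC): a pair of
   functions the ISA-subset check above rejects.  The names are hypothetical.

       __attribute__ ((target ("arch=armv8.2-a+sve")))
       static inline int callee (int x) { return x + 1; }

       int caller (int x) { return callee (x); }   // plain armv8-a caller

   The callee's ISA flags are not a subset of the caller's, so
   aarch64_can_inline_p refuses to inline callee into caller.  */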
/* Return the ID of the TLSDESC ABI, initializing the descriptor if it hasn't
   been used yet.  */

unsigned int
aarch64_tlsdesc_abi_id ()
{
  predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
  if (!tlsdesc_abi.initialized_p ())
    {
      HARD_REG_SET full_reg_clobbers;
      CLEAR_HARD_REG_SET (full_reg_clobbers);
      SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
      SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
      for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
	SET_HARD_REG_BIT (full_reg_clobbers, regno);
      tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
    }
  return tlsdesc_abi.id ();
}
/* Return true if SYMBOL_REF X binds locally.  */

static bool
aarch64_symbol_binds_local_p (const_rtx x)
{
  return (SYMBOL_REF_DECL (x)
	  ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
	  : SYMBOL_REF_LOCAL_P (x));
}
/* Return true if SYMBOL_REF X is thread local.  */
static bool
aarch64_tls_symbol_p (rtx x)
{
  if (!TARGET_HAVE_TLS)
    return false;

  if (GET_CODE (x) != SYMBOL_REF)
    return false;

  return SYMBOL_REF_TLS_MODEL (x) != 0;
}
/* Classify a TLS symbol into one of the TLS kinds.  */
enum aarch64_symbol_type
aarch64_classify_tls_symbol (rtx x)
{
  enum tls_model tls_kind = tls_symbolic_operand_type (x);

  switch (tls_kind)
    {
    case TLS_MODEL_GLOBAL_DYNAMIC:
    case TLS_MODEL_LOCAL_DYNAMIC:
      return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;

    case TLS_MODEL_INITIAL_EXEC:
      switch (aarch64_cmodel)
	{
	case AARCH64_CMODEL_TINY:
	case AARCH64_CMODEL_TINY_PIC:
	  return SYMBOL_TINY_TLSIE;
	default:
	  return SYMBOL_SMALL_TLSIE;
	}

    case TLS_MODEL_LOCAL_EXEC:
      if (aarch64_tls_size == 12)
	return SYMBOL_TLSLE12;
      else if (aarch64_tls_size == 24)
	return SYMBOL_TLSLE24;
      else if (aarch64_tls_size == 32)
	return SYMBOL_TLSLE32;
      else if (aarch64_tls_size == 48)
	return SYMBOL_TLSLE48;
      else
	gcc_unreachable ();

    case TLS_MODEL_EMULATED:
    case TLS_MODEL_NONE:
      return SYMBOL_FORCE_TO_MEM;

    default:
      gcc_unreachable ();
    }
}
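/* Illustrative note (editorial addition, not part of GCC): the classification
   above is driven by the TLS access model chosen for a variable, e.g.

       __thread int counter;   // typically global/local dynamic in PIC code

   possibly combined with -ftls-model and -mtls-size; for instance a
   local-exec access with the default 24-bit size maps to SYMBOL_TLSLE24.  */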
15035 /* Return the correct method for accessing X + OFFSET, where X is either
15036 a SYMBOL_REF or LABEL_REF. */
15038 enum aarch64_symbol_type
15039 aarch64_classify_symbol (rtx x
, HOST_WIDE_INT offset
)
15041 if (GET_CODE (x
) == LABEL_REF
)
15043 switch (aarch64_cmodel
)
15045 case AARCH64_CMODEL_LARGE
:
15046 return SYMBOL_FORCE_TO_MEM
;
15048 case AARCH64_CMODEL_TINY_PIC
:
15049 case AARCH64_CMODEL_TINY
:
15050 return SYMBOL_TINY_ABSOLUTE
;
15052 case AARCH64_CMODEL_SMALL_SPIC
:
15053 case AARCH64_CMODEL_SMALL_PIC
:
15054 case AARCH64_CMODEL_SMALL
:
15055 return SYMBOL_SMALL_ABSOLUTE
;
15058 gcc_unreachable ();
15062 if (GET_CODE (x
) == SYMBOL_REF
)
15064 if (aarch64_tls_symbol_p (x
))
15065 return aarch64_classify_tls_symbol (x
);
15067 switch (aarch64_cmodel
)
15069 case AARCH64_CMODEL_TINY
:
	/* When we retrieve symbol + offset address, we have to make sure
	   the offset does not cause overflow of the final address.  But
	   we have no way of knowing the address of the symbol at compile time
	   so we can't accurately say if the distance between the PC and
	   symbol + offset is outside the addressable range of +/-1MB in the
	   TINY code model.  So we limit the maximum offset to +/-64KB and
	   assume the offset to the symbol is not larger than +/-(1MB - 64KB).
	   If offset_within_block_p is true we allow larger offsets.
	   Furthermore force to memory if the symbol is a weak reference to
	   something that doesn't resolve to a symbol in this module.  */
15081 if (SYMBOL_REF_WEAK (x
) && !aarch64_symbol_binds_local_p (x
))
15082 return SYMBOL_FORCE_TO_MEM
;
15083 if (!(IN_RANGE (offset
, -0x10000, 0x10000)
15084 || offset_within_block_p (x
, offset
)))
15085 return SYMBOL_FORCE_TO_MEM
;
15087 return SYMBOL_TINY_ABSOLUTE
;
15089 case AARCH64_CMODEL_SMALL
:
15090 /* Same reasoning as the tiny code model, but the offset cap here is
15091 1MB, allowing +/-3.9GB for the offset to the symbol. */
15093 if (SYMBOL_REF_WEAK (x
) && !aarch64_symbol_binds_local_p (x
))
15094 return SYMBOL_FORCE_TO_MEM
;
15095 if (!(IN_RANGE (offset
, -0x100000, 0x100000)
15096 || offset_within_block_p (x
, offset
)))
15097 return SYMBOL_FORCE_TO_MEM
;
15099 return SYMBOL_SMALL_ABSOLUTE
;
15101 case AARCH64_CMODEL_TINY_PIC
:
15102 if (!aarch64_symbol_binds_local_p (x
))
15103 return SYMBOL_TINY_GOT
;
15104 return SYMBOL_TINY_ABSOLUTE
;
15106 case AARCH64_CMODEL_SMALL_SPIC
:
15107 case AARCH64_CMODEL_SMALL_PIC
:
15108 if (!aarch64_symbol_binds_local_p (x
))
15109 return (aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
15110 ? SYMBOL_SMALL_GOT_28K
: SYMBOL_SMALL_GOT_4G
);
15111 return SYMBOL_SMALL_ABSOLUTE
;
15113 case AARCH64_CMODEL_LARGE
:
15114 /* This is alright even in PIC code as the constant
15115 pool reference is always PC relative and within
15116 the same translation unit. */
15117 if (!aarch64_pcrelative_literal_loads
&& CONSTANT_POOL_ADDRESS_P (x
))
15118 return SYMBOL_SMALL_ABSOLUTE
;
15120 return SYMBOL_FORCE_TO_MEM
;
15123 gcc_unreachable ();
15127 /* By default push everything into the constant pool. */
15128 return SYMBOL_FORCE_TO_MEM
;
bool
aarch64_constant_address_p (rtx x)
{
  return (CONSTANT_P (x) && memory_address_p (DImode, x));
}
bool
aarch64_legitimate_pic_operand_p (rtx x)
{
  if (GET_CODE (x) == SYMBOL_REF
      || (GET_CODE (x) == CONST
	  && GET_CODE (XEXP (x, 0)) == PLUS
	  && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
    return false;

  return true;
}
15149 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
15150 that should be rematerialized rather than spilled. */
15153 aarch64_legitimate_constant_p (machine_mode mode
, rtx x
)
15155 /* Support CSE and rematerialization of common constants. */
15156 if (CONST_INT_P (x
)
15157 || (CONST_DOUBLE_P (x
) && GET_MODE_CLASS (mode
) == MODE_FLOAT
)
15158 || GET_CODE (x
) == CONST_VECTOR
)
15161 /* Do not allow vector struct mode constants for Advanced SIMD.
15162 We could support 0 and -1 easily, but they need support in
15163 aarch64-simd.md. */
15164 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
15165 if (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
15168 /* Only accept variable-length vector constants if they can be
15171 ??? It would be possible to handle rematerialization of other
15172 constants via secondary reloads. */
15173 if (vec_flags
& VEC_ANY_SVE
)
15174 return aarch64_simd_valid_immediate (x
, NULL
);
15176 if (GET_CODE (x
) == HIGH
)
15179 /* Accept polynomial constants that can be calculated by using the
15180 destination of a move as the sole temporary. Constants that
15181 require a second temporary cannot be rematerialized (they can't be
15182 forced to memory and also aren't legitimate constants). */
15184 if (poly_int_rtx_p (x
, &offset
))
15185 return aarch64_offset_temporaries (false, offset
) <= 1;
15187 /* If an offset is being added to something else, we need to allow the
15188 base to be moved into the destination register, meaning that there
15189 are no free temporaries for the offset. */
15190 x
= strip_offset (x
, &offset
);
15191 if (!offset
.is_constant () && aarch64_offset_temporaries (true, offset
) > 0)
15194 /* Do not allow const (plus (anchor_symbol, const_int)). */
15195 if (maybe_ne (offset
, 0) && SYMBOL_REF_P (x
) && SYMBOL_REF_ANCHOR_P (x
))
15198 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
15199 so spilling them is better than rematerialization. */
15200 if (SYMBOL_REF_P (x
) && !SYMBOL_REF_TLS_MODEL (x
))
15203 /* Label references are always constant. */
15204 if (GET_CODE (x
) == LABEL_REF
)
15211 aarch64_load_tp (rtx target
)
15214 || GET_MODE (target
) != Pmode
15215 || !register_operand (target
, Pmode
))
15216 target
= gen_reg_rtx (Pmode
);
15218 /* Can return in any reg. */
15219 emit_insn (gen_aarch64_load_tp_hard (target
));
/* On AAPCS systems, this is the "struct __va_list".  */
static GTY(()) tree va_list_type;

/* Implement TARGET_BUILD_BUILTIN_VA_LIST.
   Return the type to use as __builtin_va_list.

   AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:

     struct __va_list
     {
       void *__stack;
       void *__gr_top;
       void *__vr_top;
       int   __gr_offs;
       int   __vr_offs;
     };  */

static tree
aarch64_build_builtin_va_list (void)
{
  tree va_list_name;
15244 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
15246 /* Create the type. */
15247 va_list_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
15248 /* Give it the required name. */
15249 va_list_name
= build_decl (BUILTINS_LOCATION
,
15251 get_identifier ("__va_list"),
15253 DECL_ARTIFICIAL (va_list_name
) = 1;
15254 TYPE_NAME (va_list_type
) = va_list_name
;
15255 TYPE_STUB_DECL (va_list_type
) = va_list_name
;
15257 /* Create the fields. */
15258 f_stack
= build_decl (BUILTINS_LOCATION
,
15259 FIELD_DECL
, get_identifier ("__stack"),
15261 f_grtop
= build_decl (BUILTINS_LOCATION
,
15262 FIELD_DECL
, get_identifier ("__gr_top"),
15264 f_vrtop
= build_decl (BUILTINS_LOCATION
,
15265 FIELD_DECL
, get_identifier ("__vr_top"),
15267 f_groff
= build_decl (BUILTINS_LOCATION
,
15268 FIELD_DECL
, get_identifier ("__gr_offs"),
15269 integer_type_node
);
15270 f_vroff
= build_decl (BUILTINS_LOCATION
,
15271 FIELD_DECL
, get_identifier ("__vr_offs"),
15272 integer_type_node
);
  /* Tell the tree-stdarg pass about our internal offset fields.
     NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
     purposes, to identify whether the code is updating va_list internal
     offset fields in an irregular way.  */
15278 va_list_gpr_counter_field
= f_groff
;
15279 va_list_fpr_counter_field
= f_vroff
;
15281 DECL_ARTIFICIAL (f_stack
) = 1;
15282 DECL_ARTIFICIAL (f_grtop
) = 1;
15283 DECL_ARTIFICIAL (f_vrtop
) = 1;
15284 DECL_ARTIFICIAL (f_groff
) = 1;
15285 DECL_ARTIFICIAL (f_vroff
) = 1;
15287 DECL_FIELD_CONTEXT (f_stack
) = va_list_type
;
15288 DECL_FIELD_CONTEXT (f_grtop
) = va_list_type
;
15289 DECL_FIELD_CONTEXT (f_vrtop
) = va_list_type
;
15290 DECL_FIELD_CONTEXT (f_groff
) = va_list_type
;
15291 DECL_FIELD_CONTEXT (f_vroff
) = va_list_type
;
15293 TYPE_FIELDS (va_list_type
) = f_stack
;
15294 DECL_CHAIN (f_stack
) = f_grtop
;
15295 DECL_CHAIN (f_grtop
) = f_vrtop
;
15296 DECL_CHAIN (f_vrtop
) = f_groff
;
15297 DECL_CHAIN (f_groff
) = f_vroff
;
15299 /* Compute its layout. */
15300 layout_type (va_list_type
);
15302 return va_list_type
;
15305 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
15307 aarch64_expand_builtin_va_start (tree valist
, rtx nextarg ATTRIBUTE_UNUSED
)
15309 const CUMULATIVE_ARGS
*cum
;
15310 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
15311 tree stack
, grtop
, vrtop
, groff
, vroff
;
15313 int gr_save_area_size
= cfun
->va_list_gpr_size
;
15314 int vr_save_area_size
= cfun
->va_list_fpr_size
;
15317 cum
= &crtl
->args
.info
;
15318 if (cfun
->va_list_gpr_size
)
15319 gr_save_area_size
= MIN ((NUM_ARG_REGS
- cum
->aapcs_ncrn
) * UNITS_PER_WORD
,
15320 cfun
->va_list_gpr_size
);
15321 if (cfun
->va_list_fpr_size
)
15322 vr_save_area_size
= MIN ((NUM_FP_ARG_REGS
- cum
->aapcs_nvrn
)
15323 * UNITS_PER_VREG
, cfun
->va_list_fpr_size
);
15327 gcc_assert (cum
->aapcs_nvrn
== 0);
15328 vr_save_area_size
= 0;
15331 f_stack
= TYPE_FIELDS (va_list_type_node
);
15332 f_grtop
= DECL_CHAIN (f_stack
);
15333 f_vrtop
= DECL_CHAIN (f_grtop
);
15334 f_groff
= DECL_CHAIN (f_vrtop
);
15335 f_vroff
= DECL_CHAIN (f_groff
);
15337 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), valist
, f_stack
,
15339 grtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
), valist
, f_grtop
,
15341 vrtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
), valist
, f_vrtop
,
15343 groff
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
), valist
, f_groff
,
15345 vroff
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
), valist
, f_vroff
,
15348 /* Emit code to initialize STACK, which points to the next varargs stack
15349 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
15350 by named arguments. STACK is 8-byte aligned. */
15351 t
= make_tree (TREE_TYPE (stack
), virtual_incoming_args_rtx
);
15352 if (cum
->aapcs_stack_size
> 0)
15353 t
= fold_build_pointer_plus_hwi (t
, cum
->aapcs_stack_size
* UNITS_PER_WORD
);
15354 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), stack
, t
);
15355 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
15357 /* Emit code to initialize GRTOP, the top of the GR save area.
15358 virtual_incoming_args_rtx should have been 16 byte aligned. */
15359 t
= make_tree (TREE_TYPE (grtop
), virtual_incoming_args_rtx
);
15360 t
= build2 (MODIFY_EXPR
, TREE_TYPE (grtop
), grtop
, t
);
15361 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
15363 /* Emit code to initialize VRTOP, the top of the VR save area.
15364 This address is gr_save_area_bytes below GRTOP, rounded
15365 down to the next 16-byte boundary. */
15366 t
= make_tree (TREE_TYPE (vrtop
), virtual_incoming_args_rtx
);
15367 vr_offset
= ROUND_UP (gr_save_area_size
,
15368 STACK_BOUNDARY
/ BITS_PER_UNIT
);
15371 t
= fold_build_pointer_plus_hwi (t
, -vr_offset
);
15372 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vrtop
), vrtop
, t
);
15373 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
15375 /* Emit code to initialize GROFF, the offset from GRTOP of the
15376 next GPR argument. */
15377 t
= build2 (MODIFY_EXPR
, TREE_TYPE (groff
), groff
,
15378 build_int_cst (TREE_TYPE (groff
), -gr_save_area_size
));
15379 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
15381 /* Likewise emit code to initialize VROFF, the offset from FTOP
15382 of the next VR argument. */
15383 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vroff
), vroff
,
15384 build_int_cst (TREE_TYPE (vroff
), -vr_save_area_size
));
15385 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
15388 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
15391 aarch64_gimplify_va_arg_expr (tree valist
, tree type
, gimple_seq
*pre_p
,
15392 gimple_seq
*post_p ATTRIBUTE_UNUSED
)
15396 bool is_ha
; /* is HFA or HVA. */
15397 bool dw_align
; /* double-word align. */
15398 machine_mode ag_mode
= VOIDmode
;
15402 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
15403 tree stack
, f_top
, f_off
, off
, arg
, roundup
, on_stack
;
15404 HOST_WIDE_INT size
, rsize
, adjust
, align
;
15405 tree t
, u
, cond1
, cond2
;
15407 indirect_p
= pass_va_arg_by_reference (type
);
15409 type
= build_pointer_type (type
);
15411 mode
= TYPE_MODE (type
);
15413 f_stack
= TYPE_FIELDS (va_list_type_node
);
15414 f_grtop
= DECL_CHAIN (f_stack
);
15415 f_vrtop
= DECL_CHAIN (f_grtop
);
15416 f_groff
= DECL_CHAIN (f_vrtop
);
15417 f_vroff
= DECL_CHAIN (f_groff
);
15419 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), unshare_expr (valist
),
15420 f_stack
, NULL_TREE
);
15421 size
= int_size_in_bytes (type
);
15425 = aarch64_function_arg_alignment (mode
, type
, &abi_break
) / BITS_PER_UNIT
;
15429 if (aarch64_vfp_is_call_or_return_candidate (mode
,
15435 /* No frontends can create types with variable-sized modes, so we
15436 shouldn't be asked to pass or return them. */
15437 unsigned int ag_size
= GET_MODE_SIZE (ag_mode
).to_constant ();
15439 /* TYPE passed in fp/simd registers. */
15441 aarch64_err_no_fpadvsimd (mode
);
15443 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
),
15444 unshare_expr (valist
), f_vrtop
, NULL_TREE
);
15445 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
),
15446 unshare_expr (valist
), f_vroff
, NULL_TREE
);
15448 rsize
= nregs
* UNITS_PER_VREG
;
15452 if (BYTES_BIG_ENDIAN
&& ag_size
< UNITS_PER_VREG
)
15453 adjust
= UNITS_PER_VREG
- ag_size
;
15455 else if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
15456 && size
< UNITS_PER_VREG
)
15458 adjust
= UNITS_PER_VREG
- size
;
15463 /* TYPE passed in general registers. */
15464 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
),
15465 unshare_expr (valist
), f_grtop
, NULL_TREE
);
15466 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
),
15467 unshare_expr (valist
), f_groff
, NULL_TREE
);
15468 rsize
= ROUND_UP (size
, UNITS_PER_WORD
);
15469 nregs
= rsize
/ UNITS_PER_WORD
;
15473 if (abi_break
&& warn_psabi
)
15474 inform (input_location
, "parameter passing for argument of type "
15475 "%qT changed in GCC 9.1", type
);
15479 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
15480 && size
< UNITS_PER_WORD
)
15482 adjust
= UNITS_PER_WORD
- size
;
15486 /* Get a local temporary for the field value. */
15487 off
= get_initialized_tmp_var (f_off
, pre_p
, NULL
);
15489 /* Emit code to branch if off >= 0. */
15490 t
= build2 (GE_EXPR
, boolean_type_node
, off
,
15491 build_int_cst (TREE_TYPE (off
), 0));
15492 cond1
= build3 (COND_EXPR
, ptr_type_node
, t
, NULL_TREE
, NULL_TREE
);
15496 /* Emit: offs = (offs + 15) & -16. */
15497 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
15498 build_int_cst (TREE_TYPE (off
), 15));
15499 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (off
), t
,
15500 build_int_cst (TREE_TYPE (off
), -16));
15501 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (off
), off
, t
);
15506 /* Update ap.__[g|v]r_offs */
15507 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
15508 build_int_cst (TREE_TYPE (off
), rsize
));
15509 t
= build2 (MODIFY_EXPR
, TREE_TYPE (f_off
), unshare_expr (f_off
), t
);
15513 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
15515 /* [cond2] if (ap.__[g|v]r_offs > 0) */
15516 u
= build2 (GT_EXPR
, boolean_type_node
, unshare_expr (f_off
),
15517 build_int_cst (TREE_TYPE (f_off
), 0));
15518 cond2
= build3 (COND_EXPR
, ptr_type_node
, u
, NULL_TREE
, NULL_TREE
);
15520 /* String up: make sure the assignment happens before the use. */
15521 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (cond2
), t
, cond2
);
15522 COND_EXPR_ELSE (cond1
) = t
;
15524 /* Prepare the trees handling the argument that is passed on the stack;
15525 the top level node will store in ON_STACK. */
15526 arg
= get_initialized_tmp_var (stack
, pre_p
, NULL
);
15529 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
15530 t
= fold_build_pointer_plus_hwi (arg
, 15);
15531 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
15532 build_int_cst (TREE_TYPE (t
), -16));
15533 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (arg
), arg
, t
);
15537 /* Advance ap.__stack */
15538 t
= fold_build_pointer_plus_hwi (arg
, size
+ 7);
15539 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
15540 build_int_cst (TREE_TYPE (t
), -8));
15541 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), unshare_expr (stack
), t
);
15542 /* String up roundup and advance. */
15544 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
15545 /* String up with arg */
15546 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), t
, arg
);
15547 /* Big-endianness related address adjustment. */
15548 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
15549 && size
< UNITS_PER_WORD
)
15551 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (arg
), arg
,
15552 size_int (UNITS_PER_WORD
- size
));
15553 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), on_stack
, t
);
15556 COND_EXPR_THEN (cond1
) = unshare_expr (on_stack
);
15557 COND_EXPR_THEN (cond2
) = unshare_expr (on_stack
);
15559 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
15562 t
= build2 (PREINCREMENT_EXPR
, TREE_TYPE (off
), off
,
15563 build_int_cst (TREE_TYPE (off
), adjust
));
15565 t
= fold_convert (sizetype
, t
);
15566 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (f_top
), f_top
, t
);
15570 /* type ha; // treat as "struct {ftype field[n];}"
15571 ... [computing offs]
15572 for (i = 0; i <nregs; ++i, offs += 16)
15573 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
15576 tree tmp_ha
, field_t
, field_ptr_t
;
15578 /* Declare a local variable. */
15579 tmp_ha
= create_tmp_var_raw (type
, "ha");
15580 gimple_add_tmp_var (tmp_ha
);
15582 /* Establish the base type. */
15586 field_t
= float_type_node
;
15587 field_ptr_t
= float_ptr_type_node
;
15590 field_t
= double_type_node
;
15591 field_ptr_t
= double_ptr_type_node
;
15594 field_t
= long_double_type_node
;
15595 field_ptr_t
= long_double_ptr_type_node
;
15598 field_t
= aarch64_fp16_type_node
;
15599 field_ptr_t
= aarch64_fp16_ptr_type_node
;
15602 field_t
= aarch64_bf16_type_node
;
15603 field_ptr_t
= aarch64_bf16_ptr_type_node
;
15608 tree innertype
= make_signed_type (GET_MODE_PRECISION (SImode
));
15609 field_t
= build_vector_type_for_mode (innertype
, ag_mode
);
15610 field_ptr_t
= build_pointer_type (field_t
);
15617 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
15618 tmp_ha
= build1 (ADDR_EXPR
, field_ptr_t
, tmp_ha
);
15620 t
= fold_convert (field_ptr_t
, addr
);
15621 t
= build2 (MODIFY_EXPR
, field_t
,
15622 build1 (INDIRECT_REF
, field_t
, tmp_ha
),
15623 build1 (INDIRECT_REF
, field_t
, t
));
15625 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
15626 for (i
= 1; i
< nregs
; ++i
)
15628 addr
= fold_build_pointer_plus_hwi (addr
, UNITS_PER_VREG
);
15629 u
= fold_convert (field_ptr_t
, addr
);
15630 u
= build2 (MODIFY_EXPR
, field_t
,
15631 build2 (MEM_REF
, field_t
, tmp_ha
,
15632 build_int_cst (field_ptr_t
,
15634 int_size_in_bytes (field_t
)))),
15635 build1 (INDIRECT_REF
, field_t
, u
));
15636 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), t
, u
);
15639 u
= fold_convert (TREE_TYPE (f_top
), tmp_ha
);
15640 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (f_top
), t
, u
);
15643 COND_EXPR_ELSE (cond2
) = t
;
15644 addr
= fold_convert (build_pointer_type (type
), cond1
);
15645 addr
= build_va_arg_indirect_ref (addr
);
15648 addr
= build_va_arg_indirect_ref (addr
);
15653 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
15656 aarch64_setup_incoming_varargs (cumulative_args_t cum_v
,
15657 const function_arg_info
&arg
,
15658 int *pretend_size ATTRIBUTE_UNUSED
, int no_rtl
)
15660 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
15661 CUMULATIVE_ARGS local_cum
;
15662 int gr_saved
= cfun
->va_list_gpr_size
;
15663 int vr_saved
= cfun
->va_list_fpr_size
;
15665 /* The caller has advanced CUM up to, but not beyond, the last named
15666 argument. Advance a local copy of CUM past the last "real" named
15667 argument, to find out how many registers are left over. */
15669 aarch64_function_arg_advance (pack_cumulative_args(&local_cum
), arg
);
15671 /* Found out how many registers we need to save.
15672 Honor tree-stdvar analysis results. */
15673 if (cfun
->va_list_gpr_size
)
15674 gr_saved
= MIN (NUM_ARG_REGS
- local_cum
.aapcs_ncrn
,
15675 cfun
->va_list_gpr_size
/ UNITS_PER_WORD
);
15676 if (cfun
->va_list_fpr_size
)
15677 vr_saved
= MIN (NUM_FP_ARG_REGS
- local_cum
.aapcs_nvrn
,
15678 cfun
->va_list_fpr_size
/ UNITS_PER_VREG
);
15682 gcc_assert (local_cum
.aapcs_nvrn
== 0);
15692 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
15693 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
,
15694 - gr_saved
* UNITS_PER_WORD
);
15695 mem
= gen_frame_mem (BLKmode
, ptr
);
15696 set_mem_alias_set (mem
, get_varargs_alias_set ());
15698 move_block_from_reg (local_cum
.aapcs_ncrn
+ R0_REGNUM
,
15703 /* We can't use move_block_from_reg, because it will use
15704 the wrong mode, storing D regs only. */
15705 machine_mode mode
= TImode
;
15706 int off
, i
, vr_start
;
15708 /* Set OFF to the offset from virtual_incoming_args_rtx of
15709 the first vector register. The VR save area lies below
15710 the GR one, and is aligned to 16 bytes. */
15711 off
= -ROUND_UP (gr_saved
* UNITS_PER_WORD
,
15712 STACK_BOUNDARY
/ BITS_PER_UNIT
);
15713 off
-= vr_saved
* UNITS_PER_VREG
;
15715 vr_start
= V0_REGNUM
+ local_cum
.aapcs_nvrn
;
15716 for (i
= 0; i
< vr_saved
; ++i
)
15720 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
, off
);
15721 mem
= gen_frame_mem (mode
, ptr
);
15722 set_mem_alias_set (mem
, get_varargs_alias_set ());
15723 aarch64_emit_move (mem
, gen_rtx_REG (mode
, vr_start
+ i
));
15724 off
+= UNITS_PER_VREG
;
15729 /* We don't save the size into *PRETEND_SIZE because we want to avoid
15730 any complication of having crtl->args.pretend_args_size changed. */
15731 cfun
->machine
->frame
.saved_varargs_size
15732 = (ROUND_UP (gr_saved
* UNITS_PER_WORD
,
15733 STACK_BOUNDARY
/ BITS_PER_UNIT
)
15734 + vr_saved
* UNITS_PER_VREG
);
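/* Illustrative sketch (editorial addition, not part of GCC): for a variadic
   callee such as

       int sum (int n, ...);                 // hypothetical
       ... sum (3, 1.0, 2.0, 3.0); ...

   only N is named, so the code above spills the remaining argument registers
   (x1-x7 and q0-q7 here) into the GR and VR save areas for later use by
   va_arg; gr_saved and vr_saved are trimmed when the tree-stdarg analysis
   proves fewer registers are needed.  */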
static void
aarch64_conditional_register_usage (void)
{
  int i;
  if (!TARGET_FLOAT)
    {
      for (i = V0_REGNUM; i <= V31_REGNUM; i++)
	{
	  fixed_regs[i] = 1;
	  call_used_regs[i] = 1;
	}
    }
  if (!TARGET_SVE)
    for (i = P0_REGNUM; i <= P15_REGNUM; i++)
      {
	fixed_regs[i] = 1;
	call_used_regs[i] = 1;
      }

  /* Only allow the FFR and FFRT to be accessed via special patterns.  */
  CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
  CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);

  /* When tracking speculation, we need a couple of call-clobbered registers
     to track the speculation state.  It would be nice to just use
     IP0 and IP1, but currently there are numerous places that just
     assume these registers are free for other uses (e.g. pointer
     authentication).  */
  if (aarch64_track_speculation)
    {
      fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
      call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
      fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
      call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
    }
}
15774 /* Walk down the type tree of TYPE counting consecutive base elements.
15775 If *MODEP is VOIDmode, then set it to the first valid floating point
15776 type. If a non-floating point type is found, or if a floating point
15777 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
15778 otherwise return the count in the sub-tree. */
15780 aapcs_vfp_sub_candidate (const_tree type
, machine_mode
*modep
)
15783 HOST_WIDE_INT size
;
15785 /* SVE types (and types containing SVE types) must be handled
15786 before calling this function. */
15787 gcc_assert (!aarch64_sve::builtin_type_p (type
));
15789 switch (TREE_CODE (type
))
15792 mode
= TYPE_MODE (type
);
15793 if (mode
!= DFmode
&& mode
!= SFmode
15794 && mode
!= TFmode
&& mode
!= HFmode
)
15797 if (*modep
== VOIDmode
)
15800 if (*modep
== mode
)
15806 mode
= TYPE_MODE (TREE_TYPE (type
));
15807 if (mode
!= DFmode
&& mode
!= SFmode
15808 && mode
!= TFmode
&& mode
!= HFmode
)
15811 if (*modep
== VOIDmode
)
15814 if (*modep
== mode
)
15820 /* Use V2SImode and V4SImode as representatives of all 64-bit
15821 and 128-bit vector types. */
15822 size
= int_size_in_bytes (type
);
15835 if (*modep
== VOIDmode
)
15838 /* Vector modes are considered to be opaque: two vectors are
15839 equivalent for the purposes of being homogeneous aggregates
15840 if they are the same size. */
15841 if (*modep
== mode
)
15849 tree index
= TYPE_DOMAIN (type
);
	/* Can't handle incomplete types nor sizes that are not
	   fixed.  */
15853 if (!COMPLETE_TYPE_P (type
)
15854 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
15857 count
= aapcs_vfp_sub_candidate (TREE_TYPE (type
), modep
);
15860 || !TYPE_MAX_VALUE (index
)
15861 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index
))
15862 || !TYPE_MIN_VALUE (index
)
15863 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index
))
15867 count
*= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index
))
15868 - tree_to_uhwi (TYPE_MIN_VALUE (index
)));
15870 /* There must be no padding. */
15871 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
15872 count
* GET_MODE_BITSIZE (*modep
)))
	/* Can't handle incomplete types nor sizes that are not
	   fixed.  */
15886 if (!COMPLETE_TYPE_P (type
)
15887 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
15890 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
15892 if (TREE_CODE (field
) != FIELD_DECL
)
15895 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
15898 count
+= sub_count
;
15901 /* There must be no padding. */
15902 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
15903 count
* GET_MODE_BITSIZE (*modep
)))
15910 case QUAL_UNION_TYPE
:
15912 /* These aren't very interesting except in a degenerate case. */
	/* Can't handle incomplete types nor sizes that are not
	   fixed.  */
15919 if (!COMPLETE_TYPE_P (type
)
15920 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
15923 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
15925 if (TREE_CODE (field
) != FIELD_DECL
)
15928 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
15931 count
= count
> sub_count
? count
: sub_count
;
15934 /* There must be no padding. */
15935 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
15936 count
* GET_MODE_BITSIZE (*modep
)))
/* Return TRUE if the type, as described by TYPE and MODE, is a short vector
   type as described in AAPCS64 \S 4.1.2.

   See the comment above aarch64_composite_type_p for the notes on MODE.  */

static bool
aarch64_short_vector_p (const_tree type,
			machine_mode mode)
{
  poly_int64 size = -1;

  if (type && aarch64_sve::builtin_type_p (type))
    return false;

  if (type && TREE_CODE (type) == VECTOR_TYPE)
    size = int_size_in_bytes (type);
  else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
	   || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
    size = GET_MODE_SIZE (mode);

  return known_eq (size, 8) || known_eq (size, 16);
}
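/* Illustrative note (editorial addition, not part of GCC): with <arm_neon.h>,
   int32x2_t is 8 bytes and int32x4_t is 16 bytes, so both count as short
   vectors here, whereas an SVE type such as svint32_t is sizeless and is
   rejected by the builtin_type_p check above.  */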
/* Return TRUE if the type, as described by TYPE and MODE, is a composite
   type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
   array types.  The C99 floating-point complex types are also considered
   as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
   types, which are GCC extensions and out of the scope of AAPCS64, are
   treated as composite types here as well.

   Note that MODE itself is not sufficient in determining whether a type
   is such a composite type or not.  This is because
   stor-layout.c:compute_record_mode may have already changed the MODE
   (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
   structure with only one field may have its MODE set to the mode of the
   field.  Also an integer mode whose size matches the size of the
   RECORD_TYPE type may be used to substitute the original mode
   (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
   solely relied on.  */

static bool
aarch64_composite_type_p (const_tree type,
			  machine_mode mode)
{
  if (aarch64_short_vector_p (type, mode))
    return false;

  if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
    return true;

  if (mode == BLKmode
      || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
      || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
    return true;

  return false;
}
/* Return TRUE if an argument, whose type is described by TYPE and MODE,
   shall be passed or returned in simd/fp register(s) (providing these
   parameter passing registers are available).

   Upon successful return, *COUNT returns the number of needed registers,
   *BASE_MODE returns the mode of the individual register and, when IS_HA
   is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
   floating-point aggregate or a homogeneous short-vector aggregate.  */
16017 aarch64_vfp_is_call_or_return_candidate (machine_mode mode
,
16019 machine_mode
*base_mode
,
16023 if (is_ha
!= NULL
) *is_ha
= false;
16025 if (type
&& aarch64_sve::builtin_type_p (type
))
16028 machine_mode new_mode
= VOIDmode
;
16029 bool composite_p
= aarch64_composite_type_p (type
, mode
);
16031 if ((!composite_p
&& GET_MODE_CLASS (mode
) == MODE_FLOAT
)
16032 || aarch64_short_vector_p (type
, mode
))
16037 else if (GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
)
16039 if (is_ha
!= NULL
) *is_ha
= true;
16041 new_mode
= GET_MODE_INNER (mode
);
16043 else if (type
&& composite_p
)
16045 int ag_count
= aapcs_vfp_sub_candidate (type
, &new_mode
);
16047 if (ag_count
> 0 && ag_count
<= HA_MAX_NUM_FLDS
)
16049 if (is_ha
!= NULL
) *is_ha
= true;
16058 *base_mode
= new_mode
;
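/* Illustrative sketch (editorial addition, not part of GCC): a homogeneous
   floating-point aggregate accepted by the hook above, assuming the FP/SIMD
   registers are available:

       struct rgba { float r, g, b, a; };   // four SFmode fields

   aapcs_vfp_sub_candidate counts four SFmode elements, so *COUNT becomes 4,
   *BASE_MODE becomes SFmode and *IS_HA is set to true; the value is then
   passed or returned in s0-s3.  */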
/* Implement TARGET_STRUCT_VALUE_RTX.  */

static rtx
aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
			  int incoming ATTRIBUTE_UNUSED)
{
  return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
}
/* Implements target hook vector_mode_supported_p.  */
static bool
aarch64_vector_mode_supported_p (machine_mode mode)
{
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
}
16079 /* Return the full-width SVE vector mode for element mode MODE, if one
16082 aarch64_full_sve_mode (scalar_mode mode
)
16099 return VNx16QImode
;
16101 return opt_machine_mode ();
16105 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
16108 aarch64_vq_mode (scalar_mode mode
)
16129 return opt_machine_mode ();
16133 /* Return appropriate SIMD container
16134 for MODE within a vector of WIDTH bits. */
16135 static machine_mode
16136 aarch64_simd_container_mode (scalar_mode mode
, poly_int64 width
)
16139 && maybe_ne (width
, 128)
16140 && known_eq (width
, BITS_PER_SVE_VECTOR
))
16141 return aarch64_full_sve_mode (mode
).else_mode (word_mode
);
16143 gcc_assert (known_eq (width
, 64) || known_eq (width
, 128));
16146 if (known_eq (width
, 128))
16147 return aarch64_vq_mode (mode
).else_mode (word_mode
);
16170 /* Return 128-bit container as the preferred SIMD mode for MODE. */
16171 static machine_mode
16172 aarch64_preferred_simd_mode (scalar_mode mode
)
16174 poly_int64 bits
= TARGET_SVE
? BITS_PER_SVE_VECTOR
: 128;
16175 return aarch64_simd_container_mode (mode
, bits
);
16178 /* Return a list of possible vector sizes for the vectorizer
16179 to iterate over. */
16180 static unsigned int
16181 aarch64_autovectorize_vector_modes (vector_modes
*modes
, bool)
16183 static const machine_mode sve_modes
[] = {
16184 /* Try using full vectors for all element types. */
16187 /* Try using 16-bit containers for 8-bit elements and full vectors
16188 for wider elements. */
16191 /* Try using 32-bit containers for 8-bit and 16-bit elements and
16192 full vectors for wider elements. */
16195 /* Try using 64-bit containers for all element types. */
16199 static const machine_mode advsimd_modes
[] = {
16200 /* Try using 128-bit vectors for all element types. */
16203 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
16204 for wider elements. */
16207 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
16208 for wider elements.
16210 TODO: We could support a limited form of V4QImode too, so that
16211 we use 32-bit vectors for 8-bit elements. */
16214 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
16215 for 64-bit elements.
16217 TODO: We could similarly support limited forms of V2QImode and V2HImode
16222 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
16225 - If we can't use N-byte Advanced SIMD vectors then the placement
16226 doesn't matter; we'll just continue as though the Advanced SIMD
16227 entry didn't exist.
16229 - If an SVE main loop with N bytes ends up being cheaper than an
16230 Advanced SIMD main loop with N bytes then by default we'll replace
16231 the Advanced SIMD version with the SVE one.
16233 - If an Advanced SIMD main loop with N bytes ends up being cheaper
16234 than an SVE main loop with N bytes then by default we'll try to
16235 use the SVE loop to vectorize the epilogue instead. */
16236 unsigned int sve_i
= TARGET_SVE
? 0 : ARRAY_SIZE (sve_modes
);
16237 unsigned int advsimd_i
= 0;
16238 while (advsimd_i
< ARRAY_SIZE (advsimd_modes
))
16240 if (sve_i
< ARRAY_SIZE (sve_modes
)
16241 && maybe_gt (GET_MODE_NUNITS (sve_modes
[sve_i
]),
16242 GET_MODE_NUNITS (advsimd_modes
[advsimd_i
])))
16243 modes
->safe_push (sve_modes
[sve_i
++]);
16245 modes
->safe_push (advsimd_modes
[advsimd_i
++]);
16247 while (sve_i
< ARRAY_SIZE (sve_modes
))
16248 modes
->safe_push (sve_modes
[sve_i
++]);
16250 unsigned int flags
= 0;
16251 /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
16252 can compare SVE against Advanced SIMD and so that we can compare
16253 multiple SVE vectorization approaches against each other. There's
16254 not really any point doing this for Advanced SIMD only, since the
16255 first mode that works should always be the best. */
16256 if (TARGET_SVE
&& aarch64_sve_compare_costs
)
16257 flags
|= VECT_COMPARE_COSTS
;
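
/* Minimal standalone sketch (not GCC code; names and types are hypothetical)
   of the interleaving loop above: SVE entries are merge-sorted into the
   Advanced SIMD list by lane count, so an N-byte SVE mode is only tried after
   the N-byte Advanced SIMD mode.  */
#if 0
static int
interleave_by_units (const int *sve_units, int n_sve,
		     const int *advsimd_units, int n_advsimd, int *out)
{
  int si = 0, ai = 0, n = 0;

  while (ai < n_advsimd)
    {
      /* Push the next SVE mode first only if it has more lanes than the next
	 Advanced SIMD mode, mirroring the maybe_gt test above.  */
      if (si < n_sve && sve_units[si] > advsimd_units[ai])
	out[n++] = sve_units[si++];
      else
	out[n++] = advsimd_units[ai++];
    }
  while (si < n_sve)
    out[n++] = sve_units[si++];

  return n;
}
#endif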
16261 /* Implement TARGET_MANGLE_TYPE. */
16263 static const char *
16264 aarch64_mangle_type (const_tree type
)
16266 /* The AArch64 ABI documents say that "__va_list" has to be
16267 mangled as if it is in the "std" namespace. */
16268 if (lang_hooks
.types_compatible_p (CONST_CAST_TREE (type
), va_list_type
))
16269 return "St9__va_list";
16271 /* Half-precision floating point types. */
16272 if (TREE_CODE (type
) == REAL_TYPE
&& TYPE_PRECISION (type
) == 16)
16274 if (TYPE_MODE (type
) == BFmode
)
16280 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
16282 if (TYPE_NAME (type
) != NULL
)
16285 if ((res
= aarch64_general_mangle_builtin_type (type
))
16286 || (res
= aarch64_sve::mangle_builtin_type (type
)))
16290 /* Use the default mangling. */
16294 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
16297 aarch64_verify_type_context (location_t loc
, type_context_kind context
,
16298 const_tree type
, bool silent_p
)
16300 return aarch64_sve::verify_type_context (loc
, context
, type
, silent_p
);
16303 /* Find the first rtx_insn before insn that will generate an assembly
16307 aarch64_prev_real_insn (rtx_insn
*insn
)
16314 insn
= prev_real_insn (insn
);
16316 while (insn
&& recog_memoized (insn
) < 0);
static bool
is_madd_op (enum attr_type t1)
{
  unsigned int i;
  /* A number of these may be AArch32 only.  */
  enum attr_type mlatypes[] = {
    TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
    TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
    TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
  };

  for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
    {
      if (t1 == mlatypes[i])
	return true;
    }

  return false;
}
16341 /* Check if there is a register dependency between a load and the insn
16342 for which we hold recog_data. */
16345 dep_between_memop_and_curr (rtx memop
)
16350 gcc_assert (GET_CODE (memop
) == SET
);
16352 if (!REG_P (SET_DEST (memop
)))
16355 load_reg
= SET_DEST (memop
);
16356 for (opno
= 1; opno
< recog_data
.n_operands
; opno
++)
16358 rtx operand
= recog_data
.operand
[opno
];
16359 if (REG_P (operand
)
16360 && reg_overlap_mentioned_p (load_reg
, operand
))
16368 /* When working around the Cortex-A53 erratum 835769,
16369 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
16370 instruction and has a preceding memory instruction such that a NOP
16371 should be inserted between them. */
16374 aarch64_madd_needs_nop (rtx_insn
* insn
)
16376 enum attr_type attr_type
;
16380 if (!TARGET_FIX_ERR_A53_835769
)
16383 if (!INSN_P (insn
) || recog_memoized (insn
) < 0)
16386 attr_type
= get_attr_type (insn
);
16387 if (!is_madd_op (attr_type
))
16390 prev
= aarch64_prev_real_insn (insn
);
16391 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
16392 Restore recog state to INSN to avoid state corruption. */
16393 extract_constrain_insn_cached (insn
);
16395 if (!prev
|| !contains_mem_rtx_p (PATTERN (prev
)))
16398 body
= single_set (prev
);
16400 /* If the previous insn is a memory op and there is no dependency between
16401 it and the DImode madd, emit a NOP between them. If body is NULL then we
16402 have a complex memory operation, probably a load/store pair.
16403 Be conservative for now and emit a NOP. */
16404 if (GET_MODE (recog_data
.operand
[0]) == DImode
16405 && (!body
|| !dep_between_memop_and_curr (body
)))
16413 /* Implement FINAL_PRESCAN_INSN. */
16416 aarch64_final_prescan_insn (rtx_insn
*insn
)
16418 if (aarch64_madd_needs_nop (insn
))
16419 fprintf (asm_out_file
, "\tnop // between mem op and mult-accumulate\n");
16423 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
16427 aarch64_sve_index_immediate_p (rtx base_or_step
)
16429 return (CONST_INT_P (base_or_step
)
16430 && IN_RANGE (INTVAL (base_or_step
), -16, 15));
16433 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
16434 when applied to mode MODE. Negate X first if NEGATE_P is true. */
16437 aarch64_sve_arith_immediate_p (machine_mode mode
, rtx x
, bool negate_p
)
16439 rtx elt
= unwrap_const_vec_duplicate (x
);
16440 if (!CONST_INT_P (elt
))
16443 HOST_WIDE_INT val
= INTVAL (elt
);
16446 val
&= GET_MODE_MASK (GET_MODE_INNER (mode
));
16449 return IN_RANGE (val
, 0, 0xff);
16450 return IN_RANGE (val
, 0, 0xff00);
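
/* Minimal sketch (not GCC code; the helper name is hypothetical) of the range
   test above: after any negation and masking, an SVE ADD/SUB immediate is an
   unsigned 8-bit value, optionally shifted left by 8.  */
#if 0
#include <stdbool.h>
#include <stdint.h>

static bool
sve_arith_imm_ok (uint64_t val)
{
  /* Low byte set: the value must fit in 8 bits.  Low byte clear: the value
     may be an 8-bit quantity shifted left by 8 (ADD ..., #imm, LSL #8).  */
  if (val & 0xff)
    return val <= 0xff;
  return val <= 0xff00;
}
#endif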
16453 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
16454 instructions when applied to mode MODE. Negate X first if NEGATE_P
16458 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode
, rtx x
, bool negate_p
)
16460 if (!aarch64_sve_arith_immediate_p (mode
, x
, negate_p
))
16463 /* After the optional negation, the immediate must be nonnegative.
16464 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
16465 instead of SQADD Zn.B, Zn.B, #129. */
16466 rtx elt
= unwrap_const_vec_duplicate (x
);
16467 return negate_p
== (INTVAL (elt
) < 0);
16470 /* Return true if X is a valid immediate operand for an SVE logical
16471 instruction such as AND. */
16474 aarch64_sve_bitmask_immediate_p (rtx x
)
16478 return (const_vec_duplicate_p (x
, &elt
)
16479 && CONST_INT_P (elt
)
16480 && aarch64_bitmask_imm (INTVAL (elt
),
16481 GET_MODE_INNER (GET_MODE (x
))));
16484 /* Return true if X is a valid immediate for the SVE DUP and CPY
16488 aarch64_sve_dup_immediate_p (rtx x
)
16490 x
= aarch64_bit_representation (unwrap_const_vec_duplicate (x
));
16491 if (!CONST_INT_P (x
))
16494 HOST_WIDE_INT val
= INTVAL (x
);
16496 return IN_RANGE (val
, -0x80, 0x7f);
16497 return IN_RANGE (val
, -0x8000, 0x7f00);
16500 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
16501 SIGNED_P says whether the operand is signed rather than unsigned. */
16504 aarch64_sve_cmp_immediate_p (rtx x
, bool signed_p
)
16506 x
= unwrap_const_vec_duplicate (x
);
16507 return (CONST_INT_P (x
)
16509 ? IN_RANGE (INTVAL (x
), -16, 15)
16510 : IN_RANGE (INTVAL (x
), 0, 127)));
16513 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
16514 instruction. Negate X first if NEGATE_P is true. */
16517 aarch64_sve_float_arith_immediate_p (rtx x
, bool negate_p
)
16522 if (!const_vec_duplicate_p (x
, &elt
)
16523 || GET_CODE (elt
) != CONST_DOUBLE
)
16526 r
= *CONST_DOUBLE_REAL_VALUE (elt
);
16529 r
= real_value_negate (&r
);
16531 if (real_equal (&r
, &dconst1
))
16533 if (real_equal (&r
, &dconsthalf
))
16538 /* Return true if X is a valid immediate operand for an SVE FMUL
16542 aarch64_sve_float_mul_immediate_p (rtx x
)
16546 return (const_vec_duplicate_p (x
, &elt
)
16547 && GET_CODE (elt
) == CONST_DOUBLE
16548 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt
), &dconsthalf
)
16549 || real_equal (CONST_DOUBLE_REAL_VALUE (elt
), &dconst2
)));
16552 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
16553 for the Advanced SIMD operation described by WHICH and INSN. If INFO
16554 is nonnull, use it to describe valid immediates. */
16556 aarch64_advsimd_valid_immediate_hs (unsigned int val32
,
16557 simd_immediate_info
*info
,
16558 enum simd_immediate_check which
,
16559 simd_immediate_info::insn_type insn
)
16561 /* Try a 4-byte immediate with LSL. */
16562 for (unsigned int shift
= 0; shift
< 32; shift
+= 8)
16563 if ((val32
& (0xff << shift
)) == val32
)
16566 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
16567 simd_immediate_info::LSL
, shift
);
16571 /* Try a 2-byte immediate with LSL. */
16572 unsigned int imm16
= val32
& 0xffff;
16573 if (imm16
== (val32
>> 16))
16574 for (unsigned int shift
= 0; shift
< 16; shift
+= 8)
16575 if ((imm16
& (0xff << shift
)) == imm16
)
16578 *info
= simd_immediate_info (HImode
, imm16
>> shift
, insn
,
16579 simd_immediate_info::LSL
, shift
);
16583 /* Try a 4-byte immediate with MSL, except for cases that MVN
16585 if (which
== AARCH64_CHECK_MOV
)
16586 for (unsigned int shift
= 8; shift
< 24; shift
+= 8)
16588 unsigned int low
= (1 << shift
) - 1;
16589 if (((val32
& (0xff << shift
)) | low
) == val32
)
16592 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
16593 simd_immediate_info::MSL
, shift
);
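
/* Minimal standalone sketch (not GCC code; the helper name is hypothetical)
   of the 4-byte LSL test used above: a replicated 32-bit value is encodable
   with a shifted MOVI if all of its set bits fit in one byte at a
   byte-aligned position.  */
#if 0
#include <stdbool.h>
#include <stdint.h>

static bool
movi_lsl_encodable (uint32_t val32)
{
  for (unsigned int shift = 0; shift < 32; shift += 8)
    if ((val32 & (0xffu << shift)) == val32)
      return true;	/* e.g. 0x00ab0000 -> MOVI #0xab, LSL #16.  */
  return false;
}
#endif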
16601 /* Return true if replicating VAL64 is a valid immediate for the
16602 Advanced SIMD operation described by WHICH. If INFO is nonnull,
16603 use it to describe valid immediates. */
16605 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64
,
16606 simd_immediate_info
*info
,
16607 enum simd_immediate_check which
)
16609 unsigned int val32
= val64
& 0xffffffff;
16610 unsigned int val16
= val64
& 0xffff;
16611 unsigned int val8
= val64
& 0xff;
16613 if (val32
== (val64
>> 32))
16615 if ((which
& AARCH64_CHECK_ORR
) != 0
16616 && aarch64_advsimd_valid_immediate_hs (val32
, info
, which
,
16617 simd_immediate_info::MOV
))
16620 if ((which
& AARCH64_CHECK_BIC
) != 0
16621 && aarch64_advsimd_valid_immediate_hs (~val32
, info
, which
,
16622 simd_immediate_info::MVN
))
16625 /* Try using a replicated byte. */
16626 if (which
== AARCH64_CHECK_MOV
16627 && val16
== (val32
>> 16)
16628 && val8
== (val16
>> 8))
16631 *info
= simd_immediate_info (QImode
, val8
);
16636 /* Try using a bit-to-bytemask. */
16637 if (which
== AARCH64_CHECK_MOV
)
16640 for (i
= 0; i
< 64; i
+= 8)
16642 unsigned char byte
= (val64
>> i
) & 0xff;
16643 if (byte
!= 0 && byte
!= 0xff)
16649 *info
= simd_immediate_info (DImode
, val64
);
16656 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
16657 instruction. If INFO is nonnull, use it to describe valid immediates. */
16660 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64
,
16661 simd_immediate_info
*info
)
16663 scalar_int_mode mode
= DImode
;
16664 unsigned int val32
= val64
& 0xffffffff;
16665 if (val32
== (val64
>> 32))
16668 unsigned int val16
= val32
& 0xffff;
16669 if (val16
== (val32
>> 16))
16672 unsigned int val8
= val16
& 0xff;
16673 if (val8
== (val16
>> 8))
16677 HOST_WIDE_INT val
= trunc_int_for_mode (val64
, mode
);
16678 if (IN_RANGE (val
, -0x80, 0x7f))
16680 /* DUP with no shift. */
16682 *info
= simd_immediate_info (mode
, val
);
16685 if ((val
& 0xff) == 0 && IN_RANGE (val
, -0x8000, 0x7f00))
16687 /* DUP with LSL #8. */
16689 *info
= simd_immediate_info (mode
, val
);
16692 if (aarch64_bitmask_imm (val64
, mode
))
16696 *info
= simd_immediate_info (mode
, val
);
16702 /* Return true if X is an UNSPEC_PTRUE constant of the form:
16704 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
16706 where PATTERN is the svpattern as a CONST_INT and where ZERO
16707 is a zero constant of the required PTRUE mode (which can have
16708 fewer elements than X's mode, if zero bits are significant).
16710 If so, and if INFO is nonnull, describe the immediate in INFO. */
16712 aarch64_sve_ptrue_svpattern_p (rtx x
, struct simd_immediate_info
*info
)
16714 if (GET_CODE (x
) != CONST
)
16718 if (GET_CODE (x
) != UNSPEC
|| XINT (x
, 1) != UNSPEC_PTRUE
)
16723 aarch64_svpattern pattern
16724 = (aarch64_svpattern
) INTVAL (XVECEXP (x
, 0, 0));
16725 machine_mode pred_mode
= GET_MODE (XVECEXP (x
, 0, 1));
16726 scalar_int_mode int_mode
= aarch64_sve_element_int_mode (pred_mode
);
16727 *info
= simd_immediate_info (int_mode
, pattern
);
16732 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
16733 it to describe valid immediates. */
16736 aarch64_sve_pred_valid_immediate (rtx x
, simd_immediate_info
*info
)
16738 if (aarch64_sve_ptrue_svpattern_p (x
, info
))
16741 if (x
== CONST0_RTX (GET_MODE (x
)))
16744 *info
= simd_immediate_info (DImode
, 0);
16748 /* Analyze the value as a VNx16BImode. This should be relatively
16749 efficient, since rtx_vector_builder has enough built-in capacity
16750 to store all VLA predicate constants without needing the heap. */
16751 rtx_vector_builder builder
;
16752 if (!aarch64_get_sve_pred_bits (builder
, x
))
16755 unsigned int elt_size
= aarch64_widest_sve_pred_elt_size (builder
);
16756 if (int vl
= aarch64_partial_ptrue_length (builder
, elt_size
))
16758 machine_mode mode
= aarch64_sve_pred_mode (elt_size
).require ();
16759 aarch64_svpattern pattern
= aarch64_svpattern_for_vl (mode
, vl
);
16760 if (pattern
!= AARCH64_NUM_SVPATTERNS
)
16764 scalar_int_mode int_mode
= aarch64_sve_element_int_mode (mode
);
16765 *info
= simd_immediate_info (int_mode
, pattern
);
16773 /* Return true if OP is a valid SIMD immediate for the operation
16774 described by WHICH. If INFO is nonnull, use it to describe valid
16777 aarch64_simd_valid_immediate (rtx op
, simd_immediate_info
*info
,
16778 enum simd_immediate_check which
)
16780 machine_mode mode
= GET_MODE (op
);
16781 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
16782 if (vec_flags
== 0 || vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
16785 if (vec_flags
& VEC_SVE_PRED
)
16786 return aarch64_sve_pred_valid_immediate (op
, info
);
16788 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
16790 unsigned int n_elts
;
16791 if (GET_CODE (op
) == CONST_VECTOR
16792 && CONST_VECTOR_DUPLICATE_P (op
))
16793 n_elts
= CONST_VECTOR_NPATTERNS (op
);
16794 else if ((vec_flags
& VEC_SVE_DATA
)
16795 && const_vec_series_p (op
, &base
, &step
))
16797 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
);
16798 if (!aarch64_sve_index_immediate_p (base
)
16799 || !aarch64_sve_index_immediate_p (step
))
16804 /* Get the corresponding container mode. E.g. an INDEX on V2SI
16805 should yield two integer values per 128-bit block, meaning
16806 that we need to treat it in the same way as V2DI and then
16807 ignore the upper 32 bits of each element. */
16808 elt_mode
= aarch64_sve_container_int_mode (mode
);
16809 *info
= simd_immediate_info (elt_mode
, base
, step
);
16813 else if (GET_CODE (op
) == CONST_VECTOR
16814 && CONST_VECTOR_NUNITS (op
).is_constant (&n_elts
))
16815 /* N_ELTS set above. */;
16819 scalar_float_mode elt_float_mode
;
16821 && is_a
<scalar_float_mode
> (elt_mode
, &elt_float_mode
))
16823 rtx elt
= CONST_VECTOR_ENCODED_ELT (op
, 0);
16824 if (aarch64_float_const_zero_rtx_p (elt
)
16825 || aarch64_float_const_representable_p (elt
))
16828 *info
= simd_immediate_info (elt_float_mode
, elt
);
16833 /* If all elements in an SVE vector have the same value, we have a free
16834 choice between using the element mode and using the container mode.
16835 Using the element mode means that unused parts of the vector are
16836 duplicates of the used elements, while using the container mode means
16837 that the unused parts are an extension of the used elements. Using the
16838 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
16839 for its container mode VNx4SI while 0x00000101 isn't.
16841 If not all elements in an SVE vector have the same value, we need the
16842 transition from one element to the next to occur at container boundaries.
16843 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
16844 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
16845 scalar_int_mode elt_int_mode
;
16846 if ((vec_flags
& VEC_SVE_DATA
) && n_elts
> 1)
16847 elt_int_mode
= aarch64_sve_container_int_mode (mode
);
16849 elt_int_mode
= int_mode_for_mode (elt_mode
).require ();
16851 unsigned int elt_size
= GET_MODE_SIZE (elt_int_mode
);
16855 /* Expand the vector constant out into a byte vector, with the least
16856 significant byte of the register first. */
16857 auto_vec
<unsigned char, 16> bytes
;
16858 bytes
.reserve (n_elts
* elt_size
);
16859 for (unsigned int i
= 0; i
< n_elts
; i
++)
16861 /* The vector is provided in gcc endian-neutral fashion.
16862 For aarch64_be Advanced SIMD, it must be laid out in the vector
16863 register in reverse order. */
16864 bool swap_p
= ((vec_flags
& VEC_ADVSIMD
) != 0 && BYTES_BIG_ENDIAN
);
16865 rtx elt
= CONST_VECTOR_ELT (op
, swap_p
? (n_elts
- 1 - i
) : i
);
16867 if (elt_mode
!= elt_int_mode
)
16868 elt
= gen_lowpart (elt_int_mode
, elt
);
16870 if (!CONST_INT_P (elt
))
16873 unsigned HOST_WIDE_INT elt_val
= INTVAL (elt
);
16874 for (unsigned int byte
= 0; byte
< elt_size
; byte
++)
16876 bytes
.quick_push (elt_val
& 0xff);
16877 elt_val
>>= BITS_PER_UNIT
;
16881 /* The immediate must repeat every eight bytes. */
16882 unsigned int nbytes
= bytes
.length ();
16883 for (unsigned i
= 8; i
< nbytes
; ++i
)
16884 if (bytes
[i
] != bytes
[i
- 8])
16887 /* Get the repeating 8-byte value as an integer. No endian correction
16888 is needed here because bytes is already in lsb-first order. */
16889 unsigned HOST_WIDE_INT val64
= 0;
16890 for (unsigned int i
= 0; i
< 8; i
++)
16891 val64
|= ((unsigned HOST_WIDE_INT
) bytes
[i
% nbytes
]
16892 << (i
* BITS_PER_UNIT
));
16894 if (vec_flags
& VEC_SVE_DATA
)
16895 return aarch64_sve_valid_immediate (val64
, info
);
16897 return aarch64_advsimd_valid_immediate (val64
, info
, which
);
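
/* Minimal sketch (not GCC code; the helper name is hypothetical) of the byte
   replication check performed above: the LSB-first byte image of the constant
   must repeat with period 8, and is then collapsed into a single 64-bit value
   for the mode-specific validity tests.  */
#if 0
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static bool
repeating_val64 (const unsigned char *bytes, size_t nbytes, uint64_t *val64)
{
  /* Reject constants whose byte image does not repeat every eight bytes.  */
  for (size_t i = 8; i < nbytes; ++i)
    if (bytes[i] != bytes[i - 8])
      return false;

  /* Collapse the (possibly shorter-period) image into one 64-bit value.  */
  *val64 = 0;
  for (unsigned int i = 0; i < 8; i++)
    *val64 |= (uint64_t) bytes[i % nbytes] << (i * 8);
  return true;
}
#endif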
16900 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
16901 has a step in the range of INDEX. Return the index expression if so,
16902 otherwise return null. */
16904 aarch64_check_zero_based_sve_index_immediate (rtx x
)
16907 if (const_vec_series_p (x
, &base
, &step
)
16908 && base
== const0_rtx
16909 && aarch64_sve_index_immediate_p (step
))
/* Check if immediate shift constants are within range.  */
bool
aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
{
  x = unwrap_const_vec_duplicate (x);
  if (!CONST_INT_P (x))
    return false;
  int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
  if (left)
    return IN_RANGE (INTVAL (x), 0, bit_width - 1);
  else
    return IN_RANGE (INTVAL (x), 1, bit_width);
}
/* Return the bitmask CONST_INT to select the bits required by a zero extract
   operation of width WIDTH at bit position POS.  */

rtx
aarch64_mask_from_zextract_ops (rtx width, rtx pos)
{
  gcc_assert (CONST_INT_P (width));
  gcc_assert (CONST_INT_P (pos));

  unsigned HOST_WIDE_INT mask
    = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
  return GEN_INT (mask << UINTVAL (pos));
}
16943 aarch64_mov_operand_p (rtx x
, machine_mode mode
)
16945 if (GET_CODE (x
) == HIGH
16946 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
16949 if (CONST_INT_P (x
))
16952 if (VECTOR_MODE_P (GET_MODE (x
)))
16954 /* Require predicate constants to be VNx16BI before RA, so that we
16955 force everything to have a canonical form. */
16956 if (!lra_in_progress
16957 && !reload_completed
16958 && GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_BOOL
16959 && GET_MODE (x
) != VNx16BImode
)
16962 return aarch64_simd_valid_immediate (x
, NULL
);
16965 if (GET_CODE (x
) == SYMBOL_REF
&& mode
== DImode
&& CONSTANT_ADDRESS_P (x
))
16968 if (TARGET_SVE
&& aarch64_sve_cnt_immediate_p (x
))
16971 return aarch64_classify_symbolic_expression (x
)
16972 == SYMBOL_TINY_ABSOLUTE
;
/* Return a const_int vector of VAL.  */
rtx
aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
{
  rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
  return gen_const_vec_duplicate (mode, c);
}
/* Check OP is a legal scalar immediate for the MOVI instruction.  */
bool
aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
{
  machine_mode vmode;

  vmode = aarch64_simd_container_mode (mode, 64);
  rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
  return aarch64_simd_valid_immediate (op_v, NULL);
}
16995 /* Construct and return a PARALLEL RTX vector with elements numbering the
16996 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
16997 the vector - from the perspective of the architecture. This does not
16998 line up with GCC's perspective on lane numbers, so we end up with
16999 different masks depending on our target endian-ness. The diagram
17000 below may help. We must draw the distinction when building masks
17001 which select one half of the vector. An instruction selecting
17002 architectural low-lanes for a big-endian target, must be described using
17003 a mask selecting GCC high-lanes.
17005 Big-Endian Little-Endian
17007 GCC 0 1 2 3 3 2 1 0
17008 | x | x | x | x | | x | x | x | x |
17009 Architecture 3 2 1 0 3 2 1 0
17011 Low Mask: { 2, 3 } { 0, 1 }
17012 High Mask: { 0, 1 } { 2, 3 }
17014 MODE Is the mode of the vector and NUNITS is the number of units in it. */
17017 aarch64_simd_vect_par_cnst_half (machine_mode mode
, int nunits
, bool high
)
17019 rtvec v
= rtvec_alloc (nunits
/ 2);
17020 int high_base
= nunits
/ 2;
17026 if (BYTES_BIG_ENDIAN
)
17027 base
= high
? low_base
: high_base
;
17029 base
= high
? high_base
: low_base
;
17031 for (i
= 0; i
< nunits
/ 2; i
++)
17032 RTVEC_ELT (v
, i
) = GEN_INT (base
+ i
);
17034 t1
= gen_rtx_PARALLEL (mode
, v
);
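
/* Minimal sketch (not GCC code; the helper name is hypothetical) of the mask
   selection described in the diagram above, for a 4-element vector:
   architectural low lanes correspond to GCC lanes { 2, 3 } on big-endian and
   { 0, 1 } on little-endian.  */
#if 0
#include <stdbool.h>

static void
half_lane_mask (int nunits, bool high, bool big_endian, int *mask)
{
  int low_base = 0, high_base = nunits / 2;
  int base = big_endian ? (high ? low_base : high_base)
			: (high ? high_base : low_base);
  for (int i = 0; i < nunits / 2; i++)
    mask[i] = base + i;
}
#endif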
17038 /* Check OP for validity as a PARALLEL RTX vector with elements
17039 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
17040 from the perspective of the architecture. See the diagram above
17041 aarch64_simd_vect_par_cnst_half for more details. */
17044 aarch64_simd_check_vect_par_cnst_half (rtx op
, machine_mode mode
,
17048 if (!VECTOR_MODE_P (mode
) || !GET_MODE_NUNITS (mode
).is_constant (&nelts
))
17051 rtx ideal
= aarch64_simd_vect_par_cnst_half (mode
, nelts
, high
);
17052 HOST_WIDE_INT count_op
= XVECLEN (op
, 0);
17053 HOST_WIDE_INT count_ideal
= XVECLEN (ideal
, 0);
17056 if (count_op
!= count_ideal
)
17059 for (i
= 0; i
< count_ideal
; i
++)
17061 rtx elt_op
= XVECEXP (op
, 0, i
);
17062 rtx elt_ideal
= XVECEXP (ideal
, 0, i
);
17064 if (!CONST_INT_P (elt_op
)
17065 || INTVAL (elt_ideal
) != INTVAL (elt_op
))
/* Return a PARALLEL containing NELTS elements, with element I equal
   to BASE + I * STEP.  */

rtx
aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
{
  rtvec vec = rtvec_alloc (nelts);
  for (unsigned int i = 0; i < nelts; ++i)
    RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
  return gen_rtx_PARALLEL (VOIDmode, vec);
}
17083 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
17084 series with step STEP. */
17087 aarch64_stepped_int_parallel_p (rtx op
, int step
)
17089 if (GET_CODE (op
) != PARALLEL
|| !CONST_INT_P (XVECEXP (op
, 0, 0)))
17092 unsigned HOST_WIDE_INT base
= UINTVAL (XVECEXP (op
, 0, 0));
17093 for (int i
= 1; i
< XVECLEN (op
, 0); ++i
)
17094 if (!CONST_INT_P (XVECEXP (op
, 0, i
))
17095 || UINTVAL (XVECEXP (op
, 0, i
)) != base
+ i
* step
)
17101 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
17102 HIGH (exclusive). */
17104 aarch64_simd_lane_bounds (rtx operand
, HOST_WIDE_INT low
, HOST_WIDE_INT high
,
17107 HOST_WIDE_INT lane
;
17108 gcc_assert (CONST_INT_P (operand
));
17109 lane
= INTVAL (operand
);
17111 if (lane
< low
|| lane
>= high
)
17114 error ("%Klane %wd out of range %wd - %wd", exp
, lane
, low
, high
- 1);
17116 error ("lane %wd out of range %wd - %wd", lane
, low
, high
- 1);
/* Perform endian correction on lane number N, which indexes a vector
   of mode MODE, and return the result as an SImode rtx.  */

rtx
aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
{
  return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
}
/* Return TRUE if OP is a valid vector addressing mode.  */
bool
aarch64_simd_mem_operand_p (rtx op)
{
  return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
			|| REG_P (XEXP (op, 0)));
}
17138 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
17141 aarch64_sve_ld1r_operand_p (rtx op
)
17143 struct aarch64_address_info addr
;
17147 && is_a
<scalar_mode
> (GET_MODE (op
), &mode
)
17148 && aarch64_classify_address (&addr
, XEXP (op
, 0), mode
, false)
17149 && addr
.type
== ADDRESS_REG_IMM
17150 && offset_6bit_unsigned_scaled_p (mode
, addr
.const_offset
));
17153 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
17154 where the size of the read data is specified by `mode` and the size of the
17155 vector elements are specified by `elem_mode`. */
17157 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op
, machine_mode mode
,
17158 scalar_mode elem_mode
)
17160 struct aarch64_address_info addr
;
17162 || !aarch64_classify_address (&addr
, XEXP (op
, 0), elem_mode
, false))
17165 if (addr
.type
== ADDRESS_REG_IMM
)
17166 return offset_4bit_signed_scaled_p (mode
, addr
.const_offset
);
17168 if (addr
.type
== ADDRESS_REG_REG
)
17169 return (1U << addr
.shift
) == GET_MODE_SIZE (elem_mode
);
/* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction.  */
bool
aarch64_sve_ld1rq_operand_p (rtx op)
{
  return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
					    GET_MODE_INNER (GET_MODE (op)));
}

/* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
   accessing a vector where the element size is specified by `elem_mode`.  */
bool
aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
{
  return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
}
17190 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
17192 aarch64_sve_ldff1_operand_p (rtx op
)
17197 struct aarch64_address_info addr
;
17198 if (!aarch64_classify_address (&addr
, XEXP (op
, 0), GET_MODE (op
), false))
17201 if (addr
.type
== ADDRESS_REG_IMM
)
17202 return known_eq (addr
.const_offset
, 0);
17204 return addr
.type
== ADDRESS_REG_REG
;
17207 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
17209 aarch64_sve_ldnf1_operand_p (rtx op
)
17211 struct aarch64_address_info addr
;
17214 && aarch64_classify_address (&addr
, XEXP (op
, 0),
17215 GET_MODE (op
), false)
17216 && addr
.type
== ADDRESS_REG_IMM
);
17219 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
17220 The conditions for STR are the same. */
17222 aarch64_sve_ldr_operand_p (rtx op
)
17224 struct aarch64_address_info addr
;
17227 && aarch64_classify_address (&addr
, XEXP (op
, 0), GET_MODE (op
),
17228 false, ADDR_QUERY_ANY
)
17229 && addr
.type
== ADDRESS_REG_IMM
);
17232 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
17233 addressing memory of mode MODE. */
17235 aarch64_sve_prefetch_operand_p (rtx op
, machine_mode mode
)
17237 struct aarch64_address_info addr
;
17238 if (!aarch64_classify_address (&addr
, op
, mode
, false))
17241 if (addr
.type
== ADDRESS_REG_IMM
)
17242 return known_eq (addr
.const_offset
, 0);
17244 return addr
.type
== ADDRESS_REG_REG
;
17247 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
17248 We need to be able to access the individual pieces, so the range
17249 is different from LD[234] and ST[234]. */
17251 aarch64_sve_struct_memory_operand_p (rtx op
)
17256 machine_mode mode
= GET_MODE (op
);
17257 struct aarch64_address_info addr
;
17258 if (!aarch64_classify_address (&addr
, XEXP (op
, 0), SVE_BYTE_MODE
, false,
17260 || addr
.type
!= ADDRESS_REG_IMM
)
17263 poly_int64 first
= addr
.const_offset
;
17264 poly_int64 last
= first
+ GET_MODE_SIZE (mode
) - BYTES_PER_SVE_VECTOR
;
17265 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, first
)
17266 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, last
));
17269 /* Emit a register copy from operand to operand, taking care not to
17270 early-clobber source registers in the process.
17272 COUNT is the number of components into which the copy needs to be
17275 aarch64_simd_emit_reg_reg_move (rtx
*operands
, machine_mode mode
,
17276 unsigned int count
)
17279 int rdest
= REGNO (operands
[0]);
17280 int rsrc
= REGNO (operands
[1]);
17282 if (!reg_overlap_mentioned_p (operands
[0], operands
[1])
17284 for (i
= 0; i
< count
; i
++)
17285 emit_move_insn (gen_rtx_REG (mode
, rdest
+ i
),
17286 gen_rtx_REG (mode
, rsrc
+ i
));
17288 for (i
= 0; i
< count
; i
++)
17289 emit_move_insn (gen_rtx_REG (mode
, rdest
+ count
- i
- 1),
17290 gen_rtx_REG (mode
, rsrc
+ count
- i
- 1));
/* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
   one of VSTRUCT modes: OI, CI, or XI.  */
int
aarch64_simd_attr_length_rglist (machine_mode mode)
{
  /* This is only used (and only meaningful) for Advanced SIMD, not SVE.  */
  return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
}
17302 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
17303 alignment of a vector to 128 bits. SVE predicates have an alignment of
17305 static HOST_WIDE_INT
17306 aarch64_simd_vector_alignment (const_tree type
)
17308 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
17309 be set for non-predicate vectors of booleans. Modes are the most
17310 direct way we have of identifying real SVE predicate types. */
17311 if (GET_MODE_CLASS (TYPE_MODE (type
)) == MODE_VECTOR_BOOL
)
17313 widest_int min_size
17314 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type
)));
17315 return wi::umin (min_size
, 128).to_uhwi ();
17318 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
17320 aarch64_vectorize_preferred_vector_alignment (const_tree type
)
17322 if (aarch64_sve_data_mode_p (TYPE_MODE (type
)))
17324 /* If the length of the vector is fixed, try to align to that length,
17325 otherwise don't try to align at all. */
17326 HOST_WIDE_INT result
;
17327 if (!BITS_PER_SVE_VECTOR
.is_constant (&result
))
17328 result
= TYPE_ALIGN (TREE_TYPE (type
));
17331 return TYPE_ALIGN (type
);
17334 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
17336 aarch64_simd_vector_alignment_reachable (const_tree type
, bool is_packed
)
17341 /* For fixed-length vectors, check that the vectorizer will aim for
17342 full-vector alignment. This isn't true for generic GCC vectors
17343 that are wider than the ABI maximum of 128 bits. */
17344 poly_uint64 preferred_alignment
=
17345 aarch64_vectorize_preferred_vector_alignment (type
);
17346 if (TREE_CODE (TYPE_SIZE (type
)) == INTEGER_CST
17347 && maybe_ne (wi::to_widest (TYPE_SIZE (type
)),
17348 preferred_alignment
))
17351 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
17355 /* Return true if the vector misalignment factor is supported by the
17358 aarch64_builtin_support_vector_misalignment (machine_mode mode
,
17359 const_tree type
, int misalignment
,
17362 if (TARGET_SIMD
&& STRICT_ALIGNMENT
)
17364 /* Return if movmisalign pattern is not supported for this mode. */
17365 if (optab_handler (movmisalign_optab
, mode
) == CODE_FOR_nothing
)
17368 /* Misalignment factor is unknown at compile time. */
17369 if (misalignment
== -1)
17372 return default_builtin_support_vector_misalignment (mode
, type
, misalignment
,
17376 /* If VALS is a vector constant that can be loaded into a register
17377 using DUP, generate instructions to do so and return an RTX to
17378 assign to the register. Otherwise return NULL_RTX. */
17380 aarch64_simd_dup_constant (rtx vals
)
17382 machine_mode mode
= GET_MODE (vals
);
17383 machine_mode inner_mode
= GET_MODE_INNER (mode
);
17386 if (!const_vec_duplicate_p (vals
, &x
))
17389 /* We can load this constant by using DUP and a constant in a
17390 single ARM register. This will be cheaper than a vector
17392 x
= copy_to_mode_reg (inner_mode
, x
);
17393 return gen_vec_duplicate (mode
, x
);
17397 /* Generate code to load VALS, which is a PARALLEL containing only
17398 constants (for vec_init) or CONST_VECTOR, efficiently into a
17399 register. Returns an RTX to copy into the register, or NULL_RTX
17400 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
17402 aarch64_simd_make_constant (rtx vals
)
17404 machine_mode mode
= GET_MODE (vals
);
17406 rtx const_vec
= NULL_RTX
;
17410 if (GET_CODE (vals
) == CONST_VECTOR
)
17412 else if (GET_CODE (vals
) == PARALLEL
)
17414 /* A CONST_VECTOR must contain only CONST_INTs and
17415 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
17416 Only store valid constants in a CONST_VECTOR. */
17417 int n_elts
= XVECLEN (vals
, 0);
17418 for (i
= 0; i
< n_elts
; ++i
)
17420 rtx x
= XVECEXP (vals
, 0, i
);
17421 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
17424 if (n_const
== n_elts
)
17425 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0));
17428 gcc_unreachable ();
17430 if (const_vec
!= NULL_RTX
17431 && aarch64_simd_valid_immediate (const_vec
, NULL
))
17432 /* Load using MOVI/MVNI. */
17434 else if ((const_dup
= aarch64_simd_dup_constant (vals
)) != NULL_RTX
)
17435 /* Loaded using DUP. */
17437 else if (const_vec
!= NULL_RTX
)
17438 /* Load from constant pool. We cannot take advantage of single-cycle
17439 LD1 because we need a PC-relative addressing mode. */
17442 /* A PARALLEL containing something not valid inside CONST_VECTOR.
17443 We cannot construct an initializer. */
17447 /* Expand a vector initialisation sequence, such that TARGET is
17448 initialised to contain VALS. */
17451 aarch64_expand_vector_init (rtx target
, rtx vals
)
17453 machine_mode mode
= GET_MODE (target
);
17454 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
17455 /* The number of vector elements. */
17456 int n_elts
= XVECLEN (vals
, 0);
17457 /* The number of vector elements which are not constant. */
17459 rtx any_const
= NULL_RTX
;
17460 /* The first element of vals. */
17461 rtx v0
= XVECEXP (vals
, 0, 0);
17462 bool all_same
= true;
  /* This is a special vec_init<M><N> where N is not an element mode but a
     vector mode with half the elements of M.  We expect to find two entries
     of mode N in VALS and we must put their concatenation into TARGET.  */
17467 if (XVECLEN (vals
, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals
, 0, 0))))
17469 gcc_assert (known_eq (GET_MODE_SIZE (mode
),
17470 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals
, 0, 0)))));
17471 rtx lo
= XVECEXP (vals
, 0, 0);
17472 rtx hi
= XVECEXP (vals
, 0, 1);
17473 machine_mode narrow_mode
= GET_MODE (lo
);
17474 gcc_assert (GET_MODE_INNER (narrow_mode
) == inner_mode
);
17475 gcc_assert (narrow_mode
== GET_MODE (hi
));
17477 /* When we want to concatenate a half-width vector with zeroes we can
17478 use the aarch64_combinez[_be] patterns. Just make sure that the
17479 zeroes are in the right half. */
17480 if (BYTES_BIG_ENDIAN
17481 && aarch64_simd_imm_zero (lo
, narrow_mode
)
17482 && general_operand (hi
, narrow_mode
))
17483 emit_insn (gen_aarch64_combinez_be (narrow_mode
, target
, hi
, lo
));
17484 else if (!BYTES_BIG_ENDIAN
17485 && aarch64_simd_imm_zero (hi
, narrow_mode
)
17486 && general_operand (lo
, narrow_mode
))
17487 emit_insn (gen_aarch64_combinez (narrow_mode
, target
, lo
, hi
));
17490 /* Else create the two half-width registers and combine them. */
17492 lo
= force_reg (GET_MODE (lo
), lo
);
17494 hi
= force_reg (GET_MODE (hi
), hi
);
17496 if (BYTES_BIG_ENDIAN
)
17497 std::swap (lo
, hi
);
17498 emit_insn (gen_aarch64_simd_combine (narrow_mode
, target
, lo
, hi
));
17503 /* Count the number of variable elements to initialise. */
17504 for (int i
= 0; i
< n_elts
; ++i
)
17506 rtx x
= XVECEXP (vals
, 0, i
);
17507 if (!(CONST_INT_P (x
) || CONST_DOUBLE_P (x
)))
17512 all_same
&= rtx_equal_p (x
, v0
);
17515 /* No variable elements, hand off to aarch64_simd_make_constant which knows
17516 how best to handle this. */
17519 rtx constant
= aarch64_simd_make_constant (vals
);
17520 if (constant
!= NULL_RTX
)
17522 emit_move_insn (target
, constant
);
17527 /* Splat a single non-constant element if we can. */
17530 rtx x
= copy_to_mode_reg (inner_mode
, v0
);
17531 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
17535 enum insn_code icode
= optab_handler (vec_set_optab
, mode
);
17536 gcc_assert (icode
!= CODE_FOR_nothing
);
17538 /* If there are only variable elements, try to optimize
17539 the insertion using dup for the most common element
17540 followed by insertions. */
17542 /* The algorithm will fill matches[*][0] with the earliest matching element,
17543 and matches[X][1] with the count of duplicate elements (if X is the
17544 earliest element which has duplicates). */
17546 if (n_var
== n_elts
&& n_elts
<= 16)
17548 int matches
[16][2] = {0};
17549 for (int i
= 0; i
< n_elts
; i
++)
17551 for (int j
= 0; j
<= i
; j
++)
17553 if (rtx_equal_p (XVECEXP (vals
, 0, i
), XVECEXP (vals
, 0, j
)))
17561 int maxelement
= 0;
17563 for (int i
= 0; i
< n_elts
; i
++)
17564 if (matches
[i
][1] > maxv
)
17567 maxv
= matches
[i
][1];
17570 /* Create a duplicate of the most common element, unless all elements
17571 are equally useless to us, in which case just immediately set the
17572 vector register using the first element. */
17576 /* For vectors of two 64-bit elements, we can do even better. */
17578 && (inner_mode
== E_DImode
17579 || inner_mode
== E_DFmode
))
17582 rtx x0
= XVECEXP (vals
, 0, 0);
17583 rtx x1
= XVECEXP (vals
, 0, 1);
17584 /* Combine can pick up this case, but handling it directly
17585 here leaves clearer RTL.
17587 This is load_pair_lanes<mode>, and also gives us a clean-up
17588 for store_pair_lanes<mode>. */
17589 if (memory_operand (x0
, inner_mode
)
17590 && memory_operand (x1
, inner_mode
)
17591 && !STRICT_ALIGNMENT
17592 && rtx_equal_p (XEXP (x1
, 0),
17593 plus_constant (Pmode
,
17595 GET_MODE_SIZE (inner_mode
))))
17598 if (inner_mode
== DFmode
)
17599 t
= gen_load_pair_lanesdf (target
, x0
, x1
);
17601 t
= gen_load_pair_lanesdi (target
, x0
, x1
);
17606 /* The subreg-move sequence below will move into lane zero of the
17607 vector register. For big-endian we want that position to hold
17608 the last element of VALS. */
17609 maxelement
= BYTES_BIG_ENDIAN
? n_elts
- 1 : 0;
17610 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
17611 aarch64_emit_move (target
, lowpart_subreg (mode
, x
, inner_mode
));
17615 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
17616 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
17619 /* Insert the rest. */
17620 for (int i
= 0; i
< n_elts
; i
++)
17622 rtx x
= XVECEXP (vals
, 0, i
);
17623 if (matches
[i
][0] == maxelement
)
17625 x
= copy_to_mode_reg (inner_mode
, x
);
17626 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
17631 /* Initialise a vector which is part-variable. We want to first try
17632 to build those lanes which are constant in the most efficient way we
17634 if (n_var
!= n_elts
)
17636 rtx copy
= copy_rtx (vals
);
17638 /* Load constant part of vector. We really don't care what goes into the
17639 parts we will overwrite, but we're more likely to be able to load the
17640 constant efficiently if it has fewer, larger, repeating parts
17641 (see aarch64_simd_valid_immediate). */
17642 for (int i
= 0; i
< n_elts
; i
++)
17644 rtx x
= XVECEXP (vals
, 0, i
);
17645 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
17647 rtx subst
= any_const
;
17648 for (int bit
= n_elts
/ 2; bit
> 0; bit
/= 2)
17650 /* Look in the copied vector, as more elements are const. */
17651 rtx test
= XVECEXP (copy
, 0, i
^ bit
);
17652 if (CONST_INT_P (test
) || CONST_DOUBLE_P (test
))
17658 XVECEXP (copy
, 0, i
) = subst
;
17660 aarch64_expand_vector_init (target
, copy
);
17663 /* Insert the variable lanes directly. */
17664 for (int i
= 0; i
< n_elts
; i
++)
17666 rtx x
= XVECEXP (vals
, 0, i
);
17667 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
17669 x
= copy_to_mode_reg (inner_mode
, x
);
17670 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
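
/* Minimal sketch (not GCC code; the helper name is hypothetical) of the
   duplicate-counting step above: matches[i][0] records the earliest element
   equal to element I, and matches[j][1] counts how many later elements match
   element J, so the most common element can be chosen for the initial DUP.  */
#if 0
static int
most_common_element (const int *vals, int n)	/* Assumes N <= 16.  */
{
  int matches[16][2] = { 0 };
  for (int i = 0; i < n; i++)
    for (int j = 0; j <= i; j++)
      if (vals[i] == vals[j])
	{
	  matches[i][0] = j;	/* Earliest element equal to element I.  */
	  matches[j][1]++;	/* Count of duplicates of element J.  */
	  break;
	}

  int maxelement = 0, maxv = 0;
  for (int i = 0; i < n; i++)
    if (matches[i][1] > maxv)
      {
	maxelement = i;
	maxv = matches[i][1];
      }
  return maxelement;
}
#endif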
17674 /* Emit RTL corresponding to:
17675 insr TARGET, ELEM. */
17678 emit_insr (rtx target
, rtx elem
)
17680 machine_mode mode
= GET_MODE (target
);
17681 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
17682 elem
= force_reg (elem_mode
, elem
);
17684 insn_code icode
= optab_handler (vec_shl_insert_optab
, mode
);
17685 gcc_assert (icode
!= CODE_FOR_nothing
);
17686 emit_insn (GEN_FCN (icode
) (target
, target
, elem
));
17689 /* Subroutine of aarch64_sve_expand_vector_init for handling
17690 trailing constants.
17691 This function works as follows:
17692 (a) Create a new vector consisting of trailing constants.
17693 (b) Initialize TARGET with the constant vector using emit_move_insn.
17694 (c) Insert remaining elements in TARGET using insr.
   NELTS is the total number of elements in the original vector, while
   NELTS_REQD is the number of elements that are actually significant.

   ??? The heuristic used is to do the above only if the number of constants
   is at least half the total number of elements.  May need fine tuning.  */
17703 aarch64_sve_expand_vector_init_handle_trailing_constants
17704 (rtx target
, const rtx_vector_builder
&builder
, int nelts
, int nelts_reqd
)
17706 machine_mode mode
= GET_MODE (target
);
17707 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
17708 int n_trailing_constants
= 0;
17710 for (int i
= nelts_reqd
- 1;
17711 i
>= 0 && aarch64_legitimate_constant_p (elem_mode
, builder
.elt (i
));
17713 n_trailing_constants
++;
17715 if (n_trailing_constants
>= nelts_reqd
/ 2)
17717 rtx_vector_builder
v (mode
, 1, nelts
);
17718 for (int i
= 0; i
< nelts
; i
++)
17719 v
.quick_push (builder
.elt (i
+ nelts_reqd
- n_trailing_constants
));
17720 rtx const_vec
= v
.build ();
17721 emit_move_insn (target
, const_vec
);
17723 for (int i
= nelts_reqd
- n_trailing_constants
- 1; i
>= 0; i
--)
17724 emit_insr (target
, builder
.elt (i
));
17732 /* Subroutine of aarch64_sve_expand_vector_init.
17734 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
17735 (b) Skip trailing elements from BUILDER, which are the same as
17736 element NELTS_REQD - 1.
17737 (c) Insert earlier elements in reverse order in TARGET using insr. */
17740 aarch64_sve_expand_vector_init_insert_elems (rtx target
,
17741 const rtx_vector_builder
&builder
,
17744 machine_mode mode
= GET_MODE (target
);
17745 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
17747 struct expand_operand ops
[2];
17748 enum insn_code icode
= optab_handler (vec_duplicate_optab
, mode
);
17749 gcc_assert (icode
!= CODE_FOR_nothing
);
17751 create_output_operand (&ops
[0], target
, mode
);
17752 create_input_operand (&ops
[1], builder
.elt (nelts_reqd
- 1), elem_mode
);
17753 expand_insn (icode
, 2, ops
);
17755 int ndups
= builder
.count_dups (nelts_reqd
- 1, -1, -1);
17756 for (int i
= nelts_reqd
- ndups
- 1; i
>= 0; i
--)
17757 emit_insr (target
, builder
.elt (i
));
17760 /* Subroutine of aarch64_sve_expand_vector_init to handle case
17761 when all trailing elements of builder are same.
17762 This works as follows:
17763 (a) Use expand_insn interface to broadcast last vector element in TARGET.
17764 (b) Insert remaining elements in TARGET using insr.
17766 ??? The heuristic used is to do above if number of same trailing elements
17767 is at least 3/4 of total number of elements, loosely based on
17768 heuristic from mostly_zeros_p. May need fine-tuning. */
17771 aarch64_sve_expand_vector_init_handle_trailing_same_elem
17772 (rtx target
, const rtx_vector_builder
&builder
, int nelts_reqd
)
17774 int ndups
= builder
.count_dups (nelts_reqd
- 1, -1, -1);
17775 if (ndups
>= (3 * nelts_reqd
) / 4)
17777 aarch64_sve_expand_vector_init_insert_elems (target
, builder
,
17778 nelts_reqd
- ndups
+ 1);
17785 /* Initialize register TARGET from BUILDER. NELTS is the constant number
17786 of elements in BUILDER.
17788 The function tries to initialize TARGET from BUILDER if it fits one
17789 of the special cases outlined below.
17791 Failing that, the function divides BUILDER into two sub-vectors:
17792 v_even = even elements of BUILDER;
17793 v_odd = odd elements of BUILDER;
17795 and recursively calls itself with v_even and v_odd.
17797 if (recursive call succeeded for v_even or v_odd)
17798 TARGET = zip (v_even, v_odd)
17800 The function returns true if it managed to build TARGET from BUILDER
17801 with one of the special cases, false otherwise.
17803 Example: {a, 1, b, 2, c, 3, d, 4}
17805 The vector gets divided into:
17806 v_even = {a, b, c, d}
17807 v_odd = {1, 2, 3, 4}
17809 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
17810 initialize tmp2 from constant vector v_odd using emit_move_insn.
17812 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
17813 4 elements, so we construct tmp1 from v_even using insr:
17820 TARGET = zip (tmp1, tmp2)
17821 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
17824 aarch64_sve_expand_vector_init (rtx target
, const rtx_vector_builder
&builder
,
17825 int nelts
, int nelts_reqd
)
17827 machine_mode mode
= GET_MODE (target
);
17829 /* Case 1: Vector contains trailing constants. */
17831 if (aarch64_sve_expand_vector_init_handle_trailing_constants
17832 (target
, builder
, nelts
, nelts_reqd
))
17835 /* Case 2: Vector contains leading constants. */
17837 rtx_vector_builder
rev_builder (mode
, 1, nelts_reqd
);
17838 for (int i
= 0; i
< nelts_reqd
; i
++)
17839 rev_builder
.quick_push (builder
.elt (nelts_reqd
- i
- 1));
17840 rev_builder
.finalize ();
17842 if (aarch64_sve_expand_vector_init_handle_trailing_constants
17843 (target
, rev_builder
, nelts
, nelts_reqd
))
17845 emit_insn (gen_aarch64_sve_rev (mode
, target
, target
));
17849 /* Case 3: Vector contains trailing same element. */
17851 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
17852 (target
, builder
, nelts_reqd
))
17855 /* Case 4: Vector contains leading same element. */
17857 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
17858 (target
, rev_builder
, nelts_reqd
) && nelts_reqd
== nelts
)
17860 emit_insn (gen_aarch64_sve_rev (mode
, target
, target
));
17864 /* Avoid recursing below 4-elements.
17865 ??? The threshold 4 may need fine-tuning. */
17867 if (nelts_reqd
<= 4)
17870 rtx_vector_builder
v_even (mode
, 1, nelts
);
17871 rtx_vector_builder
v_odd (mode
, 1, nelts
);
17873 for (int i
= 0; i
< nelts
* 2; i
+= 2)
17875 v_even
.quick_push (builder
.elt (i
));
17876 v_odd
.quick_push (builder
.elt (i
+ 1));
17879 v_even
.finalize ();
17882 rtx tmp1
= gen_reg_rtx (mode
);
17883 bool did_even_p
= aarch64_sve_expand_vector_init (tmp1
, v_even
,
17884 nelts
, nelts_reqd
/ 2);
17886 rtx tmp2
= gen_reg_rtx (mode
);
17887 bool did_odd_p
= aarch64_sve_expand_vector_init (tmp2
, v_odd
,
17888 nelts
, nelts_reqd
/ 2);
17890 if (!did_even_p
&& !did_odd_p
)
17893 /* Initialize v_even and v_odd using INSR if it didn't match any of the
17894 special cases and zip v_even, v_odd. */
17897 aarch64_sve_expand_vector_init_insert_elems (tmp1
, v_even
, nelts_reqd
/ 2);
17900 aarch64_sve_expand_vector_init_insert_elems (tmp2
, v_odd
, nelts_reqd
/ 2);
17902 rtvec v
= gen_rtvec (2, tmp1
, tmp2
);
17903 emit_set_insn (target
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
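
/* Minimal sketch (not GCC code; the helper name is hypothetical) of the
   even/odd recombination step above: the even- and odd-indexed elements are
   initialised separately and then interleaved again with a ZIP1-style
   operation.  */
#if 0
static void
zip_even_odd (const int *even, const int *odd, int half, int *out)
{
  for (int i = 0; i < half; i++)
    {
      out[2 * i] = even[i];	/* ZIP1 interleaves the two half-vectors.  */
      out[2 * i + 1] = odd[i];
    }
}
#endif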
17907 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
17910 aarch64_sve_expand_vector_init (rtx target
, rtx vals
)
17912 machine_mode mode
= GET_MODE (target
);
17913 int nelts
= XVECLEN (vals
, 0);
17915 rtx_vector_builder
v (mode
, 1, nelts
);
17916 for (int i
= 0; i
< nelts
; i
++)
17917 v
.quick_push (XVECEXP (vals
, 0, i
));
17920 /* If neither sub-vectors of v could be initialized specially,
17921 then use INSR to insert all elements from v into TARGET.
17922 ??? This might not be optimal for vectors with large
17923 initializers like 16-element or above.
17924 For nelts < 4, it probably isn't useful to handle specially. */
17927 || !aarch64_sve_expand_vector_init (target
, v
, nelts
, nelts
))
17928 aarch64_sve_expand_vector_init_insert_elems (target
, v
, nelts
);
17931 /* Check whether VALUE is a vector constant in which every element
17932 is either a power of 2 or a negated power of 2. If so, return
17933 a constant vector of log2s, and flip CODE between PLUS and MINUS
17934 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
17937 aarch64_convert_mult_to_shift (rtx value
, rtx_code
&code
)
17939 if (GET_CODE (value
) != CONST_VECTOR
)
17942 rtx_vector_builder builder
;
17943 if (!builder
.new_unary_operation (GET_MODE (value
), value
, false))
17946 scalar_mode int_mode
= GET_MODE_INNER (GET_MODE (value
));
17947 /* 1 if the result of the multiplication must be negated,
17948 0 if it mustn't, or -1 if we don't yet care. */
17950 unsigned int encoded_nelts
= const_vector_encoded_nelts (value
);
17951 for (unsigned int i
= 0; i
< encoded_nelts
; ++i
)
17953 rtx elt
= CONST_VECTOR_ENCODED_ELT (value
, i
);
17954 if (!CONST_SCALAR_INT_P (elt
))
17956 rtx_mode_t
val (elt
, int_mode
);
17957 wide_int pow2
= wi::neg (val
);
17960 /* It matters whether we negate or not. Make that choice,
17961 and make sure that it's consistent with previous elements. */
17962 if (negate
== !wi::neg_p (val
))
17964 negate
= wi::neg_p (val
);
17968 /* POW2 is now the value that we want to be a power of 2. */
17969 int shift
= wi::exact_log2 (pow2
);
17972 builder
.quick_push (gen_int_mode (shift
, int_mode
));
17975 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
17977 else if (negate
== 1)
17978 code
= code
== PLUS
? MINUS
: PLUS
;
17979 return builder
.build ();
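
/* Minimal scalar sketch (not GCC code; the function name is hypothetical) of
   the rewrite performed above: X * (1 << N) + A becomes (X << N) + A, and
   X * -(1 << N) + A becomes A - (X << N), i.e. PLUS is flipped to MINUS when
   the multiplier is a negated power of 2.  */
#if 0
static long long
fma_by_pow2 (long long x, long long addend, int shift, int negate)
{
  long long product = x << shift;	/* X * (1 << SHIFT) as a shift.  */
  return negate ? addend - product : addend + product;
}
#endif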
17982 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
17983 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
17984 operands array, in the same order as for fma_optab. Return true if
17985 the function emitted all the necessary instructions, false if the caller
17986 should generate the pattern normally with the new OPERANDS array. */
17989 aarch64_prepare_sve_int_fma (rtx
*operands
, rtx_code code
)
17991 machine_mode mode
= GET_MODE (operands
[0]);
17992 if (rtx shifts
= aarch64_convert_mult_to_shift (operands
[2], code
))
17994 rtx product
= expand_binop (mode
, vashl_optab
, operands
[1], shifts
,
17995 NULL_RTX
, true, OPTAB_DIRECT
);
17996 force_expand_binop (mode
, code
== PLUS
? add_optab
: sub_optab
,
17997 operands
[3], product
, operands
[0], true,
18001 operands
[2] = force_reg (mode
, operands
[2]);
18005 /* Likewise, but for a conditional pattern. */
18008 aarch64_prepare_sve_cond_int_fma (rtx
*operands
, rtx_code code
)
18010 machine_mode mode
= GET_MODE (operands
[0]);
18011 if (rtx shifts
= aarch64_convert_mult_to_shift (operands
[3], code
))
18013 rtx product
= expand_binop (mode
, vashl_optab
, operands
[2], shifts
,
18014 NULL_RTX
, true, OPTAB_DIRECT
);
18015 emit_insn (gen_cond (code
, mode
, operands
[0], operands
[1],
18016 operands
[4], product
, operands
[5]));
18019 operands
[3] = force_reg (mode
, operands
[3]);
static unsigned HOST_WIDE_INT
aarch64_shift_truncation_mask (machine_mode mode)
{
  if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
    return 0;
  return GET_MODE_UNIT_BITSIZE (mode) - 1;
}
18031 /* Select a format to encode pointers in exception handling data. */
18033 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED
, int global
)
18036 switch (aarch64_cmodel
)
18038 case AARCH64_CMODEL_TINY
:
18039 case AARCH64_CMODEL_TINY_PIC
:
18040 case AARCH64_CMODEL_SMALL
:
18041 case AARCH64_CMODEL_SMALL_PIC
:
18042 case AARCH64_CMODEL_SMALL_SPIC
:
18043 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
18045 type
= DW_EH_PE_sdata4
;
18048 /* No assumptions here. 8-byte relocs required. */
18049 type
= DW_EH_PE_sdata8
;
18052 return (global
? DW_EH_PE_indirect
: 0) | DW_EH_PE_pcrel
| type
;
/* Output .variant_pcs for aarch64_vector_pcs function symbols.  */

static void
aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
{
  if (TREE_CODE (decl) == FUNCTION_DECL)
    {
      arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
      if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
	{
	  fprintf (stream, "\t.variant_pcs\t");
	  assemble_name (stream, name);
	  fprintf (stream, "\n");
	}
    }
}
18072 /* The last .arch and .tune assembly strings that we printed. */
18073 static std::string aarch64_last_printed_arch_string
;
18074 static std::string aarch64_last_printed_tune_string
;
18076 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
18077 by the function fndecl. */
18080 aarch64_declare_function_name (FILE *stream
, const char* name
,
18083 tree target_parts
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
18085 struct cl_target_option
*targ_options
;
18087 targ_options
= TREE_TARGET_OPTION (target_parts
);
18089 targ_options
= TREE_TARGET_OPTION (target_option_current_node
);
18090 gcc_assert (targ_options
);
18092 const struct processor
*this_arch
18093 = aarch64_get_arch (targ_options
->x_explicit_arch
);
18095 uint64_t isa_flags
= targ_options
->x_aarch64_isa_flags
;
18096 std::string extension
18097 = aarch64_get_extension_string_for_isa_flags (isa_flags
,
18099 /* Only update the assembler .arch string if it is distinct from the last
18100 such string we printed. */
18101 std::string to_print
= this_arch
->name
+ extension
;
18102 if (to_print
!= aarch64_last_printed_arch_string
)
18104 asm_fprintf (asm_out_file
, "\t.arch %s\n", to_print
.c_str ());
18105 aarch64_last_printed_arch_string
= to_print
;
18108 /* Print the cpu name we're tuning for in the comments, might be
18109 useful to readers of the generated asm. Do it only when it changes
18110 from function to function and verbose assembly is requested. */
18111 const struct processor
*this_tune
18112 = aarch64_get_tune_cpu (targ_options
->x_explicit_tune_core
);
18114 if (flag_debug_asm
&& aarch64_last_printed_tune_string
!= this_tune
->name
)
18116 asm_fprintf (asm_out_file
, "\t" ASM_COMMENT_START
".tune %s\n",
18118 aarch64_last_printed_tune_string
= this_tune
->name
;
18121 aarch64_asm_output_variant_pcs (stream
, fndecl
, name
);
18123 /* Don't forget the type directive for ELF. */
18124 ASM_OUTPUT_TYPE_DIRECTIVE (stream
, name
, "function");
18125 ASM_OUTPUT_LABEL (stream
, name
);
/* Implement ASM_OUTPUT_DEF_FROM_DECLS.  Output .variant_pcs for aliases.  */

void
aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
{
  const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
  const char *value = IDENTIFIER_POINTER (target);
  aarch64_asm_output_variant_pcs (stream, decl, name);
  ASM_OUTPUT_DEF (stream, name, value);
}

/* Implement ASM_OUTPUT_EXTERNAL.  Output .variant_pcs for undefined
   function symbol references.  */
void
aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
{
  default_elf_asm_output_external (stream, decl, name);
  aarch64_asm_output_variant_pcs (stream, decl, name);
}

/* Triggered after a .cfi_startproc directive is emitted into the assembly file.
   Used to output the .cfi_b_key_frame directive when signing the current
   function with the B key.  */
void
aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
{
  if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
      && aarch64_ra_sign_key == AARCH64_KEY_B)
    asm_fprintf (f, "\t.cfi_b_key_frame\n");
}

/* Implements TARGET_ASM_FILE_START.  Output the assembly header.  */
static void
aarch64_start_file (void)
{
  struct cl_target_option *default_options
    = TREE_TARGET_OPTION (target_option_default_node);

  const struct processor *default_arch
    = aarch64_get_arch (default_options->x_explicit_arch);
  uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
  std::string extension
    = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
						  default_arch->flags);

  aarch64_last_printed_arch_string = default_arch->name + extension;
  aarch64_last_printed_tune_string = "";
  asm_fprintf (asm_out_file, "\t.arch %s\n",
	       aarch64_last_printed_arch_string.c_str ());

  default_file_start ();
}

/* Emit load exclusive.  */

static void
aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
			     rtx mem, rtx model_rtx)
{
  if (mode == TImode)
    emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
						gen_highpart (DImode, rval),
						mem, model_rtx));
  else
    emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
}

/* Emit store exclusive.  */

static void
aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
			      rtx mem, rtx rval, rtx model_rtx)
{
  if (mode == TImode)
    emit_insn (gen_aarch64_store_exclusive_pair
	       (bval, mem, operand_subword (rval, 0, 0, TImode),
		operand_subword (rval, 1, 0, TImode), model_rtx));
  else
    emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
}

/* Mark the previous jump instruction as unlikely.  */

static void
aarch64_emit_unlikely_jump (rtx insn)
{
  rtx_insn *jump = emit_jump_insn (insn);
  add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
}

/* We store the names of the various atomic helpers in a 5x4 array.
   Return the libcall function given MODE, MODEL and NAMES.  */

rtx
aarch64_atomic_ool_func (machine_mode mode, rtx model_rtx,
			 const atomic_ool_names *names)
{
  memmodel model = memmodel_base (INTVAL (model_rtx));
  int mode_idx, model_idx;

  switch (mode)
    {
    case E_QImode:
      mode_idx = 0;
      break;
    case E_HImode:
      mode_idx = 1;
      break;
    case E_SImode:
      mode_idx = 2;
      break;
    case E_DImode:
      mode_idx = 3;
      break;
    case E_TImode:
      mode_idx = 4;
      break;
    default:
      gcc_unreachable ();
    }

  switch (model)
    {
    case MEMMODEL_RELAXED:
      model_idx = 0;
      break;
    case MEMMODEL_CONSUME:
    case MEMMODEL_ACQUIRE:
      model_idx = 1;
      break;
    case MEMMODEL_RELEASE:
      model_idx = 2;
      break;
    case MEMMODEL_ACQ_REL:
    case MEMMODEL_SEQ_CST:
      model_idx = 3;
      break;
    default:
      gcc_unreachable ();
    }

  return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
				      VISIBILITY_HIDDEN);
}

#define DEF0(B, N) \
  { "__aarch64_" #B #N "_relax", \
    "__aarch64_" #B #N "_acq", \
    "__aarch64_" #B #N "_rel", \
    "__aarch64_" #B #N "_acq_rel" }

#define DEF4(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
		 { NULL, NULL, NULL, NULL }
#define DEF5(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)

static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };

#undef DEF0
#undef DEF4
#undef DEF5
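
/* Illustrative expansion: DEF0 (cas, 4) yields the entries
   "__aarch64_cas4_relax", "__aarch64_cas4_acq", "__aarch64_cas4_rel" and
   "__aarch64_cas4_acq_rel", so aarch64_atomic_ool_func above picks e.g.
   str[2][1] == "__aarch64_cas4_acq" for an SImode compare-and-swap with
   MEMMODEL_ACQUIRE.  */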

/* Expand a compare and swap pattern.  */

void
aarch64_expand_compare_and_swap (rtx operands[])
{
  rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
  machine_mode mode, r_mode;

  bval = operands[0];
  rval = operands[1];
  mem = operands[2];
  oldval = operands[3];
  newval = operands[4];
  is_weak = operands[5];
  mod_s = operands[6];
  mod_f = operands[7];
  mode = GET_MODE (mem);

  /* Normally the succ memory model must be stronger than fail, but in the
     unlikely event of fail being ACQUIRE and succ being RELEASE we need to
     promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
  if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
      && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
    mod_s = GEN_INT (MEMMODEL_ACQ_REL);

  r_mode = mode;
  if (mode == QImode || mode == HImode)
    {
      r_mode = SImode;
      rval = gen_reg_rtx (r_mode);
    }

  if (TARGET_LSE)
    {
      /* The CAS insn requires oldval and rval overlap, but we need to
	 have a copy of oldval saved across the operation to tell if
	 the operation is successful.  */
      if (reg_overlap_mentioned_p (rval, oldval))
	rval = copy_to_mode_reg (r_mode, oldval);
      else
	emit_move_insn (rval, gen_lowpart (r_mode, oldval));

      emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
						   newval, mod_s));
      cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
    }
  else if (TARGET_OUTLINE_ATOMICS)
    {
      /* Oldval must satisfy compare afterward.  */
      if (!aarch64_plus_operand (oldval, mode))
	oldval = force_reg (mode, oldval);
      rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
      rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
				      oldval, mode, newval, mode,
				      XEXP (mem, 0), Pmode);
      cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
    }
  else
    {
      /* The oldval predicate varies by mode.  Test it and force to reg.  */
      insn_code code = code_for_aarch64_compare_and_swap (mode);
      if (!insn_data[code].operand[2].predicate (oldval, mode))
	oldval = force_reg (mode, oldval);

      emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
				 is_weak, mod_s, mod_f));
      cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
    }

  if (r_mode != mode)
    rval = gen_lowpart (mode, rval);
  emit_move_insn (operands[1], rval);

  x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
  emit_insn (gen_rtx_SET (bval, x));
}

/* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
   sequence implementing an atomic operation.  */

static void
aarch64_emit_post_barrier (enum memmodel model)
{
  const enum memmodel base_model = memmodel_base (model);

  if (is_mm_sync (model)
      && (base_model == MEMMODEL_ACQUIRE
	  || base_model == MEMMODEL_ACQ_REL
	  || base_model == MEMMODEL_SEQ_CST))
    {
      emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
    }
}

/* Split a compare and swap pattern.  */

void
aarch64_split_compare_and_swap (rtx operands[])
{
  /* Split after prolog/epilog to avoid interactions with shrinkwrapping.  */
  gcc_assert (epilogue_completed);

  rtx rval, mem, oldval, newval, scratch, x, model_rtx;
  machine_mode mode;
  bool is_weak;
  rtx_code_label *label1, *label2;
  enum memmodel model;

  rval = operands[0];
  mem = operands[1];
  oldval = operands[2];
  newval = operands[3];
  is_weak = (operands[4] != const0_rtx);
  model_rtx = operands[5];
  scratch = operands[7];
  mode = GET_MODE (mem);
  model = memmodel_from_int (INTVAL (model_rtx));

  /* When OLDVAL is zero and we want the strong version we can emit a tighter
    loop:
    .label1:
	LD[A]XR	rval, [mem]
	CBNZ	rval, .label2
	ST[L]XR	scratch, newval, [mem]
	CBNZ	scratch, .label1
    .label2:
	CMP	rval, 0.  */
  bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
			oldval == const0_rtx && mode != TImode);

  label1 = NULL;
  if (!is_weak)
    {
      label1 = gen_label_rtx ();
      emit_label (label1);
    }
  label2 = gen_label_rtx ();

  /* The initial load can be relaxed for a __sync operation since a final
     barrier will be emitted to stop code hoisting.  */
  if (is_mm_sync (model))
    aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
  else
    aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);

  if (strong_zero_p)
    x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
  else
    {
      rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
      x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
    }
  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
			    gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));

  aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);

  if (!is_weak)
    {
      if (aarch64_track_speculation)
	{
	  /* Emit an explicit compare instruction, so that we can correctly
	     track the condition codes.  */
	  rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
	  x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
	}
      else
	x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);

      x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
				gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
      aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
    }
  else
    aarch64_gen_compare_reg (NE, scratch, const0_rtx);

  emit_label (label2);

  /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
     to set the condition flags.  If this is not used it will be removed by
     later passes.  */
  if (strong_zero_p)
    aarch64_gen_compare_reg (NE, rval, const0_rtx);

  /* Emit any final barrier needed for a __sync operation.  */
  if (is_mm_sync (model))
    aarch64_emit_post_barrier (model);
}

/* Split an atomic operation.  */

void
aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
			 rtx value, rtx model_rtx, rtx cond)
{
  /* Split after prolog/epilog to avoid interactions with shrinkwrapping.  */
  gcc_assert (epilogue_completed);

  machine_mode mode = GET_MODE (mem);
  machine_mode wmode = (mode == DImode ? DImode : SImode);
  const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
  const bool is_sync = is_mm_sync (model);
  rtx_code_label *label;
  rtx x;

  /* Split the atomic operation into a sequence.  */
  label = gen_label_rtx ();
  emit_label (label);

  if (new_out)
    new_out = gen_lowpart (wmode, new_out);
  if (old_out)
    old_out = gen_lowpart (wmode, old_out);
  else
    old_out = new_out;
  value = simplify_gen_subreg (wmode, value, mode, 0);

  /* The initial load can be relaxed for a __sync operation since a final
     barrier will be emitted to stop code hoisting.  */
  if (is_sync)
    aarch64_emit_load_exclusive (mode, old_out, mem,
				 GEN_INT (MEMMODEL_RELAXED));
  else
    aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);

  switch (code)
    {
    case SET:
      new_out = value;
      break;

    case NOT:
      x = gen_rtx_AND (wmode, old_out, value);
      emit_insn (gen_rtx_SET (new_out, x));
      x = gen_rtx_NOT (wmode, new_out);
      emit_insn (gen_rtx_SET (new_out, x));
      break;

    case MINUS:
      if (CONST_INT_P (value))
	{
	  value = GEN_INT (-INTVAL (value));
	  code = PLUS;
	}
      /* Fall through.  */

    default:
      x = gen_rtx_fmt_ee (code, wmode, old_out, value);
      emit_insn (gen_rtx_SET (new_out, x));
      break;
    }

  aarch64_emit_store_exclusive (mode, cond, mem,
				gen_lowpart (mode, new_out), model_rtx);

  if (aarch64_track_speculation)
    {
      /* Emit an explicit compare instruction, so that we can correctly
	 track the condition codes.  */
      rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
      x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
    }
  else
    x = gen_rtx_NE (VOIDmode, cond, const0_rtx);

  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
			    gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));

  /* Emit any final barrier needed for a __sync operation.  */
  if (is_sync)
    aarch64_emit_post_barrier (model);
}

static void
aarch64_init_libfuncs (void)
{
  /* Half-precision float operations.  The compiler handles all operations
     with NULL libfuncs by converting to SFmode.  */

  /* Conversions.  */
  set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
  set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");

  /* Arithmetic.  */
  set_optab_libfunc (add_optab, HFmode, NULL);
  set_optab_libfunc (sdiv_optab, HFmode, NULL);
  set_optab_libfunc (smul_optab, HFmode, NULL);
  set_optab_libfunc (neg_optab, HFmode, NULL);
  set_optab_libfunc (sub_optab, HFmode, NULL);

  /* Comparisons.  */
  set_optab_libfunc (eq_optab, HFmode, NULL);
  set_optab_libfunc (ne_optab, HFmode, NULL);
  set_optab_libfunc (lt_optab, HFmode, NULL);
  set_optab_libfunc (le_optab, HFmode, NULL);
  set_optab_libfunc (ge_optab, HFmode, NULL);
  set_optab_libfunc (gt_optab, HFmode, NULL);
  set_optab_libfunc (unord_optab, HFmode, NULL);
}
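
/* Illustrative note: on a target without native FP16 arithmetic, an
   addition of two __fp16 values is therefore expected to widen each operand
   via __gnu_h2f_ieee, add in SFmode, and narrow the result back with
   __gnu_f2h_ieee, because the corresponding optab libfuncs above are NULL.  */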

/* Target hook for c_mode_for_suffix.  */
static machine_mode
aarch64_c_mode_for_suffix (char suffix)
{
  if (suffix == 'q')
    return TFmode;

  return VOIDmode;
}

/* We can only represent floating point constants which will fit in
   "quarter-precision" values.  These values are characterised by
   a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
   by:

   (-1)^s * (n/16) * 2^r

   Where:
     's' is the sign bit.
     'n' is an integer in the range 16 <= n <= 31.
     'r' is an integer in the range -3 <= r <= 4.  */
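
/* Worked example: 1.0 is encodable as (16/16) * 2^0 and 0.5 as
   (16/16) * 2^-1; the largest representable value is (31/16) * 2^4 = 31.0
   and the smallest positive one is (16/16) * 2^-3 = 0.125.  Values such as
   0.1 fall outside this set and must be loaded some other way.  */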

/* Return true iff X can be represented by a quarter-precision
   floating point immediate operand X.  Note, we cannot represent 0.0.  */
bool
aarch64_float_const_representable_p (rtx x)
{
  /* This represents our current view of how many bits
     make up the mantissa.  */
  int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
  int exponent;
  unsigned HOST_WIDE_INT mantissa, mask;
  REAL_VALUE_TYPE r, m;
  bool fail;

  x = unwrap_const_vec_duplicate (x);
  if (!CONST_DOUBLE_P (x))
    return false;

  if (GET_MODE (x) == VOIDmode
      || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
    return false;

  r = *CONST_DOUBLE_REAL_VALUE (x);

  /* We cannot represent infinities, NaNs or +/-zero.  We won't
     know if we have +zero until we analyse the mantissa, but we
     can reject the other invalid values.  */
  if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
      || REAL_VALUE_MINUS_ZERO (r))
    return false;

  /* Extract exponent.  */
  r = real_value_abs (&r);
  exponent = REAL_EXP (&r);

  /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
     highest (sign) bit, with a fixed binary point at bit point_pos.
     m1 holds the low part of the mantissa, m2 the high part.
     WARNING: If we ever have a representation using more than 2 * H_W_I - 1
     bits for the mantissa, this can fail (low bits will be lost).  */
  real_ldexp (&m, &r, point_pos - exponent);
  wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);

  /* If the low part of the mantissa has bits set we cannot represent
     the value.  */
  if (w.ulow () != 0)
    return false;
  /* We have rejected the lower HOST_WIDE_INT, so update our
     understanding of how many bits lie in the mantissa and
     look only at the high HOST_WIDE_INT.  */
  mantissa = w.elt (1);
  point_pos -= HOST_BITS_PER_WIDE_INT;

  /* We can only represent values with a mantissa of the form 1.xxxx.  */
  mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
  if ((mantissa & mask) != 0)
    return false;

  /* Having filtered unrepresentable values, we may now remove all
     but the highest 5 bits.  */
  mantissa >>= point_pos - 5;

  /* We cannot represent the value 0.0, so reject it.  This is handled
     elsewhere.  */
  if (mantissa == 0)
    return false;

  /* Then, as bit 4 is always set, we can mask it off, leaving
     the mantissa in the range [0, 15].  */
  mantissa &= ~(1 << 4);
  gcc_assert (mantissa <= 15);

  /* GCC internally does not use IEEE754-like encoding (where normalized
     significands are in the range [1, 2).  GCC uses [0.5, 1) (see real.c).
     Our mantissa values are shifted 4 places to the left relative to
     normalized IEEE754 so we must modify the exponent returned by REAL_EXP
     by 5 places to correct for GCC's representation.  */
  exponent = 5 - exponent;

  return (exponent >= 0 && exponent <= 7);
}

/* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
   immediate with a CONST_VECTOR of MODE and WIDTH.  WHICH selects whether to
   output MOVI/MVNI, ORR or BIC immediate.  */
char*
aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
				   enum simd_immediate_check which)
{
  bool is_valid;
  static char templ[40];
  const char *mnemonic;
  const char *shift_op;
  unsigned int lane_count = 0;
  char element_char;

  struct simd_immediate_info info;

  /* This will return true to show const_vector is legal for use as either
     a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
     It will also update INFO to show how the immediate should be generated.
     WHICH selects whether to check for MOVI/MVNI, ORR or BIC.  */
  is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
  gcc_assert (is_valid);

  element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
  lane_count = width / GET_MODE_BITSIZE (info.elt_mode);

  if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
    {
      gcc_assert (info.insn == simd_immediate_info::MOV
		  && info.u.mov.shift == 0);
      /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
	 move immediate path.  */
      if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
	info.u.mov.value = GEN_INT (0);
      else
	{
	  const unsigned int buf_size = 20;
	  char float_buf[buf_size] = {'\0'};
	  real_to_decimal_for_mode (float_buf,
				    CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
				    buf_size, buf_size, 1, info.elt_mode);

	  if (lane_count == 1)
	    snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
	  else
	    snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
		      lane_count, element_char, float_buf);
	  return templ;
	}
    }

  gcc_assert (CONST_INT_P (info.u.mov.value));

  if (which == AARCH64_CHECK_MOV)
    {
      mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
      shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
		  ? "msl" : "lsl");
      if (lane_count == 1)
	snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
		  mnemonic, UINTVAL (info.u.mov.value));
      else if (info.u.mov.shift)
	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
		  HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
		  element_char, UINTVAL (info.u.mov.value), shift_op,
		  info.u.mov.shift);
      else
	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
		  HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
		  element_char, UINTVAL (info.u.mov.value));
    }
  else
    {
      /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR.  */
      mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
      if (info.u.mov.shift)
	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
		  HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
		  element_char, UINTVAL (info.u.mov.value), "lsl",
		  info.u.mov.shift);
      else
	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
		  HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
		  element_char, UINTVAL (info.u.mov.value));
    }
  return templ;
}

char*
aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
{
  /* If a floating point number was passed and we desire to use it in an
     integer mode do the conversion to integer.  */
  if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
    {
      unsigned HOST_WIDE_INT ival;
      if (!aarch64_reinterpret_float_as_int (immediate, &ival))
	  gcc_unreachable ();
      immediate = gen_int_mode (ival, mode);
    }

  machine_mode vmode;
  /* use a 64 bit mode for everything except for DI/DF mode, where we use
     a 128 bit vector mode.  */
  int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;

  vmode = aarch64_simd_container_mode (mode, width);
  rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
  return aarch64_output_simd_mov_immediate (v_op, width);
}

/* Return the output string to use for moving immediate CONST_VECTOR
   into an SVE register.  */

char *
aarch64_output_sve_mov_immediate (rtx const_vector)
{
  static char templ[40];
  struct simd_immediate_info info;
  char element_char;

  bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
  gcc_assert (is_valid);

  element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));

  machine_mode vec_mode = GET_MODE (const_vector);
  if (aarch64_sve_pred_mode_p (vec_mode))
    {
      static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
      if (info.insn == simd_immediate_info::MOV)
	{
	  gcc_assert (info.u.mov.value == const0_rtx);
	  snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
	}
      else
	{
	  gcc_assert (info.insn == simd_immediate_info::PTRUE);
	  unsigned int total_bytes;
	  if (info.u.pattern == AARCH64_SV_ALL
	      && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
	    snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
		      total_bytes / GET_MODE_SIZE (info.elt_mode));
	  else
	    snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
		      svpattern_token (info.u.pattern));
	}
      return buf;
    }

  if (info.insn == simd_immediate_info::INDEX)
    {
      snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
		HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
		element_char, INTVAL (info.u.index.base),
		INTVAL (info.u.index.step));
      return templ;
    }

  if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
    {
      if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
	info.u.mov.value = GEN_INT (0);
      else
	{
	  const int buf_size = 20;
	  char float_buf[buf_size] = {};
	  real_to_decimal_for_mode (float_buf,
				    CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
				    buf_size, buf_size, 1, info.elt_mode);

	  snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
		    element_char, float_buf);
	  return templ;
	}
    }

  snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
	    element_char, INTVAL (info.u.mov.value));
  return templ;
}

/* Return the asm template for a PTRUES.  CONST_UNSPEC is the
   aarch64_sve_ptrue_svpattern_immediate that describes the predicate
   pattern.  */

char *
aarch64_output_sve_ptrues (rtx const_unspec)
{
  static char templ[40];

  struct simd_immediate_info info;
  bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
  gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);

  char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
  snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
	    svpattern_token (info.u.pattern));
  return templ;
}

/* Split operands into moves from op[1] + op[2] into op[0].  */

void
aarch64_split_combinev16qi (rtx operands[3])
{
  unsigned int dest = REGNO (operands[0]);
  unsigned int src1 = REGNO (operands[1]);
  unsigned int src2 = REGNO (operands[2]);
  machine_mode halfmode = GET_MODE (operands[1]);
  unsigned int halfregs = REG_NREGS (operands[1]);
  rtx destlo, desthi;

  gcc_assert (halfmode == V16QImode);

  if (src1 == dest && src2 == dest + halfregs)
    {
      /* No-op move.  Can't split to nothing; emit something.  */
      emit_note (NOTE_INSN_DELETED);
      return;
    }

  /* Preserve register attributes for variable tracking.  */
  destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
  desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
			       GET_MODE_SIZE (halfmode));

  /* Special case of reversed high/low parts.  */
  if (reg_overlap_mentioned_p (operands[2], destlo)
      && reg_overlap_mentioned_p (operands[1], desthi))
    {
      emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
      emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
      emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
    }
  else if (!reg_overlap_mentioned_p (operands[2], destlo))
    {
      /* Try to avoid unnecessary moves if part of the result
	 is in the right place already.  */
      if (src1 != dest)
	emit_move_insn (destlo, operands[1]);
      if (src2 != dest + halfregs)
	emit_move_insn (desthi, operands[2]);
    }
  else
    {
      if (src2 != dest + halfregs)
	emit_move_insn (desthi, operands[2]);
      if (src1 != dest)
	emit_move_insn (destlo, operands[1]);
    }
}
18958 struct expand_vec_perm_d
18960 rtx target
, op0
, op1
;
18961 vec_perm_indices perm
;
18962 machine_mode vmode
;
18963 unsigned int vec_flags
;

/* Generate a variable permutation.  */

static void
aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
{
  machine_mode vmode = GET_MODE (target);
  bool one_vector_p = rtx_equal_p (op0, op1);

  gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
  gcc_checking_assert (GET_MODE (op0) == vmode);
  gcc_checking_assert (GET_MODE (op1) == vmode);
  gcc_checking_assert (GET_MODE (sel) == vmode);
  gcc_checking_assert (TARGET_SIMD);

  if (one_vector_p)
    {
      if (vmode == V8QImode)
	{
	  /* Expand the argument to a V16QI mode by duplicating it.  */
	  rtx pair = gen_reg_rtx (V16QImode);
	  emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
	  emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
	}
      else
	{
	  emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
	}
    }
  else
    {
      rtx pair;

      if (vmode == V8QImode)
	{
	  pair = gen_reg_rtx (V16QImode);
	  emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
	  emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
	}
      else
	{
	  pair = gen_reg_rtx (OImode);
	  emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
	  emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
	}
    }
}

/* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
   NELT is the number of elements in the vector.  */

void
aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
			 unsigned int nelt)
{
  machine_mode vmode = GET_MODE (target);
  bool one_vector_p = rtx_equal_p (op0, op1);
  rtx mask;

  /* The TBL instruction does not use a modulo index, so we must take care
     of that ourselves.  */
  mask = aarch64_simd_gen_const_vector_dup (vmode,
					    one_vector_p ? nelt - 1 : 2 * nelt - 1);
  sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);

  /* For big-endian, we also need to reverse the index within the vector
     (but not which vector).  */
  if (BYTES_BIG_ENDIAN)
    {
      /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
      if (!one_vector_p)
	mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
      sel = expand_simple_binop (vmode, XOR, sel, mask,
				 NULL, 0, OPTAB_LIB_WIDEN);
    }
  aarch64_expand_vec_perm_1 (target, op0, op1, sel);
}
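
/* Worked example: for a two-vector V16QI permutation the mask built above
   is a vector of 31s, so a selector byte of 33 becomes 33 & 31 = 1 before
   the TBL, giving the wrap-around behaviour that vec_perm requires; on
   big-endian the extra XOR with 15 then mirrors the index within each
   input vector.  */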

/* Generate (set TARGET (unspec [OP0 OP1] CODE)).  */

static void
emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
{
  emit_insn (gen_rtx_SET (target,
			  gen_rtx_UNSPEC (GET_MODE (target),
					  gen_rtvec (2, op0, op1), code)));
}

/* Expand an SVE vec_perm with the given operands.  */

void
aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
{
  machine_mode data_mode = GET_MODE (target);
  machine_mode sel_mode = GET_MODE (sel);
  /* Enforced by the pattern condition.  */
  int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();

  /* Note: vec_perm indices are supposed to wrap when they go beyond the
     size of the two value vectors, i.e. the upper bits of the indices
     are effectively ignored.  SVE TBL instead produces 0 for any
     out-of-range indices, so we need to modulo all the vec_perm indices
     to ensure they are all in range.  */
  rtx sel_reg = force_reg (sel_mode, sel);

  /* Check if the sel only references the first values vector.  */
  if (GET_CODE (sel) == CONST_VECTOR
      && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
    {
      emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
      return;
    }

  /* Check if the two values vectors are the same.  */
  if (rtx_equal_p (op0, op1))
    {
      rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
      rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
					 NULL, 0, OPTAB_DIRECT);
      emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
      return;
    }

  /* Run TBL on each value vector and combine the results.  */

  rtx res0 = gen_reg_rtx (data_mode);
  rtx res1 = gen_reg_rtx (data_mode);
  rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
  if (GET_CODE (sel) != CONST_VECTOR
      || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
    {
      rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
						       2 * nunits - 1);
      sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
				     NULL, 0, OPTAB_DIRECT);
    }
  emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
  rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
				     NULL, 0, OPTAB_DIRECT);
  emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
  if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
    emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
  else
    emit_unspec2 (target, UNSPEC_IORF, res0, res1);
}
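
/* Worked example: with nunits == 4, vec_perm semantics treat selector value
   9 as 9 & 7 == 1, i.e. element 1 of the first input, whereas a raw SVE TBL
   would yield 0 for the out-of-range index; the AND with 2 * nunits - 1
   above restores the expected wrapping before the two TBL results are
   combined.  */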

/* Recognize patterns suitable for the TRN instructions.  */
static bool
aarch64_evpc_trn (struct expand_vec_perm_d *d)
{
  HOST_WIDE_INT odd;
  poly_uint64 nelt = d->perm.length ();
  rtx out, in0, in1, x;
  machine_mode vmode = d->vmode;

  if (GET_MODE_UNIT_SIZE (vmode) > 8)
    return false;

  /* Note that these are little-endian tests.
     We correct for big-endian later.  */
  if (!d->perm[0].is_constant (&odd)
      || (odd != 0 && odd != 1)
      || !d->perm.series_p (0, 2, odd, 2)
      || !d->perm.series_p (1, 2, nelt + odd, 2))
    return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  in0 = d->op0;
  in1 = d->op1;
  /* We don't need a big-endian lane correction for SVE; see the comment
     at the head of aarch64-sve.md for details.  */
  if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
    {
      x = in0, in0 = in1, in1 = x;
      odd = !odd;
    }
  out = d->target;

  emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
				      odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
  return true;
}

/* Recognize patterns suitable for the UZP instructions.  */
static bool
aarch64_evpc_uzp (struct expand_vec_perm_d *d)
{
  HOST_WIDE_INT odd;
  rtx out, in0, in1, x;
  machine_mode vmode = d->vmode;

  if (GET_MODE_UNIT_SIZE (vmode) > 8)
    return false;

  /* Note that these are little-endian tests.
     We correct for big-endian later.  */
  if (!d->perm[0].is_constant (&odd)
      || (odd != 0 && odd != 1)
      || !d->perm.series_p (0, 1, odd, 2))
    return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  in0 = d->op0;
  in1 = d->op1;
  /* We don't need a big-endian lane correction for SVE; see the comment
     at the head of aarch64-sve.md for details.  */
  if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
    {
      x = in0, in0 = in1, in1 = x;
      odd = !odd;
    }
  out = d->target;

  emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
				      odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
  return true;
}

/* Recognize patterns suitable for the ZIP instructions.  */
static bool
aarch64_evpc_zip (struct expand_vec_perm_d *d)
{
  unsigned int high;
  poly_uint64 nelt = d->perm.length ();
  rtx out, in0, in1, x;
  machine_mode vmode = d->vmode;

  if (GET_MODE_UNIT_SIZE (vmode) > 8)
    return false;

  /* Note that these are little-endian tests.
     We correct for big-endian later.  */
  poly_uint64 first = d->perm[0];
  if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
      || !d->perm.series_p (0, 2, first, 1)
      || !d->perm.series_p (1, 2, first + nelt, 1))
    return false;
  high = maybe_ne (first, 0U);

  /* Success!  */
  if (d->testing_p)
    return true;

  in0 = d->op0;
  in1 = d->op1;
  /* We don't need a big-endian lane correction for SVE; see the comment
     at the head of aarch64-sve.md for details.  */
  if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
    {
      x = in0, in0 = in1, in1 = x;
      high = !high;
    }
  out = d->target;

  emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
				      high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
  return true;
}

/* Recognize patterns for the EXT insn.  */

static bool
aarch64_evpc_ext (struct expand_vec_perm_d *d)
{
  HOST_WIDE_INT location;
  rtx offset;

  /* The first element always refers to the first vector.
     Check if the extracted indices are increasing by one.  */
  if (d->vec_flags == VEC_SVE_PRED
      || !d->perm[0].is_constant (&location)
      || !d->perm.series_p (0, 1, location, 1))
    return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  /* The case where (location == 0) is a no-op for both big- and little-endian,
     and is removed by the mid-end at optimization levels -O1 and higher.

     We don't need a big-endian lane correction for SVE; see the comment
     at the head of aarch64-sve.md for details.  */
  if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
    {
      /* After setup, we want the high elements of the first vector (stored
	 at the LSB end of the register), and the low elements of the second
	 vector (stored at the MSB end of the register).  So swap.  */
      std::swap (d->op0, d->op1);
      /* location != 0 (above), so safe to assume (nelt - location) < nelt.
	 to_constant () is safe since this is restricted to Advanced SIMD
	 vectors.  */
      location = d->perm.length ().to_constant () - location;
    }

  offset = GEN_INT (location);
  emit_set_insn (d->target,
		 gen_rtx_UNSPEC (d->vmode,
				 gen_rtvec (3, d->op0, d->op1, offset),
				 UNSPEC_EXT));
  return true;
}

/* Recognize patterns for the REV{64,32,16} insns, which reverse elements
   within each 64-bit, 32-bit or 16-bit granule.  */

static bool
aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
{
  HOST_WIDE_INT diff;
  unsigned int i, size, unspec;
  machine_mode pred_mode;

  if (d->vec_flags == VEC_SVE_PRED
      || !d->one_vector_p
      || !d->perm[0].is_constant (&diff))
    return false;

  size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
  if (size == 8)
    {
      unspec = UNSPEC_REV64;
      pred_mode = VNx2BImode;
    }
  else if (size == 4)
    {
      unspec = UNSPEC_REV32;
      pred_mode = VNx4BImode;
    }
  else if (size == 2)
    {
      unspec = UNSPEC_REV16;
      pred_mode = VNx8BImode;
    }
  else
    return false;

  unsigned int step = diff + 1;
  for (i = 0; i < step; ++i)
    if (!d->perm.series_p (i, step, diff - i, step))
      return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  if (d->vec_flags == VEC_SVE_DATA)
    {
      machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
      rtx target = gen_reg_rtx (int_mode);
      if (BYTES_BIG_ENDIAN)
	/* The act of taking a subreg between INT_MODE and d->vmode
	   is itself a reversing operation on big-endian targets;
	   see the comment at the head of aarch64-sve.md for details.
	   First reinterpret OP0 as INT_MODE without using a subreg
	   and without changing the contents.  */
	emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
      else
	{
	  /* For SVE we use REV[BHW] unspecs derived from the element size
	     of v->mode and vector modes whose elements have SIZE bytes.
	     This ensures that the vector modes match the predicate modes.  */
	  int unspec = aarch64_sve_rev_unspec (d->vmode);
	  rtx pred = aarch64_ptrue_reg (pred_mode);
	  emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
				       gen_lowpart (int_mode, d->op0)));
	}
      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
      return true;
    }
  rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
  emit_set_insn (d->target, src);
  return true;
}

/* Recognize patterns for the REV insn, which reverses elements within
   a full vector.  */

static bool
aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
{
  poly_uint64 nelt = d->perm.length ();

  if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
    return false;

  if (!d->perm.series_p (0, 1, nelt - 1, -1))
    return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
  emit_set_insn (d->target, src);
  return true;
}

static bool
aarch64_evpc_dup (struct expand_vec_perm_d *d)
{
  rtx out = d->target;
  rtx in0;
  HOST_WIDE_INT elt;
  machine_mode vmode = d->vmode;
  rtx lane;

  if (d->vec_flags == VEC_SVE_PRED
      || d->perm.encoding ().encoded_nelts () != 1
      || !d->perm[0].is_constant (&elt))
    return false;

  if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
    return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  /* The generic preparation in aarch64_expand_vec_perm_const_1
     swaps the operand order and the permute indices if it finds
     d->perm[0] to be in the second operand.  Thus, we can always
     use d->op0 and need not do any extra arithmetic to get the
     correct lane number.  */
  in0 = d->op0;
  lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */

  rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
  rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
  emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
  return true;
}

static bool
aarch64_evpc_tbl (struct expand_vec_perm_d *d)
{
  rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
  machine_mode vmode = d->vmode;

  /* Make sure that the indices are constant.  */
  unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
  for (unsigned int i = 0; i < encoded_nelts; ++i)
    if (!d->perm[i].is_constant ())
      return false;

  if (d->testing_p)
    return true;

  /* Generic code will try constant permutation twice.  Once with the
     original mode and again with the elements lowered to QImode.
     So wait and don't do the selector expansion ourselves.  */
  if (vmode != V8QImode && vmode != V16QImode)
    return false;

  /* to_constant is safe since this routine is specific to Advanced SIMD
     vectors.  */
  unsigned int nelt = d->perm.length ().to_constant ();
  for (unsigned int i = 0; i < nelt; ++i)
    /* If big-endian and two vectors we end up with a weird mixed-endian
       mode on NEON.  Reverse the index within each word but not the word
       itself.  to_constant is safe because we checked is_constant above.  */
    rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
			? d->perm[i].to_constant () ^ (nelt - 1)
			: d->perm[i].to_constant ());

  sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
  sel = force_reg (vmode, sel);

  aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
  return true;
}

/* Try to implement D using an SVE TBL instruction.  */

static bool
aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
{
  unsigned HOST_WIDE_INT nelt;

  /* Permuting two variable-length vectors could overflow the
     index range.  */
  if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
    return false;

  if (d->testing_p)
    return true;

  machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
  rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
  if (d->one_vector_p)
    emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
  else
    aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
  return true;
}

/* Try to implement D using SVE SEL instruction.  */

static bool
aarch64_evpc_sel (struct expand_vec_perm_d *d)
{
  machine_mode vmode = d->vmode;
  int unit_size = GET_MODE_UNIT_SIZE (vmode);

  if (d->vec_flags != VEC_SVE_DATA
      || unit_size > 8)
    return false;

  int n_patterns = d->perm.encoding ().npatterns ();
  poly_int64 vec_len = d->perm.length ();

  for (int i = 0; i < n_patterns; ++i)
    if (!known_eq (d->perm[i], i)
	&& !known_eq (d->perm[i], vec_len + i))
      return false;

  for (int i = n_patterns; i < n_patterns * 2; i++)
    if (!d->perm.series_p (i, n_patterns, i, n_patterns)
	&& !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
      return false;

  if (d->testing_p)
    return true;

  machine_mode pred_mode = aarch64_sve_pred_mode (vmode);

  /* Build a predicate that is true when op0 elements should be used.  */
  rtx_vector_builder builder (pred_mode, n_patterns, 2);
  for (int i = 0; i < n_patterns * 2; i++)
    {
      rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
					  : CONST0_RTX (BImode);
      builder.quick_push (elem);
    }

  rtx const_vec = builder.build ();
  rtx pred = force_reg (pred_mode, const_vec);
  /* TARGET = PRED ? OP0 : OP1.  */
  emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
  return true;
}
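
/* Illustrative example: a two-pattern encoding such as
   { 0, vec_len + 1, 2, vec_len + 3, ... } selects the even elements from
   op0 and the odd elements from op1; the predicate built above is then the
   repeating pair { 1, 0 }, and the SEL keeps op0 lanes where the predicate
   is true and op1 lanes elsewhere.  */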

static bool
aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
  /* The pattern matching functions above are written to look for a small
     number to begin the sequence (0, 1, N/2).  If we begin with an index
     from the second operand, we can swap the operands.  */
  poly_int64 nelt = d->perm.length ();
  if (known_ge (d->perm[0], nelt))
    {
      d->perm.rotate_inputs (1);
      std::swap (d->op0, d->op1);
    }

  if ((d->vec_flags == VEC_ADVSIMD
       || d->vec_flags == VEC_SVE_DATA
       || d->vec_flags == VEC_SVE_PRED)
      && known_gt (nelt, 1))
    {
      if (aarch64_evpc_rev_local (d))
	return true;
      else if (aarch64_evpc_rev_global (d))
	return true;
      else if (aarch64_evpc_ext (d))
	return true;
      else if (aarch64_evpc_dup (d))
	return true;
      else if (aarch64_evpc_zip (d))
	return true;
      else if (aarch64_evpc_uzp (d))
	return true;
      else if (aarch64_evpc_trn (d))
	return true;
      else if (aarch64_evpc_sel (d))
	return true;
      if (d->vec_flags == VEC_SVE_DATA)
	return aarch64_evpc_sve_tbl (d);
      else if (d->vec_flags == VEC_ADVSIMD)
	return aarch64_evpc_tbl (d);
    }
  return false;
}

/* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */

static bool
aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
				  rtx op1, const vec_perm_indices &sel)
{
  struct expand_vec_perm_d d;

  /* Check whether the mask can be applied to a single vector.  */
  if (sel.ninputs () == 1
      || (op0 && rtx_equal_p (op0, op1)))
    d.one_vector_p = true;
  else if (sel.all_from_input_p (0))
    {
      d.one_vector_p = true;
      op1 = op0;
    }
  else if (sel.all_from_input_p (1))
    {
      d.one_vector_p = true;
      op0 = op1;
    }
  else
    d.one_vector_p = false;

  d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
		     sel.nelts_per_input ());
  d.vmode = vmode;
  d.vec_flags = aarch64_classify_vector_mode (d.vmode);
  d.target = target;
  d.op0 = op0;
  d.op1 = op1;
  d.testing_p = !target;

  if (!d.testing_p)
    return aarch64_expand_vec_perm_const_1 (&d);

  rtx_insn *last = get_last_insn ();
  bool ret = aarch64_expand_vec_perm_const_1 (&d);
  gcc_assert (last == get_last_insn ());

  return ret;
}

/* Generate a byte permute mask for a register of mode MODE,
   which has NUNITS units.  */

rtx
aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
{
  /* We have to reverse each vector because we don't have
     a permuted load that can reverse-load according to ABI rules.  */
  rtx mask;
  rtvec v = rtvec_alloc (16);
  unsigned int i, j;
  unsigned int usize = GET_MODE_UNIT_SIZE (mode);

  gcc_assert (BYTES_BIG_ENDIAN);
  gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));

  for (i = 0; i < nunits; i++)
    for (j = 0; j < usize; j++)
      RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
  mask = gen_rtx_CONST_VECTOR (V16QImode, v);
  return force_reg (V16QImode, mask);
}
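
/* Worked example: for V8HImode (nunits == 8, usize == 2) the selector built
   above is { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }, i.e.
   byte positions are mirrored within each 16-bit unit.  */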

/* Expand an SVE integer comparison using the SVE equivalent of:

     (set TARGET (CODE OP0 OP1)).  */

void
aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
{
  machine_mode pred_mode = GET_MODE (target);
  machine_mode data_mode = GET_MODE (op0);
  rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
				      op0, op1);
  if (!rtx_equal_p (target, res))
    emit_move_insn (target, res);
}

/* Return the UNSPEC_COND_* code for comparison CODE.  */

static unsigned int
aarch64_unspec_cond_code (rtx_code code)
{
  switch (code)
    {
    case NE:
      return UNSPEC_COND_FCMNE;
    case EQ:
      return UNSPEC_COND_FCMEQ;
    case LT:
      return UNSPEC_COND_FCMLT;
    case GT:
      return UNSPEC_COND_FCMGT;
    case LE:
      return UNSPEC_COND_FCMLE;
    case GE:
      return UNSPEC_COND_FCMGE;
    case UNORDERED:
      return UNSPEC_COND_FCMUO;
    default:
      gcc_unreachable ();
    }
}

/* Emit:

      (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))

   where <X> is the operation associated with comparison CODE.
   KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */

static void
aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
			  bool known_ptrue_p, rtx op0, rtx op1)
{
  rtx flag = gen_int_mode (known_ptrue_p, SImode);
  rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
			       gen_rtvec (4, pred, flag, op0, op1),
			       aarch64_unspec_cond_code (code));
  emit_set_insn (target, unspec);
}

/* Emit the SVE equivalent of:

      (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
      (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
      (set TARGET (ior:PRED_MODE TMP1 TMP2))

   where <Xi> is the operation associated with comparison CODEi.
   KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */

static void
aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
			      rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
{
  machine_mode pred_mode = GET_MODE (pred);
  rtx tmp1 = gen_reg_rtx (pred_mode);
  aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
  rtx tmp2 = gen_reg_rtx (pred_mode);
  aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
  aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
}

/* Emit the SVE equivalent of:

      (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
      (set TARGET (not TMP))

   where <X> is the operation associated with comparison CODE.
   KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */

static void
aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
				 bool known_ptrue_p, rtx op0, rtx op1)
{
  machine_mode pred_mode = GET_MODE (pred);
  rtx tmp = gen_reg_rtx (pred_mode);
  aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
  aarch64_emit_unop (target, one_cmpl_optab, tmp);
}

/* Expand an SVE floating-point comparison using the SVE equivalent of:

     (set TARGET (CODE OP0 OP1))

   If CAN_INVERT_P is true, the caller can also handle inverted results;
   return true if the result is in fact inverted.  */

bool
aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
				  rtx op0, rtx op1, bool can_invert_p)
{
  machine_mode pred_mode = GET_MODE (target);
  machine_mode data_mode = GET_MODE (op0);

  rtx ptrue = aarch64_ptrue_reg (pred_mode);
  switch (code)
    {
    case UNORDERED:
      /* UNORDERED has no immediate form.  */
      op1 = force_reg (data_mode, op1);
      /* fall through */
    case LT:
    case LE:
    case GT:
    case GE:
    case EQ:
    case NE:
      {
	/* There is native support for the comparison.  */
	aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
	return false;
      }

    case LTGT:
      /* This is a trapping operation (LT or GT).  */
      aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
      return false;

    case UNEQ:
      if (!flag_trapping_math)
	{
	  /* This would trap for signaling NaNs.  */
	  op1 = force_reg (data_mode, op1);
	  aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
					ptrue, true, op0, op1);
	  return false;
	}
      /* fall through */
    case UNLT:
    case UNLE:
    case UNGT:
    case UNGE:
      if (flag_trapping_math)
	{
	  /* Work out which elements are ordered.  */
	  rtx ordered = gen_reg_rtx (pred_mode);
	  op1 = force_reg (data_mode, op1);
	  aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
					   ptrue, true, op0, op1);

	  /* Test the opposite condition for the ordered elements,
	     then invert the result.  */
	  if (code == UNEQ)
	    code = NE;
	  else
	    code = reverse_condition_maybe_unordered (code);
	  if (can_invert_p)
	    {
	      aarch64_emit_sve_fp_cond (target, code,
					ordered, false, op0, op1);
	      return true;
	    }
	  aarch64_emit_sve_invert_fp_cond (target, code,
					   ordered, false, op0, op1);
	  return false;
	}
      break;

    case ORDERED:
      /* ORDERED has no immediate form.  */
      op1 = force_reg (data_mode, op1);
      break;

    default:
      gcc_unreachable ();
    }

  /* There is native support for the inverse comparison.  */
  code = reverse_condition_maybe_unordered (code);
  if (can_invert_p)
    {
      aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
      return true;
    }
  aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
  return false;
}

/* Expand an SVE vcond pattern with operands OPS.  DATA_MODE is the mode
   of the data being selected and CMP_MODE is the mode of the values being
   compared.  */

void
aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
			  rtx *ops)
{
  machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
  rtx pred = gen_reg_rtx (pred_mode);
  if (FLOAT_MODE_P (cmp_mode))
    {
      if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
					    ops[4], ops[5], true))
	std::swap (ops[1], ops[2]);
    }
  else
    aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);

  if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
    ops[1] = force_reg (data_mode, ops[1]);
  /* The "false" value can only be zero if the "true" value is a constant.  */
  if (register_operand (ops[1], data_mode)
      || !aarch64_simd_reg_or_zero (ops[2], data_mode))
    ops[2] = force_reg (data_mode, ops[2]);

  rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
  emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
}

/* Implement TARGET_MODES_TIEABLE_P.  In principle we should always return
   true.  However due to issues with register allocation it is preferable
   to avoid tying integer scalar and FP scalar modes.  Executing integer
   operations in general registers is better than treating them as scalar
   vector operations.  This reduces latency and avoids redundant int<->FP
   moves.  So tie modes if they are either the same class, or vector modes
   with other vector modes, vector structs or any scalar mode.  */

static bool
aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
{
  if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
    return true;

  /* We specifically want to allow elements of "structure" modes to
     be tieable to the structure.  This more general condition allows
     other rarer situations too.  The reason we don't extend this to
     predicate modes is that there are no predicate structure modes
     nor any specific instructions for extracting part of a predicate
     register.  */
  if (aarch64_vector_data_mode_p (mode1)
      && aarch64_vector_data_mode_p (mode2))
    return true;

  /* Also allow any scalar modes with vectors.  */
  if (aarch64_vector_mode_supported_p (mode1)
      || aarch64_vector_mode_supported_p (mode2))
    return true;

  return false;
}

/* Return a new RTX holding the result of moving POINTER forward by
   AMOUNT bytes.  */

static rtx
aarch64_move_pointer (rtx pointer, poly_int64 amount)
{
  rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);

  return adjust_automodify_address (pointer, GET_MODE (pointer),
				    next, amount);
}

/* Return a new RTX holding the result of moving POINTER forward by the
   size of the mode it points to.  */

static rtx
aarch64_progress_pointer (rtx pointer)
{
  return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
}

/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
   MODE bytes.  */

static void
aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
					      machine_mode mode)
{
  rtx reg = gen_reg_rtx (mode);

  /* "Cast" the pointers to the correct mode.  */
  *src = adjust_address (*src, mode, 0);
  *dst = adjust_address (*dst, mode, 0);
  /* Emit the memcpy.  */
  emit_move_insn (reg, *src);
  emit_move_insn (*dst, reg);
  /* Move the pointers forward.  */
  *src = aarch64_progress_pointer (*src);
  *dst = aarch64_progress_pointer (*dst);
}

/* Expand cpymem, as if from a __builtin_memcpy.  Return true if
   we succeed, otherwise return false.  */

bool
aarch64_expand_cpymem (rtx *operands)
{
  int n, mode_bits;
  rtx dst = operands[0];
  rtx src = operands[1];
  rtx base;
  machine_mode cur_mode = BLKmode, next_mode;
  bool speed_p = !optimize_function_for_size_p (cfun);

  /* When optimizing for size, give a better estimate of the length of a
     memcpy call, but use the default otherwise.  Moves larger than 8 bytes
     will always require an even number of instructions to do now.  And each
     operation requires both a load+store, so divide the max number by 2.  */
  int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;

  /* We can't do anything smart if the amount to copy is not constant.  */
  if (!CONST_INT_P (operands[2]))
    return false;

  n = INTVAL (operands[2]);

  /* Try to keep the number of instructions low.  For all cases we will do at
     most two moves for the residual amount, since we'll always overlap the
     remainder.  */
  if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
    return false;

  base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
  dst = adjust_automodify_address (dst, VOIDmode, base, 0);

  base = copy_to_mode_reg (Pmode, XEXP (src, 0));
  src = adjust_automodify_address (src, VOIDmode, base, 0);

  /* Convert n to bits to make the rest of the code simpler.  */
  n = n * BITS_PER_UNIT;

  /* Maximum amount to copy in one go.  The AArch64 back-end has integer modes
     larger than TImode, but we should not use them for loads/stores here.  */
  const int copy_limit = GET_MODE_BITSIZE (TImode);

  while (n > 0)
    {
      /* Find the largest mode in which to do the copy in without over reading
	 or writing.  */
      opt_scalar_int_mode mode_iter;
      FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
	if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
	  cur_mode = mode_iter.require ();

      gcc_assert (cur_mode != BLKmode);

      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
      aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);

      n -= mode_bits;

      /* Do certain trailing copies as overlapping if it's going to be
	 cheaper.  i.e. less instructions to do so.  For instance doing a 15
	 byte copy it's more efficient to do two overlapping 8 byte copies than
	 8 + 6 + 1.  */
      if (n > 0 && n <= 8 * BITS_PER_UNIT)
	{
	  next_mode = smallest_mode_for_size (n, MODE_INT);
	  int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
	  src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
	  dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
	  n = n_bits;
	}
    }

  return true;
}
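
/* Worked example: a 15-byte copy is emitted as one 8-byte (DImode) move for
   bytes 0-7 followed by a second 8-byte move for bytes 7-14; the residual
   7 bytes are handled by stepping the pointers back one byte so the two
   moves overlap, rather than issuing separate 4-, 2- and 1-byte copies.  */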

/* Split a DImode store of a CONST_INT SRC to MEM DST as two
   SImode stores.  Handle the case when the constant has identical
   bottom and top halves.  This is beneficial when the two stores can be
   merged into an STP and we avoid synthesising potentially expensive
   immediates twice.  Return true if such a split is possible.  */

bool
aarch64_split_dimode_const_store (rtx dst, rtx src)
{
  rtx lo = gen_lowpart (SImode, src);
  rtx hi = gen_highpart_mode (SImode, DImode, src);

  bool size_p = optimize_function_for_size_p (cfun);

  if (!rtx_equal_p (lo, hi))
    return false;

  unsigned int orig_cost
    = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
  unsigned int lo_cost
    = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);

  /* We want to transform:
     MOV	x1, 49370
     MOVK	x1, 0x140, lsl 16
     MOVK	x1, 0xc0da, lsl 32
     MOVK	x1, 0x140, lsl 48
     STR	x1, [x0]
   into:
     MOV	w1, 49370
     MOVK	w1, 0x140, lsl 16
     STP	w1, w1, [x0]
   So we want to perform this only when we save two instructions
   or more.  When optimizing for size, however, accept any code size
   savings we can.  */
  if (size_p && orig_cost <= lo_cost)
    return false;

  if (!size_p
      && (orig_cost <= lo_cost + 1))
    return false;

  rtx mem_lo = adjust_address (dst, SImode, 0);
  if (!aarch64_mem_pair_operand (mem_lo, SImode))
    return false;

  rtx tmp_reg = gen_reg_rtx (SImode);
  aarch64_expand_mov_immediate (tmp_reg, lo);
  rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
  /* Don't emit an explicit store pair as this may not be always profitable.
     Let the sched-fusion logic decide whether to merge them.  */
  emit_move_insn (mem_lo, tmp_reg);
  emit_move_insn (mem_hi, tmp_reg);

  return true;
}

/* Generate RTL for a conditional branch with rtx comparison CODE in
   mode CC_MODE.  The destination of the unlikely conditional branch
   is LABEL_REF.  */

void
aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
			      rtx label_ref)
{
  rtx x;
  x = gen_rtx_fmt_ee (code, VOIDmode,
		      gen_rtx_REG (cc_mode, CC_REGNUM),
		      const0_rtx);

  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
			    gen_rtx_LABEL_REF (VOIDmode, label_ref),
			    pc_rtx);
  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
}

/* Generate DImode scratch registers for 128-bit (TImode) addition.

   OP1 represents the TImode destination operand 1
   OP2 represents the TImode destination operand 2
   LOW_DEST represents the low half (DImode) of TImode operand 0
   LOW_IN1 represents the low half (DImode) of TImode operand 1
   LOW_IN2 represents the low half (DImode) of TImode operand 2
   HIGH_DEST represents the high half (DImode) of TImode operand 0
   HIGH_IN1 represents the high half (DImode) of TImode operand 1
   HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */

void
aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
			    rtx *low_in1, rtx *low_in2,
			    rtx *high_dest, rtx *high_in1,
			    rtx *high_in2)
{
  *low_dest = gen_reg_rtx (DImode);
  *low_in1 = gen_lowpart (DImode, op1);
  *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
				  subreg_lowpart_offset (DImode, TImode));
  *high_dest = gen_reg_rtx (DImode);
  *high_in1 = gen_highpart (DImode, op1);
  *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
				   subreg_highpart_offset (DImode, TImode));
}
/* Generate DImode scratch registers for 128-bit (TImode) subtraction.

   This function differs from 'aarch64_addti_scratch_regs' in that
   OP1 can be an immediate constant (zero).  We must call
   subreg_highpart_offset with DImode and TImode arguments, otherwise
   VOIDmode will be used for the const_int which generates an internal
   error from subreg_size_highpart_offset which does not expect a size of zero.

   OP1 represents the TImode destination operand 1
   OP2 represents the TImode destination operand 2
   LOW_DEST represents the low half (DImode) of TImode operand 0
   LOW_IN1 represents the low half (DImode) of TImode operand 1
   LOW_IN2 represents the low half (DImode) of TImode operand 2
   HIGH_DEST represents the high half (DImode) of TImode operand 0
   HIGH_IN1 represents the high half (DImode) of TImode operand 1
   HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */

void
aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
			     rtx *low_in1, rtx *low_in2,
			     rtx *high_dest, rtx *high_in1,
			     rtx *high_in2)
{
  *low_dest = gen_reg_rtx (DImode);
  *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
				  subreg_lowpart_offset (DImode, TImode));
  *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
				  subreg_lowpart_offset (DImode, TImode));
  *high_dest = gen_reg_rtx (DImode);
  *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
				   subreg_highpart_offset (DImode, TImode));
  *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
				   subreg_highpart_offset (DImode, TImode));
}
/* Generate RTL for 128-bit (TImode) subtraction with overflow.

   OP0 represents the TImode destination operand 0
   LOW_DEST represents the low half (DImode) of TImode operand 0
   LOW_IN1 represents the low half (DImode) of TImode operand 1
   LOW_IN2 represents the low half (DImode) of TImode operand 2
   HIGH_DEST represents the high half (DImode) of TImode operand 0
   HIGH_IN1 represents the high half (DImode) of TImode operand 1
   HIGH_IN2 represents the high half (DImode) of TImode operand 2
   UNSIGNED_P is true if the operation is being performed on unsigned
   values.  */

void
aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
		       rtx low_in2, rtx high_dest, rtx high_in1,
		       rtx high_in2, bool unsigned_p)
{
  if (low_in2 == const0_rtx)
    {
      low_dest = low_in1;
      high_in2 = force_reg (DImode, high_in2);
      if (unsigned_p)
	emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
      else
	emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
    }
  else
    {
      if (CONST_INT_P (low_in2))
	{
	  high_in2 = force_reg (DImode, high_in2);
	  emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
					      GEN_INT (-INTVAL (low_in2))));
	}
      else
	emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));

      if (unsigned_p)
	emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
      else
	emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
    }

  emit_move_insn (gen_lowpart (DImode, op0), low_dest);
  emit_move_insn (gen_highpart (DImode, op0), high_dest);
}
/* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */

static unsigned HOST_WIDE_INT
aarch64_asan_shadow_offset (void)
{
  if (TARGET_ILP32)
    return (HOST_WIDE_INT_1 << 29);
  else
    return (HOST_WIDE_INT_1 << 36);
}
static rtx
aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
			int code, tree treeop0, tree treeop1)
{
  machine_mode op_mode, cmp_mode, cc_mode = CCmode;
  rtx op0, op1;
  int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
  insn_code icode;
  struct expand_operand ops[4];

  start_sequence ();
  expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);

  op_mode = GET_MODE (op0);
  if (op_mode == VOIDmode)
    op_mode = GET_MODE (op1);

  switch (op_mode)
    {
    case E_QImode:
    case E_HImode:
    case E_SImode:
      cmp_mode = SImode;
      icode = CODE_FOR_cmpsi;
      break;

    case E_DImode:
      cmp_mode = DImode;
      icode = CODE_FOR_cmpdi;
      break;

    case E_SFmode:
      cmp_mode = SFmode;
      cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
      icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
      break;

    case E_DFmode:
      cmp_mode = DFmode;
      cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
      icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
      break;

    default:
      end_sequence ();
      return NULL_RTX;
    }

  op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
  op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
  if (!op0 || !op1)
    {
      end_sequence ();
      return NULL_RTX;
    }
  *prep_seq = get_insns ();
  end_sequence ();

  create_fixed_operand (&ops[0], op0);
  create_fixed_operand (&ops[1], op1);

  start_sequence ();
  if (!maybe_expand_insn (icode, 2, ops))
    {
      end_sequence ();
      return NULL_RTX;
    }
  *gen_seq = get_insns ();
  end_sequence ();

  return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
			 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
}
static rtx
aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
		       int cmp_code, tree treeop0, tree treeop1, int bit_code)
{
  rtx op0, op1, target;
  machine_mode op_mode, cmp_mode, cc_mode = CCmode;
  int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
  insn_code icode;
  struct expand_operand ops[6];
  int aarch64_cond;

  push_to_sequence (*prep_seq);
  expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);

  op_mode = GET_MODE (op0);
  if (op_mode == VOIDmode)
    op_mode = GET_MODE (op1);

  switch (op_mode)
    {
    case E_QImode:
    case E_HImode:
    case E_SImode:
      cmp_mode = SImode;
      break;

    case E_DImode:
      cmp_mode = DImode;
      break;

    case E_SFmode:
      cmp_mode = SFmode;
      cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
      break;

    case E_DFmode:
      cmp_mode = DFmode;
      cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
      break;

    default:
      end_sequence ();
      return NULL_RTX;
    }

  icode = code_for_ccmp (cc_mode, cmp_mode);

  op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
  op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
  if (!op0 || !op1)
    {
      end_sequence ();
      return NULL_RTX;
    }
  *prep_seq = get_insns ();
  end_sequence ();

  target = gen_rtx_REG (cc_mode, CC_REGNUM);
  aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);

  if (bit_code != AND)
    {
      /* Treat the ccmp patterns as canonical and use them where possible,
	 but fall back to ccmp_rev patterns if there's no other option.  */
      rtx_code prev_code = GET_CODE (prev);
      machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
      if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
	  && !(prev_code == EQ
	       || prev_code == NE
	       || prev_code == ORDERED
	       || prev_code == UNORDERED))
	icode = code_for_ccmp_rev (cc_mode, cmp_mode);
      else
	{
	  rtx_code code = reverse_condition (prev_code);
	  prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
	}
      aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
    }

  create_fixed_operand (&ops[0], XEXP (prev, 0));
  create_fixed_operand (&ops[1], target);
  create_fixed_operand (&ops[2], op0);
  create_fixed_operand (&ops[3], op1);
  create_fixed_operand (&ops[4], prev);
  create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));

  push_to_sequence (*gen_seq);
  if (!maybe_expand_insn (icode, 6, ops))
    {
      end_sequence ();
      return NULL_RTX;
    }

  *gen_seq = get_insns ();
  end_sequence ();

  return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
}
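/* As an illustration (not taken from the surrounding code), the two hooks
   above let a condition such as "a == 0 && b == 3" expand into a
   conditional-compare chain roughly like:

	cmp	w0, #0
	ccmp	w1, #3, #0, eq
	b.eq	.Ltaken

   where the third CCMP operand is the NZCV value substituted when the EQ
   condition from the first compare fails, so the final branch falls
   through in that case.  */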
#undef TARGET_GEN_CCMP_FIRST
#define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first

#undef TARGET_GEN_CCMP_NEXT
#define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next

/* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
   instruction fusion of some sort.  */

static bool
aarch64_macro_fusion_p (void)
{
  return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
}
/* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
   should be kept together during scheduling.  */

static bool
aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
{
  rtx prev_set = single_set (prev);
  rtx curr_set = single_set (curr);
  /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
  bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);

  if (!aarch64_macro_fusion_p ())
    return false;

  if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
    {
      /* We are trying to match:
	 prev (mov)  == (set (reg r0) (const_int imm16))
	 curr (movk) == (set (zero_extract (reg r0)
					   (const_int 16)
					   (const_int 16))
			     (const_int imm16_1))  */

      rtx set_dest = SET_DEST (curr_set);

      if (GET_CODE (set_dest) == ZERO_EXTRACT
	  && CONST_INT_P (SET_SRC (curr_set))
	  && CONST_INT_P (SET_SRC (prev_set))
	  && CONST_INT_P (XEXP (set_dest, 2))
	  && INTVAL (XEXP (set_dest, 2)) == 16
	  && REG_P (XEXP (set_dest, 0))
	  && REG_P (SET_DEST (prev_set))
	  && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
	return true;
    }

  if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
    {
      /* We're trying to match:
	 prev (adrp) == (set (reg r1)
			     (high (symbol_ref ("SYM"))))
	 curr (add)  == (set (reg r0)
			     (lo_sum (reg r1)
				     (symbol_ref ("SYM"))))
	 Note that r0 need not necessarily be the same as r1, especially
	 during pre-regalloc scheduling.  */

      if (satisfies_constraint_Ush (SET_SRC (prev_set))
	  && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
	{
	  if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
	      && REG_P (XEXP (SET_SRC (curr_set), 0))
	      && REGNO (XEXP (SET_SRC (curr_set), 0))
		 == REGNO (SET_DEST (prev_set))
	      && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
			      XEXP (SET_SRC (curr_set), 1)))
	    return true;
	}
    }

  if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
    {
      /* We're trying to match:
	 prev (movk) == (set (zero_extract (reg r0)
					   (const_int 16)
					   (const_int 32))
			     (const_int imm16_1))
	 curr (movk) == (set (zero_extract (reg r0)
					   (const_int 16)
					   (const_int 48))
			     (const_int imm16_2))  */

      if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
	  && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
	  && REG_P (XEXP (SET_DEST (prev_set), 0))
	  && REG_P (XEXP (SET_DEST (curr_set), 0))
	  && REGNO (XEXP (SET_DEST (prev_set), 0))
	     == REGNO (XEXP (SET_DEST (curr_set), 0))
	  && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
	  && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
	  && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
	  && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
	  && CONST_INT_P (SET_SRC (prev_set))
	  && CONST_INT_P (SET_SRC (curr_set)))
	return true;
    }

  if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
    {
      /* We're trying to match:
	 prev (adrp) == (set (reg r0)
			     (high (symbol_ref ("SYM"))))
	 curr (ldr)  == (set (reg r1)
			     (mem (lo_sum (reg r0)
					  (symbol_ref ("SYM")))))
	 or
	 curr (ldr)  == (set (reg r1)
			     (zero_extend (mem
					   (lo_sum (reg r0)
						   (symbol_ref ("SYM"))))))  */
      if (satisfies_constraint_Ush (SET_SRC (prev_set))
	  && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
	{
	  rtx curr_src = SET_SRC (curr_set);

	  if (GET_CODE (curr_src) == ZERO_EXTEND)
	    curr_src = XEXP (curr_src, 0);

	  if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
	      && REG_P (XEXP (XEXP (curr_src, 0), 0))
	      && REGNO (XEXP (XEXP (curr_src, 0), 0))
		 == REGNO (SET_DEST (prev_set))
	      && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
			      XEXP (SET_SRC (prev_set), 0)))
	    return true;
	}
    }

  /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch.  */
  if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
      && prev_set && curr_set && any_condjump_p (curr)
      && GET_CODE (SET_SRC (prev_set)) == COMPARE
      && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
      && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
    return true;

  /* Fuse flag-setting ALU instructions and conditional branch.  */
  if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
      && any_condjump_p (curr))
    {
      unsigned int condreg1, condreg2;
      rtx cc_reg_1;
      aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
      cc_reg_1 = gen_rtx_REG (CCmode, condreg1);

      if (reg_referenced_p (cc_reg_1, PATTERN (curr))
	  && modified_in_p (cc_reg_1, prev))
	{
	  enum attr_type prev_type = get_attr_type (prev);

	  /* FIXME: this misses some which are considered simple arithmetic
	     instructions for ThunderX.  Simple shifts are missed here.  */
	  if (prev_type == TYPE_ALUS_SREG
	      || prev_type == TYPE_ALUS_IMM
	      || prev_type == TYPE_LOGICS_REG
	      || prev_type == TYPE_LOGICS_IMM)
	    return true;
	}
    }

  /* Fuse ALU instructions and CBZ/CBNZ.  */
  if (prev_set
      && curr_set
      && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
      && any_condjump_p (curr))
    {
      /* We're trying to match:
	 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
	 curr (cbz)      == (set (pc) (if_then_else (eq/ne) (r0)
							    (const_int 0))
						    (label_ref ("SYM"))
						    (pc))  */
      if (SET_DEST (curr_set) == (pc_rtx)
	  && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
	  && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
	  && REG_P (SET_DEST (prev_set))
	  && REGNO (SET_DEST (prev_set))
	     == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
	{
	  /* Fuse ALU operations followed by conditional branch instruction.  */
	  switch (get_attr_type (prev))
	    {
	    case TYPE_ALU_IMM:
	    case TYPE_ALU_SREG:
	    case TYPE_ADCS_REG:
	    case TYPE_ADCS_IMM:
	    case TYPE_LOGIC_REG:
	    case TYPE_LOGIC_IMM:
	    case TYPE_SHIFT_REG:
	    case TYPE_SHIFT_IMM:
	      return true;

	    default:
	      break;
	    }
	}
    }

  return false;
}

/* Return true iff the instruction fusion described by OP is enabled.  */

bool
aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
{
  return (aarch64_tune_params.fusible_ops & op) != 0;
}
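/* For example, the MOV/MOVK case above keeps a pair such as

	mov	x0, #0xc0da
	movk	x0, #0x140, lsl 16

   adjacent during scheduling so that cores which fuse the pair can issue
   them together.  */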
/* If MEM is in the form of [base+offset], extract the two parts
   of address and set to BASE and OFFSET, otherwise return false
   after clearing BASE and OFFSET.  */

bool
extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
{
  rtx addr;

  gcc_assert (MEM_P (mem));

  addr = XEXP (mem, 0);

  if (REG_P (addr))
    {
      *base = addr;
      *offset = const0_rtx;
      return true;
    }

  if (GET_CODE (addr) == PLUS
      && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
    {
      *base = XEXP (addr, 0);
      *offset = XEXP (addr, 1);
      return true;
    }

  *base = NULL_RTX;
  *offset = NULL_RTX;

  return false;
}
/* Types for scheduling fusion.  */
enum sched_fusion_type
{
  SCHED_FUSION_NONE = 0,
  SCHED_FUSION_LD_SIGN_EXTEND,
  SCHED_FUSION_LD_ZERO_EXTEND,
  SCHED_FUSION_LD,
  SCHED_FUSION_ST,
  SCHED_FUSION_NUM
};

/* If INSN is a load or store of address in the form of [base+offset],
   extract the two parts and set to BASE and OFFSET.  Return scheduling
   fusion type this INSN is.  */

static enum sched_fusion_type
fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
{
  rtx x, dest, src;
  enum sched_fusion_type fusion = SCHED_FUSION_LD;

  gcc_assert (INSN_P (insn));
  x = PATTERN (insn);
  if (GET_CODE (x) != SET)
    return SCHED_FUSION_NONE;

  src = SET_SRC (x);
  dest = SET_DEST (x);

  machine_mode dest_mode = GET_MODE (dest);

  if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
    return SCHED_FUSION_NONE;

  if (GET_CODE (src) == SIGN_EXTEND)
    {
      fusion = SCHED_FUSION_LD_SIGN_EXTEND;
      src = XEXP (src, 0);
      if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
	return SCHED_FUSION_NONE;
    }
  else if (GET_CODE (src) == ZERO_EXTEND)
    {
      fusion = SCHED_FUSION_LD_ZERO_EXTEND;
      src = XEXP (src, 0);
      if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
	return SCHED_FUSION_NONE;
    }

  if (GET_CODE (src) == MEM && REG_P (dest))
    extract_base_offset_in_addr (src, base, offset);
  else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
    {
      fusion = SCHED_FUSION_ST;
      extract_base_offset_in_addr (dest, base, offset);
    }
  else
    return SCHED_FUSION_NONE;

  if (*base == NULL_RTX || *offset == NULL_RTX)
    fusion = SCHED_FUSION_NONE;

  return fusion;
}
/* Implement the TARGET_SCHED_FUSION_PRIORITY hook.

   Currently we only support fusing ldr or str instructions, so FUSION_PRI
   and PRI are only calculated for these instructions.  For other
   instructions, FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the
   future, other types of instruction fusion can be added by returning
   different priorities.

   It's important that irrelevant instructions get the largest FUSION_PRI.  */

static void
aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
			       int *fusion_pri, int *pri)
{
  int tmp, off_val;
  rtx base, offset;
  enum sched_fusion_type fusion;

  gcc_assert (INSN_P (insn));

  tmp = max_pri - 1;
  fusion = fusion_load_store (insn, &base, &offset);
  if (fusion == SCHED_FUSION_NONE)
    {
      *pri = tmp;
      *fusion_pri = tmp;
      return;
    }

  /* Set FUSION_PRI according to fusion type and base register.  */
  *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);

  /* Calculate PRI.  */
  tmp /= 2;

  /* INSN with smaller offset goes first.  */
  off_val = (int)(INTVAL (offset));
  if (off_val >= 0)
    tmp -= (off_val & 0xfffff);
  else
    tmp += ((- off_val) & 0xfffff);

  *pri = tmp;
  return;
}

/* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
   Adjust priority of sha1h instructions so they are scheduled before
   other SHA1 instructions.  */

static int
aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
{
  rtx x = PATTERN (insn);

  if (GET_CODE (x) == SET)
    {
      x = SET_SRC (x);

      if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
	return priority + 10;
    }

  return priority;
}
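/* For instance, two loads from [x1, 8] and [x1, 12] get the same FUSION_PRI
   above (same fusion type and base register) but different PRI, so the
   scheduler keeps them together and orders the smaller offset first, which
   is the order the ldp/stp peepholes expect.  */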
/* Given OPERANDS of consecutive load/store, check if we can merge
   them into ldp/stp.  LOAD is true if they are load instructions.
   MODE is the mode of memory operands.  */

bool
aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
				machine_mode mode)
{
  HOST_WIDE_INT offval_1, offval_2, msize;
  enum reg_class rclass_1, rclass_2;
  rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;

  if (load)
    {
      mem_1 = operands[1];
      mem_2 = operands[3];
      reg_1 = operands[0];
      reg_2 = operands[2];
      gcc_assert (REG_P (reg_1) && REG_P (reg_2));
      if (REGNO (reg_1) == REGNO (reg_2))
	return false;
    }
  else
    {
      mem_1 = operands[0];
      mem_2 = operands[2];
      reg_1 = operands[1];
      reg_2 = operands[3];
    }

  /* The mems cannot be volatile.  */
  if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
    return false;

  /* If we have SImode and slow unaligned ldp,
     check the alignment to be at least 8 byte.  */
  if (mode == SImode
      && (aarch64_tune_params.extra_tuning_flags
	  & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
      && !optimize_size
      && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
    return false;

  /* Check if the addresses are in the form of [base+offset].  */
  extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
  if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
    return false;
  extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
  if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
    return false;

  /* Check if the bases are same.  */
  if (!rtx_equal_p (base_1, base_2))
    return false;

  /* The operands must be of the same size.  */
  gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
			GET_MODE_SIZE (GET_MODE (mem_2))));

  offval_1 = INTVAL (offset_1);
  offval_2 = INTVAL (offset_2);
  /* We should only be trying this for fixed-sized modes.  There is no
     SVE LDP/STP instruction.  */
  msize = GET_MODE_SIZE (mode).to_constant ();
  /* Check if the offsets are consecutive.  */
  if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
    return false;

  /* Check if the addresses are clobbered by load.  */
  if (load)
    {
      if (reg_mentioned_p (reg_1, mem_1))
	return false;

      /* In increasing order, the last load can clobber the address.  */
      if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
	return false;
    }

  /* One of the memory accesses must be a mempair operand.
     If it is not the first one, they need to be swapped by the
     peephole.  */
  if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
      && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
    return false;

  if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
    rclass_1 = FP_REGS;
  else
    rclass_1 = GENERAL_REGS;

  if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
    rclass_2 = FP_REGS;
  else
    rclass_2 = GENERAL_REGS;

  /* Check if the registers are of same class.  */
  if (rclass_1 != rclass_2)
    return false;

  return true;
}
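/* For example (illustrative only):

	ldr	x0, [x2]
	ldr	x1, [x2, 8]

   passes these checks for DImode and can be merged into
   "ldp x0, x1, [x2]", whereas accesses whose offsets differ by anything
   other than the mode size, or which use different base registers or
   register classes, are rejected.  */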
/* Given OPERANDS of consecutive load/store that can be merged,
   swap them if they are not in ascending order.  */
void
aarch64_swap_ldrstr_operands (rtx *operands, bool load)
{
  rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
  HOST_WIDE_INT offval_1, offval_2;

  if (load)
    {
      mem_1 = operands[1];
      mem_2 = operands[3];
    }
  else
    {
      mem_1 = operands[0];
      mem_2 = operands[2];
    }

  extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
  extract_base_offset_in_addr (mem_2, &base_2, &offset_2);

  offval_1 = INTVAL (offset_1);
  offval_2 = INTVAL (offset_2);

  if (offval_1 > offval_2)
    {
      /* Irrespective of whether this is a load or a store,
	 we do the same swap.  */
      std::swap (operands[0], operands[2]);
      std::swap (operands[1], operands[3]);
    }
}
/* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
   comparison between the two.  */
static int
aarch64_host_wide_int_compare (const void *x, const void *y)
{
  return wi::cmps (* ((const HOST_WIDE_INT *) x),
		   * ((const HOST_WIDE_INT *) y));
}

/* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
   other pointing to a REG rtx containing an offset, compare the offsets
   of the two pairs.

   Return:

	1 iff offset (X) > offset (Y)
	0 iff offset (X) == offset (Y)
	-1 iff offset (X) < offset (Y)  */
static int
aarch64_ldrstr_offset_compare (const void *x, const void *y)
{
  const rtx * operands_1 = (const rtx *) x;
  const rtx * operands_2 = (const rtx *) y;
  rtx mem_1, mem_2, base, offset_1, offset_2;

  if (MEM_P (operands_1[0]))
    mem_1 = operands_1[0];
  else
    mem_1 = operands_1[1];

  if (MEM_P (operands_2[0]))
    mem_2 = operands_2[0];
  else
    mem_2 = operands_2[1];

  /* Extract the offsets.  */
  extract_base_offset_in_addr (mem_1, &base, &offset_1);
  extract_base_offset_in_addr (mem_2, &base, &offset_2);

  gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);

  return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
}
/* Given OPERANDS of consecutive load/store, check if we can merge
   them into ldp/stp by adjusting the offset.  LOAD is true if they
   are load instructions.  MODE is the mode of memory operands.

   Given below consecutive stores:

     str  w1, [xb, 0x100]
     str  w1, [xb, 0x104]
     str  w1, [xb, 0x108]
     str  w1, [xb, 0x10c]

   Though the offsets are out of the range supported by stp, we can
   still pair them after adjusting the offset, like:

     add  scratch, xb, 0x100
     stp  w1, w1, [scratch]
     stp  w1, w1, [scratch, 0x8]

   The peephole patterns detecting this opportunity should guarantee
   the scratch register is available.  */

bool
aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
				       scalar_mode mode)
{
  const int num_insns = 4;
  enum reg_class rclass;
  HOST_WIDE_INT offvals[num_insns], msize;
  rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];

  if (load)
    {
      for (int i = 0; i < num_insns; i++)
	{
	  reg[i] = operands[2 * i];
	  mem[i] = operands[2 * i + 1];

	  gcc_assert (REG_P (reg[i]));
	}

      /* Do not attempt to merge the loads if the loads clobber each other.  */
      for (int i = 0; i < 8; i += 2)
	for (int j = i + 2; j < 8; j += 2)
	  if (reg_overlap_mentioned_p (operands[i], operands[j]))
	    return false;
    }
  else
    for (int i = 0; i < num_insns; i++)
      {
	mem[i] = operands[2 * i];
	reg[i] = operands[2 * i + 1];
      }

  /* Skip if memory operand is by itself valid for ldp/stp.  */
  if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
    return false;

  for (int i = 0; i < num_insns; i++)
    {
      /* The mems cannot be volatile.  */
      if (MEM_VOLATILE_P (mem[i]))
	return false;

      /* Check if the addresses are in the form of [base+offset].  */
      extract_base_offset_in_addr (mem[i], base + i, offset + i);
      if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
	return false;
    }

  /* Check if the registers are of same class.  */
  rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
    ? FP_REGS : GENERAL_REGS;

  for (int i = 1; i < num_insns; i++)
    if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
      {
	if (rclass != FP_REGS)
	  return false;
      }
    else
      {
	if (rclass != GENERAL_REGS)
	  return false;
      }

  /* Only the last register in the order in which they occur
     may be clobbered by the load.  */
  if (rclass == GENERAL_REGS && load)
    for (int i = 0; i < num_insns - 1; i++)
      if (reg_mentioned_p (reg[i], mem[i]))
	return false;

  /* Check if the bases are same.  */
  for (int i = 0; i < num_insns - 1; i++)
    if (!rtx_equal_p (base[i], base[i + 1]))
      return false;

  for (int i = 0; i < num_insns; i++)
    offvals[i] = INTVAL (offset[i]);

  msize = GET_MODE_SIZE (mode);

  /* Check if the offsets can be put in the right order to do a ldp/stp.  */
  qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
	 aarch64_host_wide_int_compare);

  if (!(offvals[1] == offvals[0] + msize
	&& offvals[3] == offvals[2] + msize))
    return false;

  /* Check that offsets are within range of each other.  The ldp/stp
     instructions have 7 bit immediate offsets, so use 0x80.  */
  if (offvals[2] - offvals[0] >= msize * 0x80)
    return false;

  /* The offsets must be aligned with respect to each other.  */
  if (offvals[0] % msize != offvals[2] % msize)
    return false;

  /* If we have SImode and slow unaligned ldp,
     check the alignment to be at least 8 byte.  */
  if (mode == SImode
      && (aarch64_tune_params.extra_tuning_flags
	  & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
      && !optimize_size
      && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
    return false;

  return true;
}
/* Given OPERANDS of consecutive load/store, this function pairs them
   into LDP/STP after adjusting the offset.  It depends on the fact
   that the operands can be sorted so the offsets are correct for STP.
   MODE is the mode of memory operands.  CODE is the rtl operator
   which should be applied to all memory operands, it's SIGN_EXTEND,
   ZERO_EXTEND or UNKNOWN.  */

bool
aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
			     scalar_mode mode, RTX_CODE code)
{
  rtx base, offset_1, offset_3, t1, t2;
  rtx mem_1, mem_2, mem_3, mem_4;
  rtx temp_operands[8];
  HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
		stp_off_upper_limit, stp_off_lower_limit, msize;

  /* We make changes on a copy as we may still bail out.  */
  for (int i = 0; i < 8; i++)
    temp_operands[i] = operands[i];

  /* Sort the operands.  */
  qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);

  /* Copy the memory operands so that if we have to bail for some
     reason the original addresses are unchanged.  */
  if (load)
    {
      mem_1 = copy_rtx (temp_operands[1]);
      mem_2 = copy_rtx (temp_operands[3]);
      mem_3 = copy_rtx (temp_operands[5]);
      mem_4 = copy_rtx (temp_operands[7]);
    }
  else
    {
      mem_1 = copy_rtx (temp_operands[0]);
      mem_2 = copy_rtx (temp_operands[2]);
      mem_3 = copy_rtx (temp_operands[4]);
      mem_4 = copy_rtx (temp_operands[6]);
      gcc_assert (code == UNKNOWN);
    }

  extract_base_offset_in_addr (mem_1, &base, &offset_1);
  extract_base_offset_in_addr (mem_3, &base, &offset_3);
  gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
	      && offset_3 != NULL_RTX);

  /* Adjust offset so it can fit in LDP/STP instruction.  */
  msize = GET_MODE_SIZE (mode);
  stp_off_upper_limit = msize * (0x40 - 1);
  stp_off_lower_limit = - msize * 0x40;

  off_val_1 = INTVAL (offset_1);
  off_val_3 = INTVAL (offset_3);

  /* The base offset is optimally half way between the two STP/LDP offsets.  */
  if (msize <= 4)
    base_off = (off_val_1 + off_val_3) / 2;
  else
    /* However, due to issues with negative LDP/STP offset generation for
       larger modes (DF, DI and vector modes), we must not use negative
       addresses smaller than 9 signed unadjusted bits can store.  This
       provides the most range in this case.  */
    base_off = off_val_1;

  /* Adjust the base so that it is aligned with the addresses but still
     optimal.  */
  if (base_off % msize != off_val_1 % msize)
    /* Fix the offset, bearing in mind we want to make it bigger not
       smaller.  */
    base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
  else if (msize <= 4)
    /* The negative range of LDP/STP is one larger than the positive range.  */
    base_off += msize;

  /* Check if base offset is too big or too small.  We can attempt to resolve
     this issue by setting it to the maximum value and seeing if the offsets
     are still in range.  */
  if (base_off >= 0x1000)
    {
      base_off = 0x1000 - 1;
      /* We must still make sure that the base offset is aligned with respect
	 to the address.  But it may not be made any bigger.  */
      base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
    }

  /* Likewise for the case where the base is too small.  */
  if (base_off <= -0x1000)
    {
      base_off = -0x1000 + 1;
      base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
    }

  /* Offset of the first STP/LDP.  */
  new_off_1 = off_val_1 - base_off;

  /* Offset of the second STP/LDP.  */
  new_off_3 = off_val_3 - base_off;

  /* The offsets must be within the range of the LDP/STP instructions.  */
  if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
      || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
    return false;

  replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
						  new_off_1), true);
  replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
						  new_off_1 + msize), true);
  replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
						  new_off_3), true);
  replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
						  new_off_3 + msize), true);

  if (!aarch64_mem_pair_operand (mem_1, mode)
      || !aarch64_mem_pair_operand (mem_3, mode))
    return false;

  if (code == ZERO_EXTEND)
    {
      mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
      mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
      mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
      mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
    }
  else if (code == SIGN_EXTEND)
    {
      mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
      mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
      mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
      mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
    }

  if (load)
    {
      operands[0] = temp_operands[0];
      operands[1] = mem_1;
      operands[2] = temp_operands[2];
      operands[3] = mem_2;
      operands[4] = temp_operands[4];
      operands[5] = mem_3;
      operands[6] = temp_operands[6];
      operands[7] = mem_4;
    }
  else
    {
      operands[0] = mem_1;
      operands[1] = temp_operands[1];
      operands[2] = mem_2;
      operands[3] = temp_operands[3];
      operands[4] = mem_3;
      operands[5] = temp_operands[5];
      operands[6] = mem_4;
      operands[7] = temp_operands[7];
    }

  /* Emit adjusting instruction.  */
  emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
  /* Emit ldp/stp instructions.  */
  t1 = gen_rtx_SET (operands[0], operands[1]);
  t2 = gen_rtx_SET (operands[2], operands[3]);
  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
  t1 = gen_rtx_SET (operands[4], operands[5]);
  t2 = gen_rtx_SET (operands[6], operands[7]);
  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
  return true;
}
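/* A concrete trace of the offset adjustment above (registers and offsets
   are illustrative only): four DImode stores at [xb, 0x200] .. [xb, 0x218]
   have msize = 8, so base_off = off_val_1 = 0x200, new_off_1 = 0 and
   new_off_3 = 16, both well inside the signed 7-bit scaled STP range.
   The emitted sequence is

	add	scratch, xb, 0x200
	stp	x2, x3, [scratch]
	stp	x4, x5, [scratch, 16]

   using the scratch register supplied as operands[8].  */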
/* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.  Assume for now that
   it isn't worth branching around empty masked ops (including masked
   stores).  */

static bool
aarch64_empty_mask_is_expensive (unsigned)
{
  return false;
}

/* Return 1 if pseudo register should be created and used to hold
   GOT address for PIC code.  */

bool
aarch64_use_pseudo_pic_reg (void)
{
  return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
}

/* Implement TARGET_UNSPEC_MAY_TRAP_P.  */

static int
aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
{
  switch (XINT (x, 1))
    {
    case UNSPEC_GOTSMALLPIC:
    case UNSPEC_GOTSMALLPIC28K:
    case UNSPEC_GOTTINYPIC:
      return 0;
    default:
      break;
    }

  return default_unspec_may_trap_p (x, flags);
}
/* If X is a positive CONST_DOUBLE with a value that is a power of 2
   return the log2 of that value.  Otherwise return -1.  */

int
aarch64_fpconst_pow_of_2 (rtx x)
{
  const REAL_VALUE_TYPE *r;

  if (!CONST_DOUBLE_P (x))
    return -1;

  r = CONST_DOUBLE_REAL_VALUE (x);

  if (REAL_VALUE_NEGATIVE (*r)
      || REAL_VALUE_ISNAN (*r)
      || REAL_VALUE_ISINF (*r)
      || !real_isinteger (r, DFmode))
    return -1;

  return exact_log2 (real_to_integer (r));
}

/* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
   power of 2 (i.e 1/2^n) return the number of float bits. e.g. for x==(1/2^n)
   return n. Otherwise return -1.  */

int
aarch64_fpconst_pow2_recip (rtx x)
{
  REAL_VALUE_TYPE r0;

  if (!CONST_DOUBLE_P (x))
    return -1;

  r0 = *CONST_DOUBLE_REAL_VALUE (x);
  if (exact_real_inverse (DFmode, &r0)
      && !REAL_VALUE_NEGATIVE (r0))
    {
      int ret = exact_log2 (real_to_integer (&r0));
      if (ret >= 1 && ret <= 32)
	return ret;
    }
  return -1;
}

/* If X is a vector of equal CONST_DOUBLE values and that value is
   Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */

int
aarch64_vec_fpconst_pow_of_2 (rtx x)
{
  int nelts;
  if (GET_CODE (x) != CONST_VECTOR
      || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
    return -1;

  if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
    return -1;

  int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
  if (firstval <= 0)
    return -1;

  for (int i = 1; i < nelts; i++)
    if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
      return -1;

  return firstval;
}
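/* For example, a V2DF constant of { 32.0, 32.0 } yields 5 here.  Such
   results are used by patterns that fold a multiply or divide by a power
   of two into the #fbits form of the fixed-point conversion instructions
   (e.g. fcvtzs with a fractional-bits operand).  */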
/* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
   to float.

   __fp16 always promotes through this hook.
   _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
   through the generic excess precision logic rather than here.  */

static tree
aarch64_promoted_type (const_tree t)
{
  if (SCALAR_FLOAT_TYPE_P (t)
      && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
    return float_type_node;

  return NULL_TREE;
}

/* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */

static bool
aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
			   optimization_type opt_type)
{
  switch (op)
    {
    case rsqrt_optab:
      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);

    default:
      return true;
    }
}

/* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook.  */

static unsigned int
aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
					int *offset)
{
  /* Polynomial invariant 1 == (VG / 2) - 1.  */
  gcc_assert (i == 1);
  *factor = 2;
  *offset = 1;
  return AARCH64_DWARF_VG;
}

/* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
   if MODE is HFmode, and punt to the generic implementation otherwise.  */

static bool
aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
{
  return (mode == HFmode
	  ? true
	  : default_libgcc_floating_mode_supported_p (mode));
}

/* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
   if MODE is HFmode, and punt to the generic implementation otherwise.  */

static bool
aarch64_scalar_mode_supported_p (scalar_mode mode)
{
  return (mode == HFmode
	  ? true
	  : default_scalar_mode_supported_p (mode));
}

/* Set the value of FLT_EVAL_METHOD.
   ISO/IEC TS 18661-3 defines two values that we'd like to make use of:

    0: evaluate all operations and constants, whose semantic type has at
       most the range and precision of type float, to the range and
       precision of float; evaluate all other operations and constants to
       the range and precision of the semantic type;

    N, where _FloatN is a supported interchange floating type:
       evaluate all operations and constants, whose semantic type has at
       most the range and precision of _FloatN type, to the range and
       precision of the _FloatN type; evaluate all other operations and
       constants to the range and precision of the semantic type;

   If we have the ARMv8.2-A extensions then we support _Float16 in native
   precision, so we should set this to 16.  Otherwise, we support the type,
   but want to evaluate expressions in float precision, so set this to
   0.  */

static enum flt_eval_method
aarch64_excess_precision (enum excess_precision_type type)
{
  switch (type)
    {
    case EXCESS_PRECISION_TYPE_FAST:
    case EXCESS_PRECISION_TYPE_STANDARD:
      /* We can calculate either in 16-bit range and precision or
	 32-bit range and precision.  Make that decision based on whether
	 we have native support for the ARMv8.2-A 16-bit floating-point
	 instructions or not.  */
      return (TARGET_FP_F16INST
	      ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
	      : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
    case EXCESS_PRECISION_TYPE_IMPLICIT:
      return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
    default:
      gcc_unreachable ();
    }
  return FLT_EVAL_METHOD_UNPREDICTABLE;
}
/* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
   scheduled for speculative execution.  Reject the long-running division
   and square-root instructions.  */

static bool
aarch64_sched_can_speculate_insn (rtx_insn *insn)
{
  switch (get_attr_type (insn))
    {
      case TYPE_SDIV:
      case TYPE_UDIV:
      case TYPE_FDIVS:
      case TYPE_FDIVD:
      case TYPE_FSQRTS:
      case TYPE_FSQRTD:
      case TYPE_NEON_FP_SQRT_S:
      case TYPE_NEON_FP_SQRT_D:
      case TYPE_NEON_FP_SQRT_S_Q:
      case TYPE_NEON_FP_SQRT_D_Q:
      case TYPE_NEON_FP_DIV_S:
      case TYPE_NEON_FP_DIV_D:
      case TYPE_NEON_FP_DIV_S_Q:
      case TYPE_NEON_FP_DIV_D_Q:
	return false;
      default:
	return true;
    }
}

/* Implement TARGET_COMPUTE_PRESSURE_CLASSES.  */

static int
aarch64_compute_pressure_classes (reg_class *classes)
{
  int i = 0;
  classes[i++] = GENERAL_REGS;
  classes[i++] = FP_REGS;
  /* PR_REGS isn't a useful pressure class because many predicate pseudo
     registers need to go in PR_LO_REGS at some point during their
     lifetime.  Splitting it into two halves has the effect of making
     all predicates count against PR_LO_REGS, so that we try whenever
     possible to restrict the number of live predicates to 8.  This
     greatly reduces the amount of spilling in certain loops.  */
  classes[i++] = PR_LO_REGS;
  classes[i++] = PR_HI_REGS;
  return i;
}
/* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */

static bool
aarch64_can_change_mode_class (machine_mode from,
			       machine_mode to, reg_class_t)
{
  unsigned int from_flags = aarch64_classify_vector_mode (from);
  unsigned int to_flags = aarch64_classify_vector_mode (to);

  bool from_sve_p = (from_flags & VEC_ANY_SVE);
  bool to_sve_p = (to_flags & VEC_ANY_SVE);

  bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
  bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);

  /* Don't allow changes between partial SVE modes and other modes.
     The contents of partial SVE modes are distributed evenly across
     the register, whereas GCC expects them to be clustered together.  */
  if (from_partial_sve_p != to_partial_sve_p)
    return false;

  /* Similarly reject changes between partial SVE modes that have
     different patterns of significant and insignificant bits.  */
  if (from_partial_sve_p
      && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
	  || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
    return false;

  if (BYTES_BIG_ENDIAN)
    {
      /* Don't allow changes between SVE data modes and non-SVE modes.
	 See the comment at the head of aarch64-sve.md for details.  */
      if (from_sve_p != to_sve_p)
	return false;

      /* Don't allow changes in element size: lane 0 of the new vector
	 would not then be lane 0 of the old vector.  See the comment
	 above aarch64_maybe_expand_sve_subreg_move for a more detailed
	 description.

	 In the worst case, this forces a register to be spilled in
	 one mode and reloaded in the other, which handles the
	 endianness correctly.  */
      if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
	return false;
    }
  return true;
}
/* Implement TARGET_EARLY_REMAT_MODES.  */

static void
aarch64_select_early_remat_modes (sbitmap modes)
{
  /* SVE values are not normally live across a call, so it should be
     worth doing early rematerialization even in VL-specific mode.  */
  for (int i = 0; i < NUM_MACHINE_MODES; ++i)
    if (aarch64_sve_mode_p ((machine_mode) i))
      bitmap_set_bit (modes, i);
}

/* Override the default target speculation_safe_value.  */
static rtx
aarch64_speculation_safe_value (machine_mode mode,
				rtx result, rtx val, rtx failval)
{
  /* Maybe we should warn if falling back to hard barriers.  They are
     likely to be noticeably more expensive than the alternative below.  */
  if (!aarch64_track_speculation)
    return default_speculation_safe_value (mode, result, val, failval);

  if (!REG_P (val))
    val = copy_to_mode_reg (mode, val);

  if (!aarch64_reg_or_zero (failval, mode))
    failval = copy_to_mode_reg (mode, failval);

  emit_insn (gen_despeculate_copy (mode, result, val, failval));
  return result;
}

/* Implement TARGET_ESTIMATED_POLY_VALUE.
   Look into the tuning structure for an estimate.
   VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
   Advanced SIMD 128 bits.  */

static HOST_WIDE_INT
aarch64_estimated_poly_value (poly_int64 val)
{
  enum aarch64_sve_vector_bits_enum width_source
    = aarch64_tune_params.sve_width;

  /* If we still don't have an estimate, use the default.  */
  if (width_source == SVE_SCALABLE)
    return default_estimated_poly_value (val);

  HOST_WIDE_INT over_128 = width_source - 128;
  return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
}
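/* For instance, with a tuning target whose sve_width is 256, the poly_int64
   [16, 16] (the number of bytes in an SVE vector) is estimated here as
   16 + 16 * (256 - 128) / 128 = 32.  */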
/* Return true for types that could be supported as SIMD return or
   argument types.  */

static bool
supported_simd_type (tree t)
{
  if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
    {
      HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
      return s == 1 || s == 2 || s == 4 || s == 8;
    }
  return false;
}

/* Return true for types that currently are supported as SIMD return
   or argument types.  */

static bool
currently_supported_simd_type (tree t, tree b)
{
  if (COMPLEX_FLOAT_TYPE_P (t))
    return false;

  if (TYPE_SIZE (t) != TYPE_SIZE (b))
    return false;

  return supported_simd_type (t);
}

/* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN.  */

static int
aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
					struct cgraph_simd_clone *clonei,
					tree base_type, int num)
{
  tree t, ret_type, arg_type;
  unsigned int elt_bits, vec_bits, count;

  if (!TARGET_SIMD)
    return 0;

  if (clonei->simdlen
      && (clonei->simdlen < 2
	  || clonei->simdlen > 1024
	  || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
    {
      warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
		  "unsupported simdlen %d", clonei->simdlen);
      return 0;
    }

  ret_type = TREE_TYPE (TREE_TYPE (node->decl));
  if (TREE_CODE (ret_type) != VOID_TYPE
      && !currently_supported_simd_type (ret_type, base_type))
    {
      if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
	warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
		    "GCC does not currently support mixed size types "
		    "for %<simd%> functions");
      else if (supported_simd_type (ret_type))
	warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
		    "GCC does not currently support return type %qT "
		    "for %<simd%> functions", ret_type);
      else
	warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
		    "unsupported return type %qT for %<simd%> functions",
		    ret_type);
      return 0;
    }

  for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
    {
      arg_type = TREE_TYPE (t);

      if (!currently_supported_simd_type (arg_type, base_type))
	{
	  if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
	    warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
			"GCC does not currently support mixed size types "
			"for %<simd%> functions");
	  else
	    warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
			"GCC does not currently support argument type %qT "
			"for %<simd%> functions", arg_type);
	  return 0;
	}
    }

  clonei->vecsize_mangle = 'n';
  clonei->mask_mode = VOIDmode;
  elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
  if (clonei->simdlen == 0)
    {
      count = 2;
      vec_bits = (num == 0 ? 64 : 128);
      clonei->simdlen = vec_bits / elt_bits;
    }
  else
    {
      count = 1;
      vec_bits = clonei->simdlen * elt_bits;
      if (vec_bits != 64 && vec_bits != 128)
	{
	  warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
		      "GCC does not currently support simdlen %d for type %qT",
		      clonei->simdlen, base_type);
	  return 0;
	}
    }
  clonei->vecsize_int = vec_bits;
  clonei->vecsize_float = vec_bits;
  return count;
}

/* Implement TARGET_SIMD_CLONE_ADJUST.  */

static void
aarch64_simd_clone_adjust (struct cgraph_node *node)
{
  /* Add aarch64_vector_pcs target attribute to SIMD clones so they
     use the correct ABI.  */

  tree t = TREE_TYPE (node->decl);
  TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
					TYPE_ATTRIBUTES (t));
}

/* Implement TARGET_SIMD_CLONE_USABLE.  */

static int
aarch64_simd_clone_usable (struct cgraph_node *node)
{
  switch (node->simdclone->vecsize_mangle)
    {
    case 'n':
      if (!TARGET_SIMD)
	return -1;
      return 0;
    default:
      gcc_unreachable ();
    }
}
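/* For instance, in aarch64_simd_clone_compute_vecsize_and_simdlen above a
   "float" base type has elt_bits of 32, so when the user does not specify
   a simdlen the two advertised clones (count == 2) use vector widths of 64
   and 128 bits and therefore simdlen 2 and simdlen 4 respectively.  */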
/* Implement TARGET_COMP_TYPE_ATTRIBUTES.  */

static int
aarch64_comp_type_attributes (const_tree type1, const_tree type2)
{
  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
      != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
    return 0;
  return 1;
}

/* Implement TARGET_GET_MULTILIB_ABI_NAME.  */

static const char *
aarch64_get_multilib_abi_name (void)
{
  if (TARGET_BIG_END)
    return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
  return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
}

/* Implement TARGET_STACK_PROTECT_GUARD.  In case of a
   global variable based guard use the default else
   return a null tree.  */
static tree
aarch64_stack_protect_guard (void)
{
  if (aarch64_stack_protector_guard == SSP_GLOBAL)
    return default_stack_protect_guard ();

  return NULL_TREE;
}

/* Return the diagnostic message string if conversion from FROMTYPE to
   TOTYPE is not allowed, NULL otherwise.  */

static const char *
aarch64_invalid_conversion (const_tree fromtype, const_tree totype)
{
  if (element_mode (fromtype) != element_mode (totype))
    {
      /* Do not allow conversions to/from BFmode scalar types.  */
      if (TYPE_MODE (fromtype) == BFmode)
	return N_("invalid conversion from type %<bfloat16_t%>");
      if (TYPE_MODE (totype) == BFmode)
	return N_("invalid conversion to type %<bfloat16_t%>");
    }

  /* Conversion allowed.  */
  return NULL;
}

/* Return the diagnostic message string if the unary operation OP is
   not permitted on TYPE, NULL otherwise.  */

static const char *
aarch64_invalid_unary_op (int op, const_tree type)
{
  /* Reject all single-operand operations on BFmode except for &.  */
  if (element_mode (type) == BFmode && op != ADDR_EXPR)
    return N_("operation not permitted on type %<bfloat16_t%>");

  /* Operation allowed.  */
  return NULL;
}

/* Return the diagnostic message string if the binary operation OP is
   not permitted on TYPE1 and TYPE2, NULL otherwise.  */

static const char *
aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
			   const_tree type2)
{
  /* Reject all 2-operand operations on BFmode.  */
  if (element_mode (type1) == BFmode
      || element_mode (type2) == BFmode)
    return N_("operation not permitted on type %<bfloat16_t%>");

  /* Operation allowed.  */
  return NULL;
}
/* Implement TARGET_ASM_FILE_END for AArch64.  This adds the AArch64 GNU NOTE
   section at the end if needed.  */
#define GNU_PROPERTY_AARCH64_FEATURE_1_AND	0xc0000000
#define GNU_PROPERTY_AARCH64_FEATURE_1_BTI	(1U << 0)
#define GNU_PROPERTY_AARCH64_FEATURE_1_PAC	(1U << 1)
void
aarch64_file_end_indicate_exec_stack ()
{
  file_end_indicate_exec_stack ();

  unsigned feature_1_and = 0;
  if (aarch64_bti_enabled ())
    feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;

  if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
    feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;

  if (feature_1_and)
    {
      /* Generate .note.gnu.property section.  */
      switch_to_section (get_section (".note.gnu.property",
				      SECTION_NOTYPE, NULL));

      /* PT_NOTE header: namesz, descsz, type.
	 namesz = 4 ("GNU\0")
	 descsz = 16 (Size of the program property array)
		  [(12 + padding) * Number of array elements]
	 type   = 5 (NT_GNU_PROPERTY_TYPE_0).  */
      assemble_align (POINTER_SIZE);
      assemble_integer (GEN_INT (4), 4, 32, 1);
      assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
      assemble_integer (GEN_INT (5), 4, 32, 1);

      /* PT_NOTE name.  */
      assemble_string ("GNU", 4);

      /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
	 type   = GNU_PROPERTY_AARCH64_FEATURE_1_AND
	 datasz = 4
	 data   = feature_1_and.  */
      assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
      assemble_integer (GEN_INT (4), 4, 32, 1);
      assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);

      /* Pad the size of the note to the required alignment.  */
      assemble_align (POINTER_SIZE);
    }
}
#undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
#undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
#undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
/* Target-specific selftests.  */

#if CHECKING_P

namespace selftest {

/* Selftest for the RTL loader.
   Verify that the RTL loader copes with a dump from
   print_rtx_function.  This is essentially just a test that class
   function_reader can handle a real dump, but it also verifies
   that lookup_reg_by_dump_name correctly handles hard regs.
   The presence of hard reg names in the dump means that the test is
   target-specific, hence it is in this file.  */

static void
aarch64_test_loading_full_dump ()
{
  rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));

  ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));

  rtx_insn *insn_1 = get_insn_by_uid (1);
  ASSERT_EQ (NOTE, GET_CODE (insn_1));

  rtx_insn *insn_15 = get_insn_by_uid (15);
  ASSERT_EQ (INSN, GET_CODE (insn_15));
  ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));

  /* Verify crtl->return_rtx.  */
  ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
  ASSERT_EQ (0, REGNO (crtl->return_rtx));
  ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
}

/* Run all target-specific selftests.  */

static void
aarch64_run_selftests (void)
{
  aarch64_test_loading_full_dump ();
}

} // namespace selftest

#endif /* #if CHECKING_P */
21935 #undef TARGET_STACK_PROTECT_GUARD
21936 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
21938 #undef TARGET_ADDRESS_COST
21939 #define TARGET_ADDRESS_COST aarch64_address_cost
21941 /* This hook will determines whether unnamed bitfields affect the alignment
21942 of the containing structure. The hook returns true if the structure
21943 should inherit the alignment requirements of an unnamed bitfield's
21945 #undef TARGET_ALIGN_ANON_BITFIELD
21946 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
21948 #undef TARGET_ASM_ALIGNED_DI_OP
21949 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
21951 #undef TARGET_ASM_ALIGNED_HI_OP
21952 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
21954 #undef TARGET_ASM_ALIGNED_SI_OP
21955 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
21957 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
21958 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
21959 hook_bool_const_tree_hwi_hwi_const_tree_true
21961 #undef TARGET_ASM_FILE_START
21962 #define TARGET_ASM_FILE_START aarch64_start_file
21964 #undef TARGET_ASM_OUTPUT_MI_THUNK
21965 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
21967 #undef TARGET_ASM_SELECT_RTX_SECTION
21968 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
21970 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
21971 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
21973 #undef TARGET_BUILD_BUILTIN_VA_LIST
21974 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
21976 #undef TARGET_CALLEE_COPIES
21977 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
21979 #undef TARGET_CAN_ELIMINATE
21980 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
21982 #undef TARGET_CAN_INLINE_P
21983 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
21985 #undef TARGET_CANNOT_FORCE_CONST_MEM
21986 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
21988 #undef TARGET_CASE_VALUES_THRESHOLD
21989 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
21991 #undef TARGET_CONDITIONAL_REGISTER_USAGE
21992 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
21994 /* Only the least significant bit is used for initialization guard
21996 #undef TARGET_CXX_GUARD_MASK_BIT
21997 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
21999 #undef TARGET_C_MODE_FOR_SUFFIX
22000 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
22002 #ifdef TARGET_BIG_ENDIAN_DEFAULT
22003 #undef TARGET_DEFAULT_TARGET_FLAGS
22004 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
22007 #undef TARGET_CLASS_MAX_NREGS
22008 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
22010 #undef TARGET_BUILTIN_DECL
22011 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
22013 #undef TARGET_BUILTIN_RECIPROCAL
22014 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
22016 #undef TARGET_C_EXCESS_PRECISION
22017 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
22019 #undef TARGET_EXPAND_BUILTIN
22020 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
22022 #undef TARGET_EXPAND_BUILTIN_VA_START
22023 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
22025 #undef TARGET_FOLD_BUILTIN
22026 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
22028 #undef TARGET_FUNCTION_ARG
22029 #define TARGET_FUNCTION_ARG aarch64_function_arg
22031 #undef TARGET_FUNCTION_ARG_ADVANCE
22032 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
22034 #undef TARGET_FUNCTION_ARG_BOUNDARY
22035 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
22037 #undef TARGET_FUNCTION_ARG_PADDING
22038 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
22040 #undef TARGET_GET_RAW_RESULT_MODE
22041 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
22042 #undef TARGET_GET_RAW_ARG_MODE
22043 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
22045 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
22046 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
22048 #undef TARGET_FUNCTION_VALUE
22049 #define TARGET_FUNCTION_VALUE aarch64_function_value
22051 #undef TARGET_FUNCTION_VALUE_REGNO_P
22052 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
22054 #undef TARGET_GIMPLE_FOLD_BUILTIN
22055 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
22057 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
22058 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
22060 #undef TARGET_INIT_BUILTINS
22061 #define TARGET_INIT_BUILTINS aarch64_init_builtins
22063 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
22064 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
22065 aarch64_ira_change_pseudo_allocno_class
22067 #undef TARGET_LEGITIMATE_ADDRESS_P
22068 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
22070 #undef TARGET_LEGITIMATE_CONSTANT_P
22071 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
22073 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
22074 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
22075 aarch64_legitimize_address_displacement
22077 #undef TARGET_LIBGCC_CMP_RETURN_MODE
22078 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
22080 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
22081 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
22082 aarch64_libgcc_floating_mode_supported_p
22084 #undef TARGET_MANGLE_TYPE
22085 #define TARGET_MANGLE_TYPE aarch64_mangle_type
22087 #undef TARGET_INVALID_CONVERSION
22088 #define TARGET_INVALID_CONVERSION aarch64_invalid_conversion
22090 #undef TARGET_INVALID_UNARY_OP
22091 #define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op
22093 #undef TARGET_INVALID_BINARY_OP
22094 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
22096 #undef TARGET_VERIFY_TYPE_CONTEXT
22097 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
22099 #undef TARGET_MEMORY_MOVE_COST
22100 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
22102 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
22103 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
22105 #undef TARGET_MUST_PASS_IN_STACK
22106 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
22108 /* This target hook should return true if accesses to volatile bitfields
22109 should use the narrowest mode possible. It should return false if these
22110 accesses should use the bitfield container type. */
22111 #undef TARGET_NARROW_VOLATILE_BITFIELD
22112 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
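/* Illustrative example (added; the struct below is hypothetical, not taken
   from the sources): since the hook returns false, a volatile bit-field
   such as

     struct device_reg { volatile unsigned int mode : 4; };

   is accessed through its container type, so a read of "r->mode" is done
   with a full 32-bit load of the unsigned int container rather than the
   narrowest access (a byte load) that would cover the field.  */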
22114 #undef TARGET_OPTION_OVERRIDE
22115 #define TARGET_OPTION_OVERRIDE aarch64_override_options
22117 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
22118 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
22119 aarch64_override_options_after_change
22121 #undef TARGET_OPTION_SAVE
22122 #define TARGET_OPTION_SAVE aarch64_option_save
22124 #undef TARGET_OPTION_RESTORE
22125 #define TARGET_OPTION_RESTORE aarch64_option_restore
22127 #undef TARGET_OPTION_PRINT
22128 #define TARGET_OPTION_PRINT aarch64_option_print
22130 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
22131 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
22133 #undef TARGET_SET_CURRENT_FUNCTION
22134 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
22136 #undef TARGET_PASS_BY_REFERENCE
22137 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
22139 #undef TARGET_PREFERRED_RELOAD_CLASS
22140 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
22142 #undef TARGET_SCHED_REASSOCIATION_WIDTH
22143 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
22145 #undef TARGET_PROMOTED_TYPE
22146 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
22148 #undef TARGET_SECONDARY_RELOAD
22149 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
22151 #undef TARGET_SHIFT_TRUNCATION_MASK
22152 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
22154 #undef TARGET_SETUP_INCOMING_VARARGS
22155 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
22157 #undef TARGET_STRUCT_VALUE_RTX
22158 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
22160 #undef TARGET_REGISTER_MOVE_COST
22161 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
22163 #undef TARGET_RETURN_IN_MEMORY
22164 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
22166 #undef TARGET_RETURN_IN_MSB
22167 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
22169 #undef TARGET_RTX_COSTS
22170 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
22172 #undef TARGET_SCALAR_MODE_SUPPORTED_P
22173 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
22175 #undef TARGET_SCHED_ISSUE_RATE
22176 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
22178 #undef TARGET_SCHED_VARIABLE_ISSUE
22179 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
22181 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
22182 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
22183 aarch64_sched_first_cycle_multipass_dfa_lookahead
22185 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
22186 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
22187 aarch64_first_cycle_multipass_dfa_lookahead_guard
22189 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
22190 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
22191 aarch64_get_separate_components
22193 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
22194 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
22195 aarch64_components_for_bb
22197 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
22198 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
22199 aarch64_disqualify_components
22201 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
22202 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
22203 aarch64_emit_prologue_components
22205 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
22206 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
22207 aarch64_emit_epilogue_components
22209 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
22210 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
22211 aarch64_set_handled_components
22213 #undef TARGET_TRAMPOLINE_INIT
22214 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
22216 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
22217 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
22219 #undef TARGET_VECTOR_MODE_SUPPORTED_P
22220 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
22222 #undef TARGET_COMPATIBLE_VECTOR_TYPES_P
22223 #define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
22225 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
22226 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
22227 aarch64_builtin_support_vector_misalignment
22229 #undef TARGET_ARRAY_MODE
22230 #define TARGET_ARRAY_MODE aarch64_array_mode
22232 #undef TARGET_ARRAY_MODE_SUPPORTED_P
22233 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
22235 #undef TARGET_VECTORIZE_ADD_STMT_COST
22236 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
22238 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
22239 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
22240 aarch64_builtin_vectorization_cost
22242 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
22243 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
22245 #undef TARGET_VECTORIZE_BUILTINS
22246 #define TARGET_VECTORIZE_BUILTINS
22248 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
22249 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
22250 aarch64_builtin_vectorized_function
22252 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
22253 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
22254 aarch64_autovectorize_vector_modes
22256 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
22257 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
22258 aarch64_atomic_assign_expand_fenv
22260 /* Section anchor support. */
22262 #undef TARGET_MIN_ANCHOR_OFFSET
22263 #define TARGET_MIN_ANCHOR_OFFSET -256
22265 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
22266 byte offset; we can do much more for larger data types, but have no way
22267 to determine the size of the access. We assume accesses are aligned. */
22268 #undef TARGET_MAX_ANCHOR_OFFSET
22269 #define TARGET_MAX_ANCHOR_OFFSET 4095
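/* Illustrative sketch (added; the globals a and b and the ANCHOR symbol are
   hypothetical): two nearby globals

     static int a;   // placed at ANCHOR + 0
     static int b;   // placed at ANCHOR + 4

   can share a single section anchor address:

     adrp x0, ANCHOR
     add  x0, x0, :lo12:ANCHOR
     ldr  w1, [x0]       // a, offset 0
     ldr  w2, [x0, #4]   // b, offset 4

   as long as each offset from the anchor lies in [-256, 4095], the range
   given by TARGET_MIN_ANCHOR_OFFSET and TARGET_MAX_ANCHOR_OFFSET above.  */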
22271 #undef TARGET_VECTOR_ALIGNMENT
22272 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
22274 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
22275 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
22276 aarch64_vectorize_preferred_vector_alignment
22277 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
22278 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
22279 aarch64_simd_vector_alignment_reachable
22281 /* vec_perm support. */
22283 #undef TARGET_VECTORIZE_VEC_PERM_CONST
22284 #define TARGET_VECTORIZE_VEC_PERM_CONST \
22285 aarch64_vectorize_vec_perm_const
22287 #undef TARGET_VECTORIZE_RELATED_MODE
22288 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
22289 #undef TARGET_VECTORIZE_GET_MASK_MODE
22290 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
22291 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
22292 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
22293 aarch64_empty_mask_is_expensive
22294 #undef TARGET_PREFERRED_ELSE_VALUE
22295 #define TARGET_PREFERRED_ELSE_VALUE \
22296 aarch64_preferred_else_value
22298 #undef TARGET_INIT_LIBFUNCS
22299 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
22301 #undef TARGET_FIXED_CONDITION_CODE_REGS
22302 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
22304 #undef TARGET_FLAGS_REGNUM
22305 #define TARGET_FLAGS_REGNUM CC_REGNUM
22307 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
22308 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
22310 #undef TARGET_ASAN_SHADOW_OFFSET
22311 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
22313 #undef TARGET_LEGITIMIZE_ADDRESS
22314 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
22316 #undef TARGET_SCHED_CAN_SPECULATE_INSN
22317 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
22319 #undef TARGET_CAN_USE_DOLOOP_P
22320 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
22322 #undef TARGET_SCHED_ADJUST_PRIORITY
22323 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
22325 #undef TARGET_SCHED_MACRO_FUSION_P
22326 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
22328 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
22329 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
22331 #undef TARGET_SCHED_FUSION_PRIORITY
22332 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
22334 #undef TARGET_UNSPEC_MAY_TRAP_P
22335 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
22337 #undef TARGET_USE_PSEUDO_PIC_REG
22338 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
22340 #undef TARGET_PRINT_OPERAND
22341 #define TARGET_PRINT_OPERAND aarch64_print_operand
22343 #undef TARGET_PRINT_OPERAND_ADDRESS
22344 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
22346 #undef TARGET_OPTAB_SUPPORTED_P
22347 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
22349 #undef TARGET_OMIT_STRUCT_RETURN_REG
22350 #define TARGET_OMIT_STRUCT_RETURN_REG true
22352 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
22353 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
22354 aarch64_dwarf_poly_indeterminate_value
22356 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
22357 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
22358 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
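/* Added note: 4 is the single-bit mask 1 << 2, i.e. bit 2 of a function
   pointer is the bit used at run time to tell a descriptor address apart
   from a plain code address, as the comment above explains.  */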
22360 #undef TARGET_HARD_REGNO_NREGS
22361 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
22362 #undef TARGET_HARD_REGNO_MODE_OK
22363 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
22365 #undef TARGET_MODES_TIEABLE_P
22366 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
22368 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
22369 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
22370 aarch64_hard_regno_call_part_clobbered
22372 #undef TARGET_INSN_CALLEE_ABI
22373 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
22375 #undef TARGET_CONSTANT_ALIGNMENT
22376 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
22378 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
22379 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
22380 aarch64_stack_clash_protection_alloca_probe_range
22382 #undef TARGET_COMPUTE_PRESSURE_CLASSES
22383 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
22385 #undef TARGET_CAN_CHANGE_MODE_CLASS
22386 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
22388 #undef TARGET_SELECT_EARLY_REMAT_MODES
22389 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
22391 #undef TARGET_SPECULATION_SAFE_VALUE
22392 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
22394 #undef TARGET_ESTIMATED_POLY_VALUE
22395 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
22397 #undef TARGET_ATTRIBUTE_TABLE
22398 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
22400 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
22401 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
22402 aarch64_simd_clone_compute_vecsize_and_simdlen
22404 #undef TARGET_SIMD_CLONE_ADJUST
22405 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
22407 #undef TARGET_SIMD_CLONE_USABLE
22408 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
22410 #undef TARGET_COMP_TYPE_ATTRIBUTES
22411 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
22413 #undef TARGET_GET_MULTILIB_ABI_NAME
22414 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
22416 #undef TARGET_FNTYPE_ABI
22417 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
22419 #if CHECKING_P
22420 #undef TARGET_RUN_TARGET_SELFTESTS
22421 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
22422 #endif /* #if CHECKING_P */
22424 #undef TARGET_ASM_POST_CFI_STARTPROC
22425 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
22427 #undef TARGET_STRICT_ARGUMENT_NAMING
22428 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
22430 #undef TARGET_MD_ASM_ADJUST
22431 #define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
22433 struct gcc_target targetm = TARGET_INITIALIZER;
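/* Added note (a hedged sketch of how the table above is consumed):
   TARGET_INITIALIZER, provided by target-def.h, expands to an aggregate
   initializer that fills every field of targetm from the TARGET_* macros
   defined (or left at their defaults) above; that is why each override
   does #undef followed by #define before this point.  Target-independent
   code then reaches the AArch64 implementations through targetm, e.g. a
   call of the form

     targetm.memory_move_cost (mode, rclass, in)

   resolves to aarch64_memory_move_cost.  */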
22435 #include "gt-aarch64.h"