1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
24 #define INCLUDE_STRING
26 #include "coretypes.h"
37 #include "stringpool.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
56 #include "langhooks.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
77 /* This file should be included last. */
78 #include "target-def.h"
80 /* Defined for convenience. */
81 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
83 /* Information about a legitimate vector immediate operand. */
84 struct simd_immediate_info
86 enum insn_type
{ MOV
, MVN
, INDEX
, PTRUE
};
87 enum modifier_type
{ LSL
, MSL
};
89 simd_immediate_info () {}
90 simd_immediate_info (scalar_float_mode
, rtx
);
91 simd_immediate_info (scalar_int_mode
, unsigned HOST_WIDE_INT
,
92 insn_type
= MOV
, modifier_type
= LSL
,
94 simd_immediate_info (scalar_mode
, rtx
, rtx
);
95 simd_immediate_info (scalar_int_mode
, aarch64_svpattern
);
97 /* The mode of the elements. */
100 /* The instruction to use to move the immediate into a vector. */
105 /* For MOV and MVN. */
108 /* The value of each element. */
111 /* The kind of shift modifier to use, and the number of bits to shift.
112 This is (LSL, 0) if no shift is needed. */
113 modifier_type modifier
;
120 /* The value of the first element and the step to be added for each
121 subsequent element. */
126 aarch64_svpattern pattern
;
130 /* Construct a floating-point immediate in which each element has mode
131 ELT_MODE_IN and value VALUE_IN. */
132 inline simd_immediate_info
133 ::simd_immediate_info (scalar_float_mode elt_mode_in
, rtx value_in
)
134 : elt_mode (elt_mode_in
), insn (MOV
)
136 u
.mov
.value
= value_in
;
137 u
.mov
.modifier
= LSL
;
141 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
142 and value VALUE_IN. The other parameters are as for the structure
144 inline simd_immediate_info
145 ::simd_immediate_info (scalar_int_mode elt_mode_in
,
146 unsigned HOST_WIDE_INT value_in
,
147 insn_type insn_in
, modifier_type modifier_in
,
148 unsigned int shift_in
)
149 : elt_mode (elt_mode_in
), insn (insn_in
)
151 u
.mov
.value
= gen_int_mode (value_in
, elt_mode_in
);
152 u
.mov
.modifier
= modifier_in
;
153 u
.mov
.shift
= shift_in
;
156 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
157 and where element I is equal to BASE_IN + I * STEP_IN. */
158 inline simd_immediate_info
159 ::simd_immediate_info (scalar_mode elt_mode_in
, rtx base_in
, rtx step_in
)
160 : elt_mode (elt_mode_in
), insn (INDEX
)
162 u
.index
.base
= base_in
;
163 u
.index
.step
= step_in
;
166 /* Construct a predicate that controls elements of mode ELT_MODE_IN
167 and has PTRUE pattern PATTERN_IN. */
168 inline simd_immediate_info
169 ::simd_immediate_info (scalar_int_mode elt_mode_in
,
170 aarch64_svpattern pattern_in
)
171 : elt_mode (elt_mode_in
), insn (PTRUE
)
173 u
.pattern
= pattern_in
;
176 /* The current code model. */
177 enum aarch64_code_model aarch64_cmodel
;
179 /* The number of 64-bit elements in an SVE vector. */
180 poly_uint16 aarch64_sve_vg
;
183 #undef TARGET_HAVE_TLS
184 #define TARGET_HAVE_TLS 1
187 static bool aarch64_composite_type_p (const_tree
, machine_mode
);
188 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode
,
190 machine_mode
*, int *,
192 static void aarch64_elf_asm_constructor (rtx
, int) ATTRIBUTE_UNUSED
;
193 static void aarch64_elf_asm_destructor (rtx
, int) ATTRIBUTE_UNUSED
;
194 static void aarch64_override_options_after_change (void);
195 static bool aarch64_vector_mode_supported_p (machine_mode
);
196 static int aarch64_address_cost (rtx
, machine_mode
, addr_space_t
, bool);
197 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode
,
201 static machine_mode
aarch64_simd_container_mode (scalar_mode
, poly_int64
);
202 static bool aarch64_print_address_internal (FILE*, machine_mode
, rtx
,
203 aarch64_addr_query_type
);
204 static HOST_WIDE_INT
aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val
);
206 /* Major revision number of the ARM Architecture implemented by the target. */
207 unsigned aarch64_architecture_version
;
209 /* The processor for which instructions should be scheduled. */
210 enum aarch64_processor aarch64_tune
= cortexa53
;
212 /* Mask to specify which instruction scheduling options should be used. */
213 uint64_t aarch64_tune_flags
= 0;
215 /* Global flag for PC relative loads. */
216 bool aarch64_pcrelative_literal_loads
;
218 /* Global flag for whether frame pointer is enabled. */
219 bool aarch64_use_frame_pointer
;
221 #define BRANCH_PROTECT_STR_MAX 255
222 char *accepted_branch_protection_string
= NULL
;
224 static enum aarch64_parse_opt_result
225 aarch64_parse_branch_protection (const char*, char**);
227 /* Support for command line parsing of boolean flags in the tuning
229 struct aarch64_flag_desc
235 #define AARCH64_FUSION_PAIR(name, internal_name) \
236 { name, AARCH64_FUSE_##internal_name },
237 static const struct aarch64_flag_desc aarch64_fusible_pairs
[] =
239 { "none", AARCH64_FUSE_NOTHING
},
240 #include "aarch64-fusion-pairs.def"
241 { "all", AARCH64_FUSE_ALL
},
242 { NULL
, AARCH64_FUSE_NOTHING
}
245 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
246 { name, AARCH64_EXTRA_TUNE_##internal_name },
247 static const struct aarch64_flag_desc aarch64_tuning_flags
[] =
249 { "none", AARCH64_EXTRA_TUNE_NONE
},
250 #include "aarch64-tuning-flags.def"
251 { "all", AARCH64_EXTRA_TUNE_ALL
},
252 { NULL
, AARCH64_EXTRA_TUNE_NONE
}
255 /* Tuning parameters. */
257 static const struct cpu_addrcost_table generic_addrcost_table
=
267 0, /* register_offset */
268 0, /* register_sextend */
269 0, /* register_zextend */
273 static const struct cpu_addrcost_table exynosm1_addrcost_table
=
283 1, /* register_offset */
284 1, /* register_sextend */
285 2, /* register_zextend */
289 static const struct cpu_addrcost_table xgene1_addrcost_table
=
299 0, /* register_offset */
300 1, /* register_sextend */
301 1, /* register_zextend */
305 static const struct cpu_addrcost_table thunderx2t99_addrcost_table
=
315 2, /* register_offset */
316 3, /* register_sextend */
317 3, /* register_zextend */
321 static const struct cpu_addrcost_table tsv110_addrcost_table
=
331 0, /* register_offset */
332 1, /* register_sextend */
333 1, /* register_zextend */
337 static const struct cpu_addrcost_table qdf24xx_addrcost_table
=
347 3, /* register_offset */
348 3, /* register_sextend */
349 3, /* register_zextend */
353 static const struct cpu_regmove_cost generic_regmove_cost
=
356 /* Avoid the use of slow int<->fp moves for spilling by setting
357 their cost higher than memmov_cost. */
363 static const struct cpu_regmove_cost cortexa57_regmove_cost
=
366 /* Avoid the use of slow int<->fp moves for spilling by setting
367 their cost higher than memmov_cost. */
373 static const struct cpu_regmove_cost cortexa53_regmove_cost
=
376 /* Avoid the use of slow int<->fp moves for spilling by setting
377 their cost higher than memmov_cost. */
383 static const struct cpu_regmove_cost exynosm1_regmove_cost
=
386 /* Avoid the use of slow int<->fp moves for spilling by setting
387 their cost higher than memmov_cost (actual, 4 and 9). */
393 static const struct cpu_regmove_cost thunderx_regmove_cost
=
401 static const struct cpu_regmove_cost xgene1_regmove_cost
=
404 /* Avoid the use of slow int<->fp moves for spilling by setting
405 their cost higher than memmov_cost. */
411 static const struct cpu_regmove_cost qdf24xx_regmove_cost
=
414 /* Avoid the use of int<->fp moves for spilling. */
420 static const struct cpu_regmove_cost thunderx2t99_regmove_cost
=
423 /* Avoid the use of int<->fp moves for spilling. */
429 static const struct cpu_regmove_cost tsv110_regmove_cost
=
432 /* Avoid the use of slow int<->fp moves for spilling by setting
433 their cost higher than memmov_cost. */
439 /* Generic costs for vector insn classes. */
440 static const struct cpu_vector_cost generic_vector_cost
=
442 1, /* scalar_int_stmt_cost */
443 1, /* scalar_fp_stmt_cost */
444 1, /* scalar_load_cost */
445 1, /* scalar_store_cost */
446 1, /* vec_int_stmt_cost */
447 1, /* vec_fp_stmt_cost */
448 2, /* vec_permute_cost */
449 1, /* vec_to_scalar_cost */
450 1, /* scalar_to_vec_cost */
451 1, /* vec_align_load_cost */
452 1, /* vec_unalign_load_cost */
453 1, /* vec_unalign_store_cost */
454 1, /* vec_store_cost */
455 3, /* cond_taken_branch_cost */
456 1 /* cond_not_taken_branch_cost */
459 /* QDF24XX costs for vector insn classes. */
460 static const struct cpu_vector_cost qdf24xx_vector_cost
=
462 1, /* scalar_int_stmt_cost */
463 1, /* scalar_fp_stmt_cost */
464 1, /* scalar_load_cost */
465 1, /* scalar_store_cost */
466 1, /* vec_int_stmt_cost */
467 3, /* vec_fp_stmt_cost */
468 2, /* vec_permute_cost */
469 1, /* vec_to_scalar_cost */
470 1, /* scalar_to_vec_cost */
471 1, /* vec_align_load_cost */
472 1, /* vec_unalign_load_cost */
473 1, /* vec_unalign_store_cost */
474 1, /* vec_store_cost */
475 3, /* cond_taken_branch_cost */
476 1 /* cond_not_taken_branch_cost */
479 /* ThunderX costs for vector insn classes. */
480 static const struct cpu_vector_cost thunderx_vector_cost
=
482 1, /* scalar_int_stmt_cost */
483 1, /* scalar_fp_stmt_cost */
484 3, /* scalar_load_cost */
485 1, /* scalar_store_cost */
486 4, /* vec_int_stmt_cost */
487 1, /* vec_fp_stmt_cost */
488 4, /* vec_permute_cost */
489 2, /* vec_to_scalar_cost */
490 2, /* scalar_to_vec_cost */
491 3, /* vec_align_load_cost */
492 5, /* vec_unalign_load_cost */
493 5, /* vec_unalign_store_cost */
494 1, /* vec_store_cost */
495 3, /* cond_taken_branch_cost */
496 3 /* cond_not_taken_branch_cost */
499 static const struct cpu_vector_cost tsv110_vector_cost
=
501 1, /* scalar_int_stmt_cost */
502 1, /* scalar_fp_stmt_cost */
503 5, /* scalar_load_cost */
504 1, /* scalar_store_cost */
505 2, /* vec_int_stmt_cost */
506 2, /* vec_fp_stmt_cost */
507 2, /* vec_permute_cost */
508 3, /* vec_to_scalar_cost */
509 2, /* scalar_to_vec_cost */
510 5, /* vec_align_load_cost */
511 5, /* vec_unalign_load_cost */
512 1, /* vec_unalign_store_cost */
513 1, /* vec_store_cost */
514 1, /* cond_taken_branch_cost */
515 1 /* cond_not_taken_branch_cost */
518 /* Generic costs for vector insn classes. */
519 static const struct cpu_vector_cost cortexa57_vector_cost
=
521 1, /* scalar_int_stmt_cost */
522 1, /* scalar_fp_stmt_cost */
523 4, /* scalar_load_cost */
524 1, /* scalar_store_cost */
525 2, /* vec_int_stmt_cost */
526 2, /* vec_fp_stmt_cost */
527 3, /* vec_permute_cost */
528 8, /* vec_to_scalar_cost */
529 8, /* scalar_to_vec_cost */
530 4, /* vec_align_load_cost */
531 4, /* vec_unalign_load_cost */
532 1, /* vec_unalign_store_cost */
533 1, /* vec_store_cost */
534 1, /* cond_taken_branch_cost */
535 1 /* cond_not_taken_branch_cost */
538 static const struct cpu_vector_cost exynosm1_vector_cost
=
540 1, /* scalar_int_stmt_cost */
541 1, /* scalar_fp_stmt_cost */
542 5, /* scalar_load_cost */
543 1, /* scalar_store_cost */
544 3, /* vec_int_stmt_cost */
545 3, /* vec_fp_stmt_cost */
546 3, /* vec_permute_cost */
547 3, /* vec_to_scalar_cost */
548 3, /* scalar_to_vec_cost */
549 5, /* vec_align_load_cost */
550 5, /* vec_unalign_load_cost */
551 1, /* vec_unalign_store_cost */
552 1, /* vec_store_cost */
553 1, /* cond_taken_branch_cost */
554 1 /* cond_not_taken_branch_cost */
557 /* Generic costs for vector insn classes. */
558 static const struct cpu_vector_cost xgene1_vector_cost
=
560 1, /* scalar_int_stmt_cost */
561 1, /* scalar_fp_stmt_cost */
562 5, /* scalar_load_cost */
563 1, /* scalar_store_cost */
564 2, /* vec_int_stmt_cost */
565 2, /* vec_fp_stmt_cost */
566 2, /* vec_permute_cost */
567 4, /* vec_to_scalar_cost */
568 4, /* scalar_to_vec_cost */
569 10, /* vec_align_load_cost */
570 10, /* vec_unalign_load_cost */
571 2, /* vec_unalign_store_cost */
572 2, /* vec_store_cost */
573 2, /* cond_taken_branch_cost */
574 1 /* cond_not_taken_branch_cost */
577 /* Costs for vector insn classes for Vulcan. */
578 static const struct cpu_vector_cost thunderx2t99_vector_cost
=
580 1, /* scalar_int_stmt_cost */
581 6, /* scalar_fp_stmt_cost */
582 4, /* scalar_load_cost */
583 1, /* scalar_store_cost */
584 5, /* vec_int_stmt_cost */
585 6, /* vec_fp_stmt_cost */
586 3, /* vec_permute_cost */
587 6, /* vec_to_scalar_cost */
588 5, /* scalar_to_vec_cost */
589 8, /* vec_align_load_cost */
590 8, /* vec_unalign_load_cost */
591 4, /* vec_unalign_store_cost */
592 4, /* vec_store_cost */
593 2, /* cond_taken_branch_cost */
594 1 /* cond_not_taken_branch_cost */
597 /* Generic costs for branch instructions. */
598 static const struct cpu_branch_cost generic_branch_cost
=
600 1, /* Predictable. */
601 3 /* Unpredictable. */
604 /* Generic approximation modes. */
605 static const cpu_approx_modes generic_approx_modes
=
607 AARCH64_APPROX_NONE
, /* division */
608 AARCH64_APPROX_NONE
, /* sqrt */
609 AARCH64_APPROX_NONE
/* recip_sqrt */
612 /* Approximation modes for Exynos M1. */
613 static const cpu_approx_modes exynosm1_approx_modes
=
615 AARCH64_APPROX_NONE
, /* division */
616 AARCH64_APPROX_ALL
, /* sqrt */
617 AARCH64_APPROX_ALL
/* recip_sqrt */
620 /* Approximation modes for X-Gene 1. */
621 static const cpu_approx_modes xgene1_approx_modes
=
623 AARCH64_APPROX_NONE
, /* division */
624 AARCH64_APPROX_NONE
, /* sqrt */
625 AARCH64_APPROX_ALL
/* recip_sqrt */
628 /* Generic prefetch settings (which disable prefetch). */
629 static const cpu_prefetch_tune generic_prefetch_tune
=
632 -1, /* l1_cache_size */
633 -1, /* l1_cache_line_size */
634 -1, /* l2_cache_size */
635 true, /* prefetch_dynamic_strides */
636 -1, /* minimum_stride */
637 -1 /* default_opt_level */
640 static const cpu_prefetch_tune exynosm1_prefetch_tune
=
643 -1, /* l1_cache_size */
644 64, /* l1_cache_line_size */
645 -1, /* l2_cache_size */
646 true, /* prefetch_dynamic_strides */
647 -1, /* minimum_stride */
648 -1 /* default_opt_level */
651 static const cpu_prefetch_tune qdf24xx_prefetch_tune
=
654 32, /* l1_cache_size */
655 64, /* l1_cache_line_size */
656 512, /* l2_cache_size */
657 false, /* prefetch_dynamic_strides */
658 2048, /* minimum_stride */
659 3 /* default_opt_level */
662 static const cpu_prefetch_tune thunderxt88_prefetch_tune
=
665 32, /* l1_cache_size */
666 128, /* l1_cache_line_size */
667 16*1024, /* l2_cache_size */
668 true, /* prefetch_dynamic_strides */
669 -1, /* minimum_stride */
670 3 /* default_opt_level */
673 static const cpu_prefetch_tune thunderx_prefetch_tune
=
676 32, /* l1_cache_size */
677 128, /* l1_cache_line_size */
678 -1, /* l2_cache_size */
679 true, /* prefetch_dynamic_strides */
680 -1, /* minimum_stride */
681 -1 /* default_opt_level */
684 static const cpu_prefetch_tune thunderx2t99_prefetch_tune
=
687 32, /* l1_cache_size */
688 64, /* l1_cache_line_size */
689 256, /* l2_cache_size */
690 true, /* prefetch_dynamic_strides */
691 -1, /* minimum_stride */
692 -1 /* default_opt_level */
695 static const cpu_prefetch_tune tsv110_prefetch_tune
=
698 64, /* l1_cache_size */
699 64, /* l1_cache_line_size */
700 512, /* l2_cache_size */
701 true, /* prefetch_dynamic_strides */
702 -1, /* minimum_stride */
703 -1 /* default_opt_level */
706 static const cpu_prefetch_tune xgene1_prefetch_tune
=
709 32, /* l1_cache_size */
710 64, /* l1_cache_line_size */
711 256, /* l2_cache_size */
712 true, /* prefetch_dynamic_strides */
713 -1, /* minimum_stride */
714 -1 /* default_opt_level */
717 static const struct tune_params generic_tunings
=
719 &cortexa57_extra_costs
,
720 &generic_addrcost_table
,
721 &generic_regmove_cost
,
722 &generic_vector_cost
,
723 &generic_branch_cost
,
724 &generic_approx_modes
,
725 SVE_NOT_IMPLEMENTED
, /* sve_width */
728 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
729 "16:12", /* function_align. */
730 "4", /* jump_align. */
731 "8", /* loop_align. */
732 2, /* int_reassoc_width. */
733 4, /* fp_reassoc_width. */
734 1, /* vec_reassoc_width. */
735 2, /* min_div_recip_mul_sf. */
736 2, /* min_div_recip_mul_df. */
737 0, /* max_case_values. */
738 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
739 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
740 &generic_prefetch_tune
743 static const struct tune_params cortexa35_tunings
=
745 &cortexa53_extra_costs
,
746 &generic_addrcost_table
,
747 &cortexa53_regmove_cost
,
748 &generic_vector_cost
,
749 &generic_branch_cost
,
750 &generic_approx_modes
,
751 SVE_NOT_IMPLEMENTED
, /* sve_width */
754 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
755 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
756 "16", /* function_align. */
757 "4", /* jump_align. */
758 "8", /* loop_align. */
759 2, /* int_reassoc_width. */
760 4, /* fp_reassoc_width. */
761 1, /* vec_reassoc_width. */
762 2, /* min_div_recip_mul_sf. */
763 2, /* min_div_recip_mul_df. */
764 0, /* max_case_values. */
765 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
766 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
767 &generic_prefetch_tune
770 static const struct tune_params cortexa53_tunings
=
772 &cortexa53_extra_costs
,
773 &generic_addrcost_table
,
774 &cortexa53_regmove_cost
,
775 &generic_vector_cost
,
776 &generic_branch_cost
,
777 &generic_approx_modes
,
778 SVE_NOT_IMPLEMENTED
, /* sve_width */
781 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
782 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
783 "16", /* function_align. */
784 "4", /* jump_align. */
785 "8", /* loop_align. */
786 2, /* int_reassoc_width. */
787 4, /* fp_reassoc_width. */
788 1, /* vec_reassoc_width. */
789 2, /* min_div_recip_mul_sf. */
790 2, /* min_div_recip_mul_df. */
791 0, /* max_case_values. */
792 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
793 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
794 &generic_prefetch_tune
797 static const struct tune_params cortexa57_tunings
=
799 &cortexa57_extra_costs
,
800 &generic_addrcost_table
,
801 &cortexa57_regmove_cost
,
802 &cortexa57_vector_cost
,
803 &generic_branch_cost
,
804 &generic_approx_modes
,
805 SVE_NOT_IMPLEMENTED
, /* sve_width */
808 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
809 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
810 "16", /* function_align. */
811 "4", /* jump_align. */
812 "8", /* loop_align. */
813 2, /* int_reassoc_width. */
814 4, /* fp_reassoc_width. */
815 1, /* vec_reassoc_width. */
816 2, /* min_div_recip_mul_sf. */
817 2, /* min_div_recip_mul_df. */
818 0, /* max_case_values. */
819 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
820 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
), /* tune_flags. */
821 &generic_prefetch_tune
824 static const struct tune_params cortexa72_tunings
=
826 &cortexa57_extra_costs
,
827 &generic_addrcost_table
,
828 &cortexa57_regmove_cost
,
829 &cortexa57_vector_cost
,
830 &generic_branch_cost
,
831 &generic_approx_modes
,
832 SVE_NOT_IMPLEMENTED
, /* sve_width */
835 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
836 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
837 "16", /* function_align. */
838 "4", /* jump_align. */
839 "8", /* loop_align. */
840 2, /* int_reassoc_width. */
841 4, /* fp_reassoc_width. */
842 1, /* vec_reassoc_width. */
843 2, /* min_div_recip_mul_sf. */
844 2, /* min_div_recip_mul_df. */
845 0, /* max_case_values. */
846 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
847 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
848 &generic_prefetch_tune
851 static const struct tune_params cortexa73_tunings
=
853 &cortexa57_extra_costs
,
854 &generic_addrcost_table
,
855 &cortexa57_regmove_cost
,
856 &cortexa57_vector_cost
,
857 &generic_branch_cost
,
858 &generic_approx_modes
,
859 SVE_NOT_IMPLEMENTED
, /* sve_width */
860 4, /* memmov_cost. */
862 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
863 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
864 "16", /* function_align. */
865 "4", /* jump_align. */
866 "8", /* loop_align. */
867 2, /* int_reassoc_width. */
868 4, /* fp_reassoc_width. */
869 1, /* vec_reassoc_width. */
870 2, /* min_div_recip_mul_sf. */
871 2, /* min_div_recip_mul_df. */
872 0, /* max_case_values. */
873 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
874 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
875 &generic_prefetch_tune
880 static const struct tune_params exynosm1_tunings
=
882 &exynosm1_extra_costs
,
883 &exynosm1_addrcost_table
,
884 &exynosm1_regmove_cost
,
885 &exynosm1_vector_cost
,
886 &generic_branch_cost
,
887 &exynosm1_approx_modes
,
888 SVE_NOT_IMPLEMENTED
, /* sve_width */
891 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
892 "4", /* function_align. */
893 "4", /* jump_align. */
894 "4", /* loop_align. */
895 2, /* int_reassoc_width. */
896 4, /* fp_reassoc_width. */
897 1, /* vec_reassoc_width. */
898 2, /* min_div_recip_mul_sf. */
899 2, /* min_div_recip_mul_df. */
900 48, /* max_case_values. */
901 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
902 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
903 &exynosm1_prefetch_tune
906 static const struct tune_params thunderxt88_tunings
=
908 &thunderx_extra_costs
,
909 &generic_addrcost_table
,
910 &thunderx_regmove_cost
,
911 &thunderx_vector_cost
,
912 &generic_branch_cost
,
913 &generic_approx_modes
,
914 SVE_NOT_IMPLEMENTED
, /* sve_width */
917 AARCH64_FUSE_CMP_BRANCH
, /* fusible_ops */
918 "8", /* function_align. */
919 "8", /* jump_align. */
920 "8", /* loop_align. */
921 2, /* int_reassoc_width. */
922 4, /* fp_reassoc_width. */
923 1, /* vec_reassoc_width. */
924 2, /* min_div_recip_mul_sf. */
925 2, /* min_div_recip_mul_df. */
926 0, /* max_case_values. */
927 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
928 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
), /* tune_flags. */
929 &thunderxt88_prefetch_tune
932 static const struct tune_params thunderx_tunings
=
934 &thunderx_extra_costs
,
935 &generic_addrcost_table
,
936 &thunderx_regmove_cost
,
937 &thunderx_vector_cost
,
938 &generic_branch_cost
,
939 &generic_approx_modes
,
940 SVE_NOT_IMPLEMENTED
, /* sve_width */
943 AARCH64_FUSE_CMP_BRANCH
, /* fusible_ops */
944 "8", /* function_align. */
945 "8", /* jump_align. */
946 "8", /* loop_align. */
947 2, /* int_reassoc_width. */
948 4, /* fp_reassoc_width. */
949 1, /* vec_reassoc_width. */
950 2, /* min_div_recip_mul_sf. */
951 2, /* min_div_recip_mul_df. */
952 0, /* max_case_values. */
953 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
954 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
955 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
), /* tune_flags. */
956 &thunderx_prefetch_tune
959 static const struct tune_params tsv110_tunings
=
962 &tsv110_addrcost_table
,
963 &tsv110_regmove_cost
,
965 &generic_branch_cost
,
966 &generic_approx_modes
,
967 SVE_NOT_IMPLEMENTED
, /* sve_width */
970 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_CMP_BRANCH
971 | AARCH64_FUSE_ALU_BRANCH
), /* fusible_ops */
972 "16", /* function_align. */
973 "4", /* jump_align. */
974 "8", /* loop_align. */
975 2, /* int_reassoc_width. */
976 4, /* fp_reassoc_width. */
977 1, /* vec_reassoc_width. */
978 2, /* min_div_recip_mul_sf. */
979 2, /* min_div_recip_mul_df. */
980 0, /* max_case_values. */
981 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
982 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
983 &tsv110_prefetch_tune
986 static const struct tune_params xgene1_tunings
=
989 &xgene1_addrcost_table
,
990 &xgene1_regmove_cost
,
992 &generic_branch_cost
,
993 &xgene1_approx_modes
,
994 SVE_NOT_IMPLEMENTED
, /* sve_width */
997 AARCH64_FUSE_NOTHING
, /* fusible_ops */
998 "16", /* function_align. */
999 "16", /* jump_align. */
1000 "16", /* loop_align. */
1001 2, /* int_reassoc_width. */
1002 4, /* fp_reassoc_width. */
1003 1, /* vec_reassoc_width. */
1004 2, /* min_div_recip_mul_sf. */
1005 2, /* min_div_recip_mul_df. */
1006 17, /* max_case_values. */
1007 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
1008 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
), /* tune_flags. */
1009 &xgene1_prefetch_tune
1012 static const struct tune_params emag_tunings
=
1014 &xgene1_extra_costs
,
1015 &xgene1_addrcost_table
,
1016 &xgene1_regmove_cost
,
1017 &xgene1_vector_cost
,
1018 &generic_branch_cost
,
1019 &xgene1_approx_modes
,
1020 SVE_NOT_IMPLEMENTED
,
1021 6, /* memmov_cost */
1023 AARCH64_FUSE_NOTHING
, /* fusible_ops */
1024 "16", /* function_align. */
1025 "16", /* jump_align. */
1026 "16", /* loop_align. */
1027 2, /* int_reassoc_width. */
1028 4, /* fp_reassoc_width. */
1029 1, /* vec_reassoc_width. */
1030 2, /* min_div_recip_mul_sf. */
1031 2, /* min_div_recip_mul_df. */
1032 17, /* max_case_values. */
1033 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
1034 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
), /* tune_flags. */
1035 &xgene1_prefetch_tune
1038 static const struct tune_params qdf24xx_tunings
=
1040 &qdf24xx_extra_costs
,
1041 &qdf24xx_addrcost_table
,
1042 &qdf24xx_regmove_cost
,
1043 &qdf24xx_vector_cost
,
1044 &generic_branch_cost
,
1045 &generic_approx_modes
,
1046 SVE_NOT_IMPLEMENTED
, /* sve_width */
1047 4, /* memmov_cost */
1049 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
1050 | AARCH64_FUSE_MOVK_MOVK
), /* fuseable_ops */
1051 "16", /* function_align. */
1052 "8", /* jump_align. */
1053 "16", /* loop_align. */
1054 2, /* int_reassoc_width. */
1055 4, /* fp_reassoc_width. */
1056 1, /* vec_reassoc_width. */
1057 2, /* min_div_recip_mul_sf. */
1058 2, /* min_div_recip_mul_df. */
1059 0, /* max_case_values. */
1060 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1061 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS
, /* tune_flags. */
1062 &qdf24xx_prefetch_tune
1065 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1067 static const struct tune_params saphira_tunings
=
1069 &generic_extra_costs
,
1070 &generic_addrcost_table
,
1071 &generic_regmove_cost
,
1072 &generic_vector_cost
,
1073 &generic_branch_cost
,
1074 &generic_approx_modes
,
1075 SVE_NOT_IMPLEMENTED
, /* sve_width */
1076 4, /* memmov_cost */
1078 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
1079 | AARCH64_FUSE_MOVK_MOVK
), /* fuseable_ops */
1080 "16", /* function_align. */
1081 "8", /* jump_align. */
1082 "16", /* loop_align. */
1083 2, /* int_reassoc_width. */
1084 4, /* fp_reassoc_width. */
1085 1, /* vec_reassoc_width. */
1086 2, /* min_div_recip_mul_sf. */
1087 2, /* min_div_recip_mul_df. */
1088 0, /* max_case_values. */
1089 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1090 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1091 &generic_prefetch_tune
1094 static const struct tune_params thunderx2t99_tunings
=
1096 &thunderx2t99_extra_costs
,
1097 &thunderx2t99_addrcost_table
,
1098 &thunderx2t99_regmove_cost
,
1099 &thunderx2t99_vector_cost
,
1100 &generic_branch_cost
,
1101 &generic_approx_modes
,
1102 SVE_NOT_IMPLEMENTED
, /* sve_width */
1103 4, /* memmov_cost. */
1104 4, /* issue_rate. */
1105 (AARCH64_FUSE_CMP_BRANCH
| AARCH64_FUSE_AES_AESMC
1106 | AARCH64_FUSE_ALU_BRANCH
), /* fusible_ops */
1107 "16", /* function_align. */
1108 "8", /* jump_align. */
1109 "16", /* loop_align. */
1110 3, /* int_reassoc_width. */
1111 2, /* fp_reassoc_width. */
1112 2, /* vec_reassoc_width. */
1113 2, /* min_div_recip_mul_sf. */
1114 2, /* min_div_recip_mul_df. */
1115 0, /* max_case_values. */
1116 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1117 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1118 &thunderx2t99_prefetch_tune
1121 static const struct tune_params neoversen1_tunings
=
1123 &cortexa57_extra_costs
,
1124 &generic_addrcost_table
,
1125 &generic_regmove_cost
,
1126 &cortexa57_vector_cost
,
1127 &generic_branch_cost
,
1128 &generic_approx_modes
,
1129 SVE_NOT_IMPLEMENTED
, /* sve_width */
1130 4, /* memmov_cost */
1132 AARCH64_FUSE_AES_AESMC
, /* fusible_ops */
1133 "32:16", /* function_align. */
1134 "32:16", /* jump_align. */
1135 "32:16", /* loop_align. */
1136 2, /* int_reassoc_width. */
1137 4, /* fp_reassoc_width. */
1138 2, /* vec_reassoc_width. */
1139 2, /* min_div_recip_mul_sf. */
1140 2, /* min_div_recip_mul_df. */
1141 0, /* max_case_values. */
1142 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
1143 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
1144 &generic_prefetch_tune
1147 /* Support for fine-grained override of the tuning structures. */
1148 struct aarch64_tuning_override_function
1151 void (*parse_override
)(const char*, struct tune_params
*);
1154 static void aarch64_parse_fuse_string (const char*, struct tune_params
*);
1155 static void aarch64_parse_tune_string (const char*, struct tune_params
*);
1156 static void aarch64_parse_sve_width_string (const char*, struct tune_params
*);
1158 static const struct aarch64_tuning_override_function
1159 aarch64_tuning_override_functions
[] =
1161 { "fuse", aarch64_parse_fuse_string
},
1162 { "tune", aarch64_parse_tune_string
},
1163 { "sve_width", aarch64_parse_sve_width_string
},
1167 /* A processor implementing AArch64. */
1170 const char *const name
;
1171 enum aarch64_processor ident
;
1172 enum aarch64_processor sched_core
;
1173 enum aarch64_arch arch
;
1174 unsigned architecture_version
;
1175 const uint64_t flags
;
1176 const struct tune_params
*const tune
;
1179 /* Architectures implementing AArch64. */
1180 static const struct processor all_architectures
[] =
1182 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1183 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1184 #include "aarch64-arches.def"
1185 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
1188 /* Processor cores implementing AArch64. */
1189 static const struct processor all_cores
[] =
1191 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1192 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1193 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1194 FLAGS, &COSTS##_tunings},
1195 #include "aarch64-cores.def"
1196 {"generic", generic
, cortexa53
, AARCH64_ARCH_8A
, 8,
1197 AARCH64_FL_FOR_ARCH8
, &generic_tunings
},
1198 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
1202 /* Target specification. These are populated by the -march, -mtune, -mcpu
1203 handling code or by target attributes. */
1204 static const struct processor
*selected_arch
;
1205 static const struct processor
*selected_cpu
;
1206 static const struct processor
*selected_tune
;
1208 enum aarch64_key_type aarch64_ra_sign_key
= AARCH64_KEY_A
;
1210 /* The current tuning set. */
1211 struct tune_params aarch64_tune_params
= generic_tunings
;
1213 /* Table of machine attributes. */
1214 static const struct attribute_spec aarch64_attribute_table
[] =
1216 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1217 affects_type_identity, handler, exclude } */
1218 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL
, NULL
},
1219 { NULL
, 0, 0, false, false, false, false, NULL
, NULL
}
1222 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1224 /* An ISA extension in the co-processor and main instruction set space. */
1225 struct aarch64_option_extension
1227 const char *const name
;
1228 const unsigned long flags_on
;
1229 const unsigned long flags_off
;
1232 typedef enum aarch64_cond_code
1234 AARCH64_EQ
= 0, AARCH64_NE
, AARCH64_CS
, AARCH64_CC
, AARCH64_MI
, AARCH64_PL
,
1235 AARCH64_VS
, AARCH64_VC
, AARCH64_HI
, AARCH64_LS
, AARCH64_GE
, AARCH64_LT
,
1236 AARCH64_GT
, AARCH64_LE
, AARCH64_AL
, AARCH64_NV
1240 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
/* Descriptor for one -mbranch-protection= option token ("none",
   "standard", "pac-ret", "bti", ...) and its allowed subtypes.  */
1242 struct aarch64_branch_protect_type
1244 /* The type's name that the user passes to the branch-protection option
1247 /* Function to handle the protection type and set global variables.
1248 First argument is the string token corresponding with this type and the
1249 second argument is the next token in the option string.
1251 * AARCH64_PARSE_OK: Handling was successful.
1252 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1253 should print an error.
1254 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1256 enum aarch64_parse_opt_result (*handler
)(char*, char*);
1257 /* A list of types that can follow this type in the option string. */
1258 const aarch64_branch_protect_type
* subtypes
;
/* Number of entries in SUBTYPES.  */
1259 unsigned int num_subtypes
;
1262 static enum aarch64_parse_opt_result
/* Handler for -mbranch-protection=none: turn off return-address signing
   and BTI.  "none" accepts no subtype, so any following token REST is
   diagnosed as an error.  */
1263 aarch64_handle_no_branch_protection (char* str
, char* rest
)
1265 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NONE
;
1266 aarch64_enable_bti
= 0;
/* NOTE(review): the guarding "if (rest)" line is missing from this
   extract; the error below fires only when a stray token follows.  */
1269 error ("unexpected %<%s%> after %<%s%>", rest
, str
);
1270 return AARCH64_PARSE_INVALID_FEATURE
;
1272 return AARCH64_PARSE_OK
;
1275 static enum aarch64_parse_opt_result
/* Handler for -mbranch-protection=standard: enable return-address
   signing (A key, non-leaf functions) plus BTI.  "standard" accepts no
   subtype, so a following token REST is an error.  */
1276 aarch64_handle_standard_branch_protection (char* str
, char* rest
)
1278 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NON_LEAF
;
1279 aarch64_ra_sign_key
= AARCH64_KEY_A
;
1280 aarch64_enable_bti
= 1;
/* NOTE(review): the guarding "if (rest)" line is missing from this
   extract; the error below fires only when a stray token follows.  */
1283 error ("unexpected %<%s%> after %<%s%>", rest
, str
);
1284 return AARCH64_PARSE_INVALID_FEATURE
;
1286 return AARCH64_PARSE_OK
;
1289 static enum aarch64_parse_opt_result
1290 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED
,
1291 char* rest ATTRIBUTE_UNUSED
)
1293 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NON_LEAF
;
1294 aarch64_ra_sign_key
= AARCH64_KEY_A
;
1295 return AARCH64_PARSE_OK
;
1298 static enum aarch64_parse_opt_result
1299 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED
,
1300 char* rest ATTRIBUTE_UNUSED
)
1302 aarch64_ra_sign_scope
= AARCH64_FUNCTION_ALL
;
1303 return AARCH64_PARSE_OK
;
1306 static enum aarch64_parse_opt_result
1307 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED
,
1308 char* rest ATTRIBUTE_UNUSED
)
1310 aarch64_ra_sign_key
= AARCH64_KEY_B
;
1311 return AARCH64_PARSE_OK
;
1314 static enum aarch64_parse_opt_result
1315 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED
,
1316 char* rest ATTRIBUTE_UNUSED
)
1318 aarch64_enable_bti
= 1;
1319 return AARCH64_PARSE_OK
;
/* Subtypes accepted after "pac-ret" in -mbranch-protection=; each has
   no further subtypes of its own.  NULL-terminated.  */
1322 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes
[] = {
1323 { "leaf", aarch64_handle_pac_ret_leaf
, NULL
, 0 },
1324 { "b-key", aarch64_handle_pac_ret_b_key
, NULL
, 0 },
1325 { NULL
, NULL
, NULL
, 0 }
/* Top-level -mbranch-protection= option types; only "pac-ret" carries
   subtypes.  NULL-terminated.  */
1328 static const struct aarch64_branch_protect_type aarch64_branch_protect_types
[] = {
1329 { "none", aarch64_handle_no_branch_protection
, NULL
, 0 },
1330 { "standard", aarch64_handle_standard_branch_protection
, NULL
, 0 },
1331 { "pac-ret", aarch64_handle_pac_ret_protection
, aarch64_pac_ret_subtypes
,
1332 ARRAY_SIZE (aarch64_pac_ret_subtypes
) },
1333 { "bti", aarch64_handle_bti_protection
, NULL
, 0 },
1334 { NULL
, NULL
, NULL
, 0 }
1337 /* The condition codes of the processor, and the inverse function. */
/* Assembly mnemonics indexed by enum aarch64_cond_code.  */
1338 static const char * const aarch64_condition_codes
[] =
1340 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1341 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1344 /* The preferred condition codes for SVE conditions. */
/* SVE condition aliases, indexed the same way as the table above.  */
1345 static const char *const aarch64_sve_condition_codes
[] =
1347 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1348 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1351 /* Return the assembly token for svpattern value VALUE. */
1354 svpattern_token (enum aarch64_svpattern pattern
)
1358 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1359 AARCH64_FOR_SVPATTERN (CASE
)
1361 case AARCH64_NUM_SVPATTERNS
:
1367 /* Generate code to enable conditional branches in functions over 1 MiB. */
1369 aarch64_gen_far_branch (rtx
* operands
, int pos_label
, const char * dest
,
1370 const char * branch_format
)
1372 rtx_code_label
* tmp_label
= gen_label_rtx ();
1373 char label_buf
[256];
1375 ASM_GENERATE_INTERNAL_LABEL (label_buf
, dest
,
1376 CODE_LABEL_NUMBER (tmp_label
));
1377 const char *label_ptr
= targetm
.strip_name_encoding (label_buf
);
1378 rtx dest_label
= operands
[pos_label
];
1379 operands
[pos_label
] = tmp_label
;
1381 snprintf (buffer
, sizeof (buffer
), "%s%s", branch_format
, label_ptr
);
1382 output_asm_insn (buffer
, operands
);
1384 snprintf (buffer
, sizeof (buffer
), "b\t%%l%d\n%s:", pos_label
, label_ptr
);
1385 operands
[pos_label
] = dest_label
;
1386 output_asm_insn (buffer
, operands
);
/* Report that MODE needs the FP/Advanced SIMD register file, which the
   current options disable: -mgeneral-regs-only or +nofp.  The message
   distinguishes floating-point from vector modes.  */
1391 aarch64_err_no_fpadvsimd (machine_mode mode
)
1393 if (TARGET_GENERAL_REGS_ONLY
)
1394 if (FLOAT_MODE_P (mode
))
1395 error ("%qs is incompatible with the use of floating-point types",
1396 "-mgeneral-regs-only");
/* NOTE(review): the "else" lines pairing these branches are missing
   from this extract.  */
1398 error ("%qs is incompatible with the use of vector types",
1399 "-mgeneral-regs-only");
1401 if (FLOAT_MODE_P (mode
))
1402 error ("%qs feature modifier is incompatible with the use of"
1403 " floating-point types", "+nofp");
1405 error ("%qs feature modifier is incompatible with the use of"
1406 " vector types", "+nofp");
1409 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1410 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1411 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1412 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1413 and GENERAL_REGS is lower than the memory cost (in this case the best class
1414 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its
1415 cost results in bad allocations with many redundant int<->FP moves which
1416 are expensive on various cores.
1417 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1418 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1419 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1420 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1421 The result of this is that it is no longer inefficient to have a higher
1422 memory move cost than the register move cost.
1426 aarch64_ira_change_pseudo_allocno_class (int regno
, reg_class_t allocno_class
,
1427 reg_class_t best_class
)
1431 if (!reg_class_subset_p (GENERAL_REGS
, allocno_class
)
1432 || !reg_class_subset_p (FP_REGS
, allocno_class
))
1433 return allocno_class
;
1435 if (!reg_class_subset_p (GENERAL_REGS
, best_class
)
1436 || !reg_class_subset_p (FP_REGS
, best_class
))
1439 mode
= PSEUDO_REGNO_MODE (regno
);
1440 return FLOAT_MODE_P (mode
) || VECTOR_MODE_P (mode
) ? FP_REGS
: GENERAL_REGS
;
1444 aarch64_min_divisions_for_recip_mul (machine_mode mode
)
1446 if (GET_MODE_UNIT_SIZE (mode
) == 4)
1447 return aarch64_tune_params
.min_div_recip_mul_sf
;
1448 return aarch64_tune_params
.min_div_recip_mul_df
;
1451 /* Return the reassociation width of treeop OPC with mode MODE. */
/* Widths come from the active tuning; vector modes checked first, then
   integer, then FP (excluding addition).  */
1453 aarch64_reassociation_width (unsigned opc
, machine_mode mode
)
1455 if (VECTOR_MODE_P (mode
))
1456 return aarch64_tune_params
.vec_reassoc_width
;
1457 if (INTEGRAL_MODE_P (mode
))
1458 return aarch64_tune_params
.int_reassoc_width
;
1459 /* Avoid reassociating floating point addition so we emit more FMAs. */
1460 if (FLOAT_MODE_P (mode
) && opc
!= PLUS_EXPR
)
1461 return aarch64_tune_params
.fp_reassoc_width
;
/* NOTE(review): the fallback "return 1;" line is missing from this
   extract.  */
1465 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
/* GP regs map to DWARF R0.., FP regs to V0.., SVE predicate regs to
   P0..; SP and VG have dedicated numbers.  */
1467 aarch64_dbx_register_number (unsigned regno
)
1469 if (GP_REGNUM_P (regno
))
1470 return AARCH64_DWARF_R0
+ regno
- R0_REGNUM
;
1471 else if (regno
== SP_REGNUM
)
1472 return AARCH64_DWARF_SP
;
1473 else if (FP_REGNUM_P (regno
))
1474 return AARCH64_DWARF_V0
+ regno
- V0_REGNUM
;
1475 else if (PR_REGNUM_P (regno
))
1476 return AARCH64_DWARF_P0
+ regno
- P0_REGNUM
;
1477 else if (regno
== VG_REGNUM
)
1478 return AARCH64_DWARF_VG
;
1480 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1481 equivalent DWARF register. */
1482 return DWARF_FRAME_REGISTERS
;
1485 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1487 aarch64_advsimd_struct_mode_p (machine_mode mode
)
1490 && (mode
== OImode
|| mode
== CImode
|| mode
== XImode
));
1493 /* Return true if MODE is an SVE predicate mode. */
1495 aarch64_sve_pred_mode_p (machine_mode mode
)
1498 && (mode
== VNx16BImode
1499 || mode
== VNx8BImode
1500 || mode
== VNx4BImode
1501 || mode
== VNx2BImode
));
1504 /* Three mutually-exclusive flags describing a vector or predicate type. */
1505 const unsigned int VEC_ADVSIMD
= 1;
1506 const unsigned int VEC_SVE_DATA
= 2;
1507 const unsigned int VEC_SVE_PRED
= 4;
1508 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1509 a structure of 2, 3 or 4 vectors. */
1510 const unsigned int VEC_STRUCT
= 8;
1511 /* Useful combinations of the above. */
1512 const unsigned int VEC_ANY_SVE
= VEC_SVE_DATA
| VEC_SVE_PRED
;
1513 const unsigned int VEC_ANY_DATA
= VEC_ADVSIMD
| VEC_SVE_DATA
;
1515 /* Return a set of flags describing the vector properties of mode MODE.
1516 Ignore modes that are not supported by the current target. */
1518 aarch64_classify_vector_mode (machine_mode mode
)
1520 if (aarch64_advsimd_struct_mode_p (mode
))
1521 return VEC_ADVSIMD
| VEC_STRUCT
;
1523 if (aarch64_sve_pred_mode_p (mode
))
1524 return VEC_SVE_PRED
;
1526 /* Make the decision based on the mode's enum value rather than its
1527 properties, so that we keep the correct classification regardless
1528 of -msve-vector-bits. */
1531 /* Single SVE vectors. */
1539 return TARGET_SVE
? VEC_SVE_DATA
: 0;
1541 /* x2 SVE vectors. */
1549 /* x3 SVE vectors. */
1557 /* x4 SVE vectors. */
1565 return TARGET_SVE
? VEC_SVE_DATA
| VEC_STRUCT
: 0;
1567 /* 64-bit Advanced SIMD vectors. */
1571 /* ...E_V1DImode doesn't exist. */
1575 /* 128-bit Advanced SIMD vectors. */
1583 return TARGET_SIMD
? VEC_ADVSIMD
: 0;
1590 /* Return true if MODE is any of the data vector modes, including
/* ...structure modes (comment continuation missing from extract).
   Tests the VEC_ANY_DATA bits of the classification.  */
1593 aarch64_vector_data_mode_p (machine_mode mode
)
1595 return aarch64_classify_vector_mode (mode
) & VEC_ANY_DATA
;
1598 /* Return true if MODE is an SVE data vector mode; either a single vector
1599 or a structure of vectors. */
1601 aarch64_sve_data_mode_p (machine_mode mode
)
1603 return aarch64_classify_vector_mode (mode
) & VEC_SVE_DATA
;
1606 /* Implement target hook TARGET_ARRAY_MODE. */
1607 static opt_machine_mode
1608 aarch64_array_mode (machine_mode mode
, unsigned HOST_WIDE_INT nelems
)
1610 if (aarch64_classify_vector_mode (mode
) == VEC_SVE_DATA
1611 && IN_RANGE (nelems
, 2, 4))
1612 return mode_for_vector (GET_MODE_INNER (mode
),
1613 GET_MODE_NUNITS (mode
) * nelems
);
1615 return opt_machine_mode ();
1618 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1620 aarch64_array_mode_supported_p (machine_mode mode
,
1621 unsigned HOST_WIDE_INT nelems
)
1624 && (AARCH64_VALID_SIMD_QREG_MODE (mode
)
1625 || AARCH64_VALID_SIMD_DREG_MODE (mode
))
1626 && (nelems
>= 2 && nelems
<= 4))
1632 /* Return the SVE predicate mode to use for elements that have
1633 ELEM_NBYTES bytes, if such a mode exists. */
/* NOTE(review): the "return VNx16BImode" etc. result lines for each
   branch are missing from this extract; only the size tests remain.
   Sizes other than 1/2/4/8 yield no mode.  */
1636 aarch64_sve_pred_mode (unsigned int elem_nbytes
)
1640 if (elem_nbytes
== 1)
1642 if (elem_nbytes
== 2)
1644 if (elem_nbytes
== 4)
1646 if (elem_nbytes
== 8)
1649 return opt_machine_mode ();
1652 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1654 static opt_machine_mode
1655 aarch64_get_mask_mode (poly_uint64 nunits
, poly_uint64 nbytes
)
1657 if (TARGET_SVE
&& known_eq (nbytes
, BYTES_PER_SVE_VECTOR
))
1659 unsigned int elem_nbytes
= vector_element_size (nbytes
, nunits
);
1660 machine_mode pred_mode
;
1661 if (aarch64_sve_pred_mode (elem_nbytes
).exists (&pred_mode
))
1665 return default_get_mask_mode (nunits
, nbytes
);
1668 /* Return the integer element mode associated with SVE mode MODE. */
/* Derives the per-element bit width from the full SVE vector width and
   MODE's element count, then asks for the matching signed int mode.  */
1670 static scalar_int_mode
1671 aarch64_sve_element_int_mode (machine_mode mode
)
1673 unsigned int elt_bits
= vector_element_size (BITS_PER_SVE_VECTOR
,
1674 GET_MODE_NUNITS (mode
));
1675 return int_mode_for_size (elt_bits
, 0).require ();
1678 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1679 prefer to use the first arithmetic operand as the else value if
1680 the else value doesn't matter, since that exactly matches the SVE
1681 destructive merging form. For ternary operations we could either
1682 pick the first operand and use FMAD-like instructions or the last
1683 operand and use FMLA-like instructions; the latter seems more
1687 aarch64_preferred_else_value (unsigned, tree
, unsigned int nops
, tree
*ops
)
1689 return nops
== 3 ? ops
[2] : ops
[0];
1692 /* Implement TARGET_HARD_REGNO_NREGS. */
1695 aarch64_hard_regno_nregs (unsigned regno
, machine_mode mode
)
1697 /* ??? Logically we should only need to provide a value when
1698 HARD_REGNO_MODE_OK says that the combination is valid,
1699 but at the moment we need to handle all modes. Just ignore
1700 any runtime parts for registers that can't store them. */
1701 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
1702 switch (aarch64_regno_regclass (regno
))
1707 if (aarch64_sve_data_mode_p (mode
))
1708 return exact_div (GET_MODE_SIZE (mode
),
1709 BYTES_PER_SVE_VECTOR
).to_constant ();
1710 return CEIL (lowest_size
, UNITS_PER_VREG
);
1716 return CEIL (lowest_size
, UNITS_PER_WORD
);
1721 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1724 aarch64_hard_regno_mode_ok (unsigned regno
, machine_mode mode
)
1726 if (GET_MODE_CLASS (mode
) == MODE_CC
)
1727 return regno
== CC_REGNUM
;
1729 if (regno
== VG_REGNUM
)
1730 /* This must have the same size as _Unwind_Word. */
1731 return mode
== DImode
;
1733 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1734 if (vec_flags
& VEC_SVE_PRED
)
1735 return PR_REGNUM_P (regno
);
1737 if (PR_REGNUM_P (regno
))
1740 if (regno
== SP_REGNUM
)
1741 /* The purpose of comparing with ptr_mode is to support the
1742 global register variable associated with the stack pointer
1743 register via the syntax of asm ("wsp") in ILP32. */
1744 return mode
== Pmode
|| mode
== ptr_mode
;
1746 if (regno
== FRAME_POINTER_REGNUM
|| regno
== ARG_POINTER_REGNUM
)
1747 return mode
== Pmode
;
1749 if (GP_REGNUM_P (regno
))
1751 if (known_le (GET_MODE_SIZE (mode
), 8))
1753 else if (known_le (GET_MODE_SIZE (mode
), 16))
1754 return (regno
& 1) == 0;
1756 else if (FP_REGNUM_P (regno
))
1758 if (vec_flags
& VEC_STRUCT
)
1759 return end_hard_regno (mode
, regno
) - 1 <= V31_REGNUM
;
1761 return !VECTOR_MODE_P (mode
) || vec_flags
!= 0;
1767 /* Return true if this is a definition of a vectorized simd function. */
1770 aarch64_simd_decl_p (tree fndecl
)
1776 fntype
= TREE_TYPE (fndecl
);
1780 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1781 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype
)) != NULL
)
1787 /* Return the mode a register save/restore should use. DImode for integer
1788 registers, DFmode for FP registers in non-SIMD functions (they only save
1789 the bottom half of a 128 bit register), or TFmode for FP registers in
1793 aarch64_reg_save_mode (tree fndecl
, unsigned regno
)
1795 return GP_REGNUM_P (regno
)
1797 : (aarch64_simd_decl_p (fndecl
) ? E_TFmode
: E_DFmode
);
1800 /* Return true if the instruction is a call to a SIMD function, false
1801 if it is not a SIMD function or if we do not know anything about
1805 aarch64_simd_call_p (rtx_insn
*insn
)
1811 gcc_assert (CALL_P (insn
));
1812 call
= get_call_rtx_from (insn
);
1813 symbol
= XEXP (XEXP (call
, 0), 0);
1814 if (GET_CODE (symbol
) != SYMBOL_REF
)
1816 fndecl
= SYMBOL_REF_DECL (symbol
);
1820 return aarch64_simd_decl_p (fndecl
);
1823 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1824 a function that uses the SIMD ABI, take advantage of the extra
1825 call-preserved registers that the ABI provides. */
1828 aarch64_remove_extra_call_preserved_regs (rtx_insn
*insn
,
1829 HARD_REG_SET
*return_set
)
1831 if (aarch64_simd_call_p (insn
))
1833 for (int regno
= 0; regno
< FIRST_PSEUDO_REGISTER
; regno
++)
1834 if (FP_SIMD_SAVED_REGNUM_P (regno
))
1835 CLEAR_HARD_REG_BIT (*return_set
, regno
);
1839 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1840 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1841 clobbers the top 64 bits when restoring the bottom 64 bits. */
/* For calls using the SIMD ABI the callee preserves the full 128 bits,
   so the threshold rises from 8 to 16 bytes.  */
1844 aarch64_hard_regno_call_part_clobbered (rtx_insn
*insn
, unsigned int regno
,
1847 bool simd_p
= insn
&& CALL_P (insn
) && aarch64_simd_call_p (insn
);
1848 return FP_REGNUM_P (regno
)
1849 && maybe_gt (GET_MODE_SIZE (mode
), simd_p
? 16 : 8);
1852 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
/* Of two calls, return the one that clobbers more registers: a
   non-SIMD call clobbers more than a SIMD-ABI call.  */
1855 aarch64_return_call_with_max_clobbers (rtx_insn
*call_1
, rtx_insn
*call_2
)
1857 gcc_assert (CALL_P (call_1
) && CALL_P (call_2
));
/* NOTE(review): the "return call_1;" / "return call_2;" result lines
   following this test are missing from this extract.  */
1859 if (!aarch64_simd_call_p (call_1
) || aarch64_simd_call_p (call_2
))
1865 /* Implement REGMODE_NATURAL_SIZE. */
1867 aarch64_regmode_natural_size (machine_mode mode
)
1869 /* The natural size for SVE data modes is one SVE data vector,
1870 and similarly for predicates. We can't independently modify
1871 anything smaller than that. */
1872 /* ??? For now, only do this for variable-width SVE registers.
1873 Doing it for constant-sized registers breaks lower-subreg.c. */
1874 /* ??? And once that's fixed, we should probably have similar
1875 code for Advanced SIMD. */
1876 if (!aarch64_sve_vg
.is_constant ())
1878 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1879 if (vec_flags
& VEC_SVE_PRED
)
1880 return BYTES_PER_SVE_PRED
;
1881 if (vec_flags
& VEC_SVE_DATA
)
1882 return BYTES_PER_SVE_VECTOR
;
1884 return UNITS_PER_WORD
;
1887 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1889 aarch64_hard_regno_caller_save_mode (unsigned regno
, unsigned,
1892 /* The predicate mode determines which bits are significant and
1893 which are "don't care". Decreasing the number of lanes would
1894 lose data while increasing the number of lanes would make bits
1895 unnecessarily significant. */
1896 if (PR_REGNUM_P (regno
))
1898 if (known_ge (GET_MODE_SIZE (mode
), 4))
1904 /* Return true if I's bits are consecutive ones from the MSB. */
1906 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i
)
1908 return exact_log2 (-i
) != HOST_WIDE_INT_M1
;
1911 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1912 that strcpy from constants will be faster. */
1914 static HOST_WIDE_INT
1915 aarch64_constant_alignment (const_tree exp
, HOST_WIDE_INT align
)
/* Bump string constants to word alignment unless optimizing for size.  */
1917 if (TREE_CODE (exp
) == STRING_CST
&& !optimize_size
)
1918 return MAX (align
, BITS_PER_WORD
);
/* NOTE(review): the fallthrough "return align;" line is missing from
   this extract.  */
1922 /* Return true if calls to DECL should be treated as
1923 long-calls (ie called via a register). */
1925 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED
)
1930 /* Return true if calls to symbol-ref SYM should be treated as
1931 long-calls (ie called via a register). */
1933 aarch64_is_long_call_p (rtx sym
)
1935 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym
));
1938 /* Return true if calls to symbol-ref SYM should not go through
1942 aarch64_is_noplt_call_p (rtx sym
)
1944 const_tree decl
= SYMBOL_REF_DECL (sym
);
1949 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl
)))
1950 && !targetm
.binds_local_p (decl
))
1956 /* Return true if the offsets to a zero/sign-extract operation
1957 represent an expression that matches an extend operation. The
1958 operands represent the parameters from
1960 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1962 aarch64_is_extend_from_extract (scalar_int_mode mode
, rtx mult_imm
,
1965 HOST_WIDE_INT mult_val
, extract_val
;
1967 if (! CONST_INT_P (mult_imm
) || ! CONST_INT_P (extract_imm
))
1970 mult_val
= INTVAL (mult_imm
);
1971 extract_val
= INTVAL (extract_imm
);
1974 && extract_val
< GET_MODE_BITSIZE (mode
)
1975 && exact_log2 (extract_val
& ~7) > 0
1976 && (extract_val
& 7) <= 4
1977 && mult_val
== (1 << (extract_val
& 7)))
1983 /* Emit an insn that's a simple single-set. Both the operands must be
1984 known to be valid. */
1985 inline static rtx_insn
*
1986 emit_set_insn (rtx x
, rtx y
)
1988 return emit_insn (gen_rtx_SET (x
, y
));
1991 /* X and Y are two things to compare using CODE. Emit the compare insn and
1992 return the rtx for register 0 in the proper mode. */
1994 aarch64_gen_compare_reg (RTX_CODE code
, rtx x
, rtx y
)
1996 machine_mode mode
= SELECT_CC_MODE (code
, x
, y
);
1997 rtx cc_reg
= gen_rtx_REG (mode
, CC_REGNUM
);
1999 emit_set_insn (cc_reg
, gen_rtx_COMPARE (mode
, x
, y
));
2003 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2006 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code
, rtx x
, rtx y
,
2007 machine_mode y_mode
)
2009 if (y_mode
== E_QImode
|| y_mode
== E_HImode
)
2011 if (CONST_INT_P (y
))
2012 y
= GEN_INT (INTVAL (y
) & GET_MODE_MASK (y_mode
));
2016 machine_mode cc_mode
;
2018 t
= gen_rtx_ZERO_EXTEND (SImode
, y
);
2019 t
= gen_rtx_COMPARE (CC_SWPmode
, t
, x
);
2020 cc_mode
= CC_SWPmode
;
2021 cc_reg
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
2022 emit_set_insn (cc_reg
, t
);
2027 return aarch64_gen_compare_reg (code
, x
, y
);
2030 /* Build the SYMBOL_REF for __tls_get_addr. */
/* Cached across calls; GTY-marked so the GC roots it.  */
2032 static GTY(()) rtx tls_get_addr_libfunc
;
/* Lazily create and return the __tls_get_addr libfunc symbol.  */
2035 aarch64_tls_get_addr (void)
2037 if (!tls_get_addr_libfunc
)
2038 tls_get_addr_libfunc
= init_one_libfunc ("__tls_get_addr");
2039 return tls_get_addr_libfunc
;
2042 /* Return the TLS model to use for ADDR. */
2044 static enum tls_model
2045 tls_symbolic_operand_type (rtx addr
)
2047 enum tls_model tls_kind
= TLS_MODEL_NONE
;
2048 if (GET_CODE (addr
) == CONST
)
2051 rtx sym
= strip_offset (addr
, &addend
);
2052 if (GET_CODE (sym
) == SYMBOL_REF
)
2053 tls_kind
= SYMBOL_REF_TLS_MODEL (sym
);
2055 else if (GET_CODE (addr
) == SYMBOL_REF
)
2056 tls_kind
= SYMBOL_REF_TLS_MODEL (addr
);
2061 /* We'll allow lo_sum's in addresses in our legitimate addresses
2062 so that combine would take care of combining addresses where
2063 necessary, but for generation purposes, we'll generate the address
2066 tmp = hi (symbol_ref); adrp x1, foo
2067 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2071 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2072 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2076 Load TLS symbol, depending on TLS mechanism and TLS access model.
2078 Global Dynamic - Traditional TLS:
2079 adrp tmp, :tlsgd:imm
2080 add dest, tmp, #:tlsgd_lo12:imm
2083 Global Dynamic - TLS Descriptors:
2084 adrp dest, :tlsdesc:imm
2085 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2086 add dest, dest, #:tlsdesc_lo12:imm
2093 adrp tmp, :gottprel:imm
2094 ldr dest, [tmp, #:gottprel_lo12:imm]
2099 add t0, tp, #:tprel_hi12:imm, lsl #12
2100 add t0, t0, #:tprel_lo12_nc:imm
2104 aarch64_load_symref_appropriately (rtx dest
, rtx imm
,
2105 enum aarch64_symbol_type type
)
2109 case SYMBOL_SMALL_ABSOLUTE
:
2111 /* In ILP32, the mode of dest can be either SImode or DImode. */
2113 machine_mode mode
= GET_MODE (dest
);
2115 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
2117 if (can_create_pseudo_p ())
2118 tmp_reg
= gen_reg_rtx (mode
);
2120 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
2121 emit_insn (gen_add_losym (dest
, tmp_reg
, imm
));
2125 case SYMBOL_TINY_ABSOLUTE
:
2126 emit_insn (gen_rtx_SET (dest
, imm
));
2129 case SYMBOL_SMALL_GOT_28K
:
2131 machine_mode mode
= GET_MODE (dest
);
2132 rtx gp_rtx
= pic_offset_table_rtx
;
2136 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2137 here before rtl expand. Tree IVOPT will generate rtl pattern to
2138 decide rtx costs, in which case pic_offset_table_rtx is not
2139 initialized. For that case no need to generate the first adrp
2140 instruction as the final cost for global variable access is
2144 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
2145 using the page base as GOT base, the first page may be wasted,
2146 in the worst scenario, there is only 28K space for GOT).
2148 The generate instruction sequence for accessing global variable
2151 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2153 Only one instruction needed. But we must initialize
2154 pic_offset_table_rtx properly. We generate initialize insn for
2155 every global access, and allow CSE to remove all redundant.
2157 The final instruction sequences will look like the following
2158 for multiply global variables access.
2160 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2162 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2163 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2164 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2167 rtx s
= gen_rtx_SYMBOL_REF (Pmode
, "_GLOBAL_OFFSET_TABLE_");
2168 crtl
->uses_pic_offset_table
= 1;
2169 emit_move_insn (gp_rtx
, gen_rtx_HIGH (Pmode
, s
));
2171 if (mode
!= GET_MODE (gp_rtx
))
2172 gp_rtx
= gen_lowpart (mode
, gp_rtx
);
2176 if (mode
== ptr_mode
)
2179 insn
= gen_ldr_got_small_28k_di (dest
, gp_rtx
, imm
);
2181 insn
= gen_ldr_got_small_28k_si (dest
, gp_rtx
, imm
);
2183 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
2187 gcc_assert (mode
== Pmode
);
2189 insn
= gen_ldr_got_small_28k_sidi (dest
, gp_rtx
, imm
);
2190 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
2193 /* The operand is expected to be MEM. Whenever the related insn
2194 pattern changed, above code which calculate mem should be
2196 gcc_assert (GET_CODE (mem
) == MEM
);
2197 MEM_READONLY_P (mem
) = 1;
2198 MEM_NOTRAP_P (mem
) = 1;
2203 case SYMBOL_SMALL_GOT_4G
:
2205 /* In ILP32, the mode of dest can be either SImode or DImode,
2206 while the got entry is always of SImode size. The mode of
2207 dest depends on how dest is used: if dest is assigned to a
2208 pointer (e.g. in the memory), it has SImode; it may have
2209 DImode if dest is dereferenced to access the memory.
2210 This is why we have to handle three different ldr_got_small
2211 patterns here (two patterns for ILP32). */
2216 machine_mode mode
= GET_MODE (dest
);
2218 if (can_create_pseudo_p ())
2219 tmp_reg
= gen_reg_rtx (mode
);
2221 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
2222 if (mode
== ptr_mode
)
2225 insn
= gen_ldr_got_small_di (dest
, tmp_reg
, imm
);
2227 insn
= gen_ldr_got_small_si (dest
, tmp_reg
, imm
);
2229 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
2233 gcc_assert (mode
== Pmode
);
2235 insn
= gen_ldr_got_small_sidi (dest
, tmp_reg
, imm
);
2236 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
2239 gcc_assert (GET_CODE (mem
) == MEM
);
2240 MEM_READONLY_P (mem
) = 1;
2241 MEM_NOTRAP_P (mem
) = 1;
2246 case SYMBOL_SMALL_TLSGD
:
2249 machine_mode mode
= GET_MODE (dest
);
2250 rtx result
= gen_rtx_REG (mode
, R0_REGNUM
);
2254 aarch64_emit_call_insn (gen_tlsgd_small_si (result
, imm
));
2256 aarch64_emit_call_insn (gen_tlsgd_small_di (result
, imm
));
2257 insns
= get_insns ();
2260 RTL_CONST_CALL_P (insns
) = 1;
2261 emit_libcall_block (insns
, dest
, result
, imm
);
2265 case SYMBOL_SMALL_TLSDESC
:
2267 machine_mode mode
= GET_MODE (dest
);
2268 rtx x0
= gen_rtx_REG (mode
, R0_REGNUM
);
2271 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
2273 /* In ILP32, the got entry is always of SImode size. Unlike
2274 small GOT, the dest is fixed at reg 0. */
2276 emit_insn (gen_tlsdesc_small_si (imm
));
2278 emit_insn (gen_tlsdesc_small_di (imm
));
2279 tp
= aarch64_load_tp (NULL
);
2282 tp
= gen_lowpart (mode
, tp
);
2284 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, x0
)));
2286 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2290 case SYMBOL_SMALL_TLSIE
:
2292 /* In ILP32, the mode of dest can be either SImode or DImode,
2293 while the got entry is always of SImode size. The mode of
2294 dest depends on how dest is used: if dest is assigned to a
2295 pointer (e.g. in the memory), it has SImode; it may have
2296 DImode if dest is dereferenced to access the memory.
2297 This is why we have to handle three different tlsie_small
2298 patterns here (two patterns for ILP32). */
2299 machine_mode mode
= GET_MODE (dest
);
2300 rtx tmp_reg
= gen_reg_rtx (mode
);
2301 rtx tp
= aarch64_load_tp (NULL
);
2303 if (mode
== ptr_mode
)
2306 emit_insn (gen_tlsie_small_di (tmp_reg
, imm
));
2309 emit_insn (gen_tlsie_small_si (tmp_reg
, imm
));
2310 tp
= gen_lowpart (mode
, tp
);
2315 gcc_assert (mode
== Pmode
);
2316 emit_insn (gen_tlsie_small_sidi (tmp_reg
, imm
));
2319 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, tmp_reg
)));
2321 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2325 case SYMBOL_TLSLE12
:
2326 case SYMBOL_TLSLE24
:
2327 case SYMBOL_TLSLE32
:
2328 case SYMBOL_TLSLE48
:
2330 machine_mode mode
= GET_MODE (dest
);
2331 rtx tp
= aarch64_load_tp (NULL
);
2334 tp
= gen_lowpart (mode
, tp
);
2338 case SYMBOL_TLSLE12
:
2339 emit_insn ((mode
== DImode
? gen_tlsle12_di
: gen_tlsle12_si
)
2342 case SYMBOL_TLSLE24
:
2343 emit_insn ((mode
== DImode
? gen_tlsle24_di
: gen_tlsle24_si
)
2346 case SYMBOL_TLSLE32
:
2347 emit_insn ((mode
== DImode
? gen_tlsle32_di
: gen_tlsle32_si
)
2349 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
2352 case SYMBOL_TLSLE48
:
2353 emit_insn ((mode
== DImode
? gen_tlsle48_di
: gen_tlsle48_si
)
2355 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
2363 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2367 case SYMBOL_TINY_GOT
:
2368 emit_insn (gen_ldr_got_tiny (dest
, imm
));
2371 case SYMBOL_TINY_TLSIE
:
2373 machine_mode mode
= GET_MODE (dest
);
2374 rtx tp
= aarch64_load_tp (NULL
);
2376 if (mode
== ptr_mode
)
2379 emit_insn (gen_tlsie_tiny_di (dest
, imm
, tp
));
2382 tp
= gen_lowpart (mode
, tp
);
2383 emit_insn (gen_tlsie_tiny_si (dest
, imm
, tp
));
2388 gcc_assert (mode
== Pmode
);
2389 emit_insn (gen_tlsie_tiny_sidi (dest
, imm
, tp
));
2393 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
2402 /* Emit a move from SRC to DEST. Assume that the move expanders can
2403 handle all moves if !can_create_pseudo_p (). The distinction is
2404 important because, unlike emit_move_insn, the move expanders know
2405 how to force Pmode objects into the constant pool even when the
2406 constant pool address is not itself legitimate. */
2408 aarch64_emit_move (rtx dest
, rtx src
)
2410 return (can_create_pseudo_p ()
2411 ? emit_move_insn (dest
, src
)
2412 : emit_move_insn_1 (dest
, src
));
2415 /* Apply UNOPTAB to OP and store the result in DEST. */
2418 aarch64_emit_unop (rtx dest
, optab unoptab
, rtx op
)
2420 rtx tmp
= expand_unop (GET_MODE (dest
), unoptab
, op
, dest
, 0);
2422 emit_move_insn (dest
, tmp
);
2425 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2428 aarch64_emit_binop (rtx dest
, optab binoptab
, rtx op0
, rtx op1
)
2430 rtx tmp
= expand_binop (GET_MODE (dest
), binoptab
, op0
, op1
, dest
, 0,
2433 emit_move_insn (dest
, tmp
);
2436 /* Split a 128-bit move operation into two 64-bit move operations,
2437 taking care to handle partial overlap of register to register
2438 copies. Special cases are needed when moving between GP regs and
2439 FP regs. SRC can be a register, constant or memory; DST a register
2440 or memory. If either operand is memory it must not have any side
2443 aarch64_split_128bit_move (rtx dst
, rtx src
)
2448 machine_mode mode
= GET_MODE (dst
);
2450 gcc_assert (mode
== TImode
|| mode
== TFmode
);
2451 gcc_assert (!(side_effects_p (src
) || side_effects_p (dst
)));
2452 gcc_assert (mode
== GET_MODE (src
) || GET_MODE (src
) == VOIDmode
);
2454 if (REG_P (dst
) && REG_P (src
))
2456 int src_regno
= REGNO (src
);
2457 int dst_regno
= REGNO (dst
);
2459 /* Handle FP <-> GP regs. */
2460 if (FP_REGNUM_P (dst_regno
) && GP_REGNUM_P (src_regno
))
2462 src_lo
= gen_lowpart (word_mode
, src
);
2463 src_hi
= gen_highpart (word_mode
, src
);
2465 emit_insn (gen_aarch64_movlow_di (mode
, dst
, src_lo
));
2466 emit_insn (gen_aarch64_movhigh_di (mode
, dst
, src_hi
));
2469 else if (GP_REGNUM_P (dst_regno
) && FP_REGNUM_P (src_regno
))
2471 dst_lo
= gen_lowpart (word_mode
, dst
);
2472 dst_hi
= gen_highpart (word_mode
, dst
);
2474 emit_insn (gen_aarch64_movdi_low (mode
, dst_lo
, src
));
2475 emit_insn (gen_aarch64_movdi_high (mode
, dst_hi
, src
));
2480 dst_lo
= gen_lowpart (word_mode
, dst
);
2481 dst_hi
= gen_highpart (word_mode
, dst
);
2482 src_lo
= gen_lowpart (word_mode
, src
);
2483 src_hi
= gen_highpart_mode (word_mode
, mode
, src
);
2485 /* At most one pairing may overlap. */
2486 if (reg_overlap_mentioned_p (dst_lo
, src_hi
))
2488 aarch64_emit_move (dst_hi
, src_hi
);
2489 aarch64_emit_move (dst_lo
, src_lo
);
2493 aarch64_emit_move (dst_lo
, src_lo
);
2494 aarch64_emit_move (dst_hi
, src_hi
);
2499 aarch64_split_128bit_move_p (rtx dst
, rtx src
)
2501 return (! REG_P (src
)
2502 || ! (FP_REGNUM_P (REGNO (dst
)) && FP_REGNUM_P (REGNO (src
))));
2505 /* Split a complex SIMD combine. */
2508 aarch64_split_simd_combine (rtx dst
, rtx src1
, rtx src2
)
2510 machine_mode src_mode
= GET_MODE (src1
);
2511 machine_mode dst_mode
= GET_MODE (dst
);
2513 gcc_assert (VECTOR_MODE_P (dst_mode
));
2514 gcc_assert (register_operand (dst
, dst_mode
)
2515 && register_operand (src1
, src_mode
)
2516 && register_operand (src2
, src_mode
));
2518 emit_insn (gen_aarch64_simd_combine (src_mode
, dst
, src1
, src2
));
2522 /* Split a complex SIMD move. */
2525 aarch64_split_simd_move (rtx dst
, rtx src
)
2527 machine_mode src_mode
= GET_MODE (src
);
2528 machine_mode dst_mode
= GET_MODE (dst
);
2530 gcc_assert (VECTOR_MODE_P (dst_mode
));
2532 if (REG_P (dst
) && REG_P (src
))
2534 gcc_assert (VECTOR_MODE_P (src_mode
));
2535 emit_insn (gen_aarch64_split_simd_mov (src_mode
, dst
, src
));
2540 aarch64_zero_extend_const_eq (machine_mode xmode
, rtx x
,
2541 machine_mode ymode
, rtx y
)
2543 rtx r
= simplify_const_unary_operation (ZERO_EXTEND
, xmode
, y
, ymode
);
2544 gcc_assert (r
!= NULL
);
2545 return rtx_equal_p (x
, r
);
2549 /* Return TARGET if it is nonnull and a register of mode MODE.
2550 Otherwise, return a fresh register of mode MODE if we can,
2551 or TARGET reinterpreted as MODE if we can't. */
2554 aarch64_target_reg (rtx target
, machine_mode mode
)
2556 if (target
&& REG_P (target
) && GET_MODE (target
) == mode
)
2558 if (!can_create_pseudo_p ())
2560 gcc_assert (target
);
2561 return gen_lowpart (mode
, target
);
2563 return gen_reg_rtx (mode
);
2566 /* Return a register that contains the constant in BUILDER, given that
2567 the constant is a legitimate move operand. Use TARGET as the register
2568 if it is nonnull and convenient. */
2571 aarch64_emit_set_immediate (rtx target
, rtx_vector_builder
&builder
)
2573 rtx src
= builder
.build ();
2574 target
= aarch64_target_reg (target
, GET_MODE (src
));
2575 emit_insn (gen_rtx_SET (target
, src
));
2580 aarch64_force_temporary (machine_mode mode
, rtx x
, rtx value
)
2582 if (can_create_pseudo_p ())
2583 return force_reg (mode
, value
);
2587 aarch64_emit_move (x
, value
);
2592 /* Return true if predicate value X is a constant in which every element
2593 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2594 value, i.e. as a predicate in which all bits are significant. */
2597 aarch64_get_sve_pred_bits (rtx_vector_builder
&builder
, rtx x
)
2599 if (GET_CODE (x
) != CONST_VECTOR
)
2602 unsigned int factor
= vector_element_size (GET_MODE_NUNITS (VNx16BImode
),
2603 GET_MODE_NUNITS (GET_MODE (x
)));
2604 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (x
) * factor
;
2605 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (x
);
2606 builder
.new_vector (VNx16BImode
, npatterns
, nelts_per_pattern
);
2608 unsigned int nelts
= const_vector_encoded_nelts (x
);
2609 for (unsigned int i
= 0; i
< nelts
; ++i
)
2611 rtx elt
= CONST_VECTOR_ENCODED_ELT (x
, i
);
2612 if (!CONST_INT_P (elt
))
2615 builder
.quick_push (elt
);
2616 for (unsigned int j
= 1; j
< factor
; ++j
)
2617 builder
.quick_push (const0_rtx
);
2619 builder
.finalize ();
2623 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2624 widest predicate element size it can have (that is, the largest size
2625 for which each element would still be 0 or 1). */
2628 aarch64_widest_sve_pred_elt_size (rtx_vector_builder
&builder
)
2630 /* Start with the most optimistic assumption: that we only need
2631 one bit per pattern. This is what we will use if only the first
2632 bit in each pattern is ever set. */
2633 unsigned int mask
= GET_MODE_SIZE (DImode
);
2634 mask
|= builder
.npatterns ();
2636 /* Look for set bits. */
2637 unsigned int nelts
= builder
.encoded_nelts ();
2638 for (unsigned int i
= 1; i
< nelts
; ++i
)
2639 if (INTVAL (builder
.elt (i
)) != 0)
2645 return mask
& -mask
;
2648 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
2649 that the constant would have with predicate element size ELT_SIZE
2650 (ignoring the upper bits in each element) and return:
2652 * -1 if all bits are set
2653 * N if the predicate has N leading set bits followed by all clear bits
2654 * 0 if the predicate does not have any of these forms. */
2657 aarch64_partial_ptrue_length (rtx_vector_builder
&builder
,
2658 unsigned int elt_size
)
2660 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
2661 followed by set bits. */
2662 if (builder
.nelts_per_pattern () == 3)
2665 /* Skip over leading set bits. */
2666 unsigned int nelts
= builder
.encoded_nelts ();
2668 for (; i
< nelts
; i
+= elt_size
)
2669 if (INTVAL (builder
.elt (i
)) == 0)
2671 unsigned int vl
= i
/ elt_size
;
2673 /* Check for the all-true case. */
2677 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
2678 repeating pattern of set bits followed by clear bits. */
2679 if (builder
.nelts_per_pattern () != 2)
2682 /* We have a "foreground" value and a duplicated "background" value.
2683 If the background might repeat and the last set bit belongs to it,
2684 we might have set bits followed by clear bits followed by set bits. */
2685 if (i
> builder
.npatterns () && maybe_ne (nelts
, builder
.full_nelts ()))
2688 /* Make sure that the rest are all clear. */
2689 for (; i
< nelts
; i
+= elt_size
)
2690 if (INTVAL (builder
.elt (i
)) != 0)
2696 /* See if there is an svpattern that encodes an SVE predicate of mode
2697 PRED_MODE in which the first VL bits are set and the rest are clear.
2698 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
2699 A VL of -1 indicates an all-true vector. */
2702 aarch64_svpattern_for_vl (machine_mode pred_mode
, int vl
)
2705 return AARCH64_SV_ALL
;
2707 if (maybe_gt (vl
, GET_MODE_NUNITS (pred_mode
)))
2708 return AARCH64_NUM_SVPATTERNS
;
2710 if (vl
>= 1 && vl
<= 8)
2711 return aarch64_svpattern (AARCH64_SV_VL1
+ (vl
- 1));
2713 if (vl
>= 16 && vl
<= 256 && pow2p_hwi (vl
))
2714 return aarch64_svpattern (AARCH64_SV_VL16
+ (exact_log2 (vl
) - 4));
2717 if (GET_MODE_NUNITS (pred_mode
).is_constant (&max_vl
))
2719 if (vl
== (max_vl
/ 3) * 3)
2720 return AARCH64_SV_MUL3
;
2721 /* These would only trigger for non-power-of-2 lengths. */
2722 if (vl
== (max_vl
& -4))
2723 return AARCH64_SV_MUL4
;
2724 if (vl
== (1 << floor_log2 (max_vl
)))
2725 return AARCH64_SV_POW2
;
2727 return AARCH64_SV_ALL
;
2729 return AARCH64_NUM_SVPATTERNS
;
2732 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
2733 bits has the lowest bit set and the upper bits clear. This is the
2734 VNx16BImode equivalent of a PTRUE for controlling elements of
2735 ELT_SIZE bytes. However, because the constant is VNx16BImode,
2736 all bits are significant, even the upper zeros. */
2739 aarch64_ptrue_all (unsigned int elt_size
)
2741 rtx_vector_builder
builder (VNx16BImode
, elt_size
, 1);
2742 builder
.quick_push (const1_rtx
);
2743 for (unsigned int i
= 1; i
< elt_size
; ++i
)
2744 builder
.quick_push (const0_rtx
);
2745 return builder
.build ();
2748 /* Return an all-true predicate register of mode MODE. */
2751 aarch64_ptrue_reg (machine_mode mode
)
2753 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
);
2754 rtx reg
= force_reg (VNx16BImode
, CONSTM1_RTX (VNx16BImode
));
2755 return gen_lowpart (mode
, reg
);
2758 /* Return an all-false predicate register of mode MODE. */
2761 aarch64_pfalse_reg (machine_mode mode
)
2763 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
);
2764 rtx reg
= force_reg (VNx16BImode
, CONST0_RTX (VNx16BImode
));
2765 return gen_lowpart (mode
, reg
);
2768 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
2769 true, or alternatively if we know that the operation predicated by
2770 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is a
2771 aarch64_sve_gp_strictness operand that describes the operation
2772 predicated by PRED1[0]. */
2775 aarch64_sve_pred_dominates_p (rtx
*pred1
, rtx pred2
)
2777 machine_mode mode
= GET_MODE (pred2
);
2778 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
2779 && mode
== GET_MODE (pred1
[0])
2780 && aarch64_sve_gp_strictness (pred1
[1], SImode
));
2781 return (pred1
[0] == CONSTM1_RTX (mode
)
2782 || INTVAL (pred1
[1]) == SVE_RELAXED_GP
2783 || rtx_equal_p (pred1
[0], pred2
));
2786 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
2787 for it. PRED2[0] is the predicate for the instruction whose result
2788 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
2789 for it. Return true if we can prove that the two predicates are
2790 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
2791 with PRED1[0] without changing behavior. */
2794 aarch64_sve_same_pred_for_ptest_p (rtx
*pred1
, rtx
*pred2
)
2796 machine_mode mode
= GET_MODE (pred1
[0]);
2797 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
2798 && mode
== GET_MODE (pred2
[0])
2799 && aarch64_sve_ptrue_flag (pred1
[1], SImode
)
2800 && aarch64_sve_ptrue_flag (pred2
[1], SImode
));
2802 bool ptrue1_p
= (pred1
[0] == CONSTM1_RTX (mode
)
2803 || INTVAL (pred1
[1]) == SVE_KNOWN_PTRUE
);
2804 bool ptrue2_p
= (pred2
[0] == CONSTM1_RTX (mode
)
2805 || INTVAL (pred2
[1]) == SVE_KNOWN_PTRUE
);
2806 return (ptrue1_p
&& ptrue2_p
) || rtx_equal_p (pred1
[0], pred2
[0]);
2809 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
2810 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
2811 Use TARGET as the target register if nonnull and convenient. */
2814 aarch64_sve_emit_int_cmp (rtx target
, machine_mode pred_mode
, rtx_code cmp
,
2815 machine_mode data_mode
, rtx op1
, rtx op2
)
2817 insn_code icode
= code_for_aarch64_pred_cmp (cmp
, data_mode
);
2818 expand_operand ops
[5];
2819 create_output_operand (&ops
[0], target
, pred_mode
);
2820 create_input_operand (&ops
[1], CONSTM1_RTX (pred_mode
), pred_mode
);
2821 create_integer_operand (&ops
[2], SVE_KNOWN_PTRUE
);
2822 create_input_operand (&ops
[3], op1
, data_mode
);
2823 create_input_operand (&ops
[4], op2
, data_mode
);
2824 expand_insn (icode
, 5, ops
);
2825 return ops
[0].value
;
2828 /* Use a comparison to convert integer vector SRC into MODE, which is
2829 the corresponding SVE predicate mode. Use TARGET for the result
2830 if it's nonnull and convenient. */
2833 aarch64_convert_sve_data_to_pred (rtx target
, machine_mode mode
, rtx src
)
2835 machine_mode src_mode
= GET_MODE (src
);
2836 return aarch64_sve_emit_int_cmp (target
, mode
, NE
, src_mode
,
2837 src
, CONST0_RTX (src_mode
));
2840 /* Return true if we can move VALUE into a register using a single
2841 CNT[BHWD] instruction. */
2844 aarch64_sve_cnt_immediate_p (poly_int64 value
)
2846 HOST_WIDE_INT factor
= value
.coeffs
[0];
2847 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2848 return (value
.coeffs
[1] == factor
2849 && IN_RANGE (factor
, 2, 16 * 16)
2850 && (factor
& 1) == 0
2851 && factor
<= 16 * (factor
& -factor
));
2854 /* Likewise for rtx X. */
2857 aarch64_sve_cnt_immediate_p (rtx x
)
2860 return poly_int_rtx_p (x
, &value
) && aarch64_sve_cnt_immediate_p (value
);
2863 /* Return the asm string for an instruction with a CNT-like vector size
2864 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2865 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2866 first part of the operands template (the part that comes before the
2867 vector size itself). FACTOR is the number of quadwords.
2868 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2869 If it is zero, we can use any element size. */
2872 aarch64_output_sve_cnt_immediate (const char *prefix
, const char *operands
,
2873 unsigned int factor
,
2874 unsigned int nelts_per_vq
)
2876 static char buffer
[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2878 if (nelts_per_vq
== 0)
2879 /* There is some overlap in the ranges of the four CNT instructions.
2880 Here we always use the smallest possible element size, so that the
2881 multiplier is 1 whereever possible. */
2882 nelts_per_vq
= factor
& -factor
;
2883 int shift
= std::min (exact_log2 (nelts_per_vq
), 4);
2884 gcc_assert (IN_RANGE (shift
, 1, 4));
2885 char suffix
= "dwhb"[shift
- 1];
2888 unsigned int written
;
2890 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s",
2891 prefix
, suffix
, operands
);
2893 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s, all, mul #%d",
2894 prefix
, suffix
, operands
, factor
);
2895 gcc_assert (written
< sizeof (buffer
));
2899 /* Return the asm string for an instruction with a CNT-like vector size
2900 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2901 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2902 first part of the operands template (the part that comes before the
2903 vector size itself). X is the value of the vector size operand,
2904 as a polynomial integer rtx. */
2907 aarch64_output_sve_cnt_immediate (const char *prefix
, const char *operands
,
2910 poly_int64 value
= rtx_to_poly_int64 (x
);
2911 gcc_assert (aarch64_sve_cnt_immediate_p (value
));
2912 return aarch64_output_sve_cnt_immediate (prefix
, operands
,
2913 value
.coeffs
[1], 0);
2916 /* Return true if we can add VALUE to a register using a single ADDVL
2917 or ADDPL instruction. */
2920 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value
)
2922 HOST_WIDE_INT factor
= value
.coeffs
[0];
2923 if (factor
== 0 || value
.coeffs
[1] != factor
)
2925 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2926 and a value of 16 is one vector width. */
2927 return (((factor
& 15) == 0 && IN_RANGE (factor
, -32 * 16, 31 * 16))
2928 || ((factor
& 1) == 0 && IN_RANGE (factor
, -32 * 2, 31 * 2)));
2931 /* Likewise for rtx X. */
2934 aarch64_sve_addvl_addpl_immediate_p (rtx x
)
2937 return (poly_int_rtx_p (x
, &value
)
2938 && aarch64_sve_addvl_addpl_immediate_p (value
));
2941 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2942 and storing the result in operand 0. */
2945 aarch64_output_sve_addvl_addpl (rtx dest
, rtx base
, rtx offset
)
2947 static char buffer
[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2948 poly_int64 offset_value
= rtx_to_poly_int64 (offset
);
2949 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value
));
2951 /* Use INC or DEC if possible. */
2952 if (rtx_equal_p (dest
, base
) && GP_REGNUM_P (REGNO (dest
)))
2954 if (aarch64_sve_cnt_immediate_p (offset_value
))
2955 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2956 offset_value
.coeffs
[1], 0);
2957 if (aarch64_sve_cnt_immediate_p (-offset_value
))
2958 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2959 -offset_value
.coeffs
[1], 0);
2962 int factor
= offset_value
.coeffs
[1];
2963 if ((factor
& 15) == 0)
2964 snprintf (buffer
, sizeof (buffer
), "addvl\t%%x0, %%x1, #%d", factor
/ 16);
2966 snprintf (buffer
, sizeof (buffer
), "addpl\t%%x0, %%x1, #%d", factor
/ 2);
2970 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2971 instruction. If it is, store the number of elements in each vector
2972 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2973 factor in *FACTOR_OUT (if nonnull). */
2976 aarch64_sve_inc_dec_immediate_p (rtx x
, int *factor_out
,
2977 unsigned int *nelts_per_vq_out
)
2982 if (!const_vec_duplicate_p (x
, &elt
)
2983 || !poly_int_rtx_p (elt
, &value
))
2986 unsigned int nelts_per_vq
= 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x
));
2987 if (nelts_per_vq
!= 8 && nelts_per_vq
!= 4 && nelts_per_vq
!= 2)
2988 /* There's no vector INCB. */
2991 HOST_WIDE_INT factor
= value
.coeffs
[0];
2992 if (value
.coeffs
[1] != factor
)
2995 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2996 if ((factor
% nelts_per_vq
) != 0
2997 || !IN_RANGE (abs (factor
), nelts_per_vq
, 16 * nelts_per_vq
))
3001 *factor_out
= factor
;
3002 if (nelts_per_vq_out
)
3003 *nelts_per_vq_out
= nelts_per_vq
;
3007 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3011 aarch64_sve_inc_dec_immediate_p (rtx x
)
3013 return aarch64_sve_inc_dec_immediate_p (x
, NULL
, NULL
);
3016 /* Return the asm template for an SVE vector INC or DEC instruction.
3017 OPERANDS gives the operands before the vector count and X is the
3018 value of the vector count operand itself. */
3021 aarch64_output_sve_inc_dec_immediate (const char *operands
, rtx x
)
3024 unsigned int nelts_per_vq
;
3025 if (!aarch64_sve_inc_dec_immediate_p (x
, &factor
, &nelts_per_vq
))
3028 return aarch64_output_sve_cnt_immediate ("dec", operands
, -factor
,
3031 return aarch64_output_sve_cnt_immediate ("inc", operands
, factor
,
3036 aarch64_internal_mov_immediate (rtx dest
, rtx imm
, bool generate
,
3037 scalar_int_mode mode
)
3040 unsigned HOST_WIDE_INT val
, val2
, mask
;
3041 int one_match
, zero_match
;
3046 if (aarch64_move_imm (val
, mode
))
3049 emit_insn (gen_rtx_SET (dest
, imm
));
3053 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3054 (with XXXX non-zero). In that case check to see if the move can be done in
3056 val2
= val
& 0xffffffff;
3058 && aarch64_move_imm (val2
, SImode
)
3059 && (((val
>> 32) & 0xffff) == 0 || (val
>> 48) == 0))
3062 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
3064 /* Check if we have to emit a second instruction by checking to see
3065 if any of the upper 32 bits of the original DI mode value is set. */
3069 i
= (val
>> 48) ? 48 : 32;
3072 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
3073 GEN_INT ((val
>> i
) & 0xffff)));
3078 if ((val
>> 32) == 0 || mode
== SImode
)
3082 emit_insn (gen_rtx_SET (dest
, GEN_INT (val
& 0xffff)));
3084 emit_insn (gen_insv_immsi (dest
, GEN_INT (16),
3085 GEN_INT ((val
>> 16) & 0xffff)));
3087 emit_insn (gen_insv_immdi (dest
, GEN_INT (16),
3088 GEN_INT ((val
>> 16) & 0xffff)));
3093 /* Remaining cases are all for DImode. */
3096 zero_match
= ((val
& mask
) == 0) + ((val
& (mask
<< 16)) == 0) +
3097 ((val
& (mask
<< 32)) == 0) + ((val
& (mask
<< 48)) == 0);
3098 one_match
= ((~val
& mask
) == 0) + ((~val
& (mask
<< 16)) == 0) +
3099 ((~val
& (mask
<< 32)) == 0) + ((~val
& (mask
<< 48)) == 0);
3101 if (zero_match
!= 2 && one_match
!= 2)
3103 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3104 For a 64-bit bitmask try whether changing 16 bits to all ones or
3105 zeroes creates a valid bitmask. To check any repeated bitmask,
3106 try using 16 bits from the other 32-bit half of val. */
3108 for (i
= 0; i
< 64; i
+= 16, mask
<<= 16)
3111 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
3114 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
3116 val2
= val2
& ~mask
;
3117 val2
= val2
| (((val2
>> 32) | (val2
<< 32)) & mask
);
3118 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
3125 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
3126 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
3127 GEN_INT ((val
>> i
) & 0xffff)));
3133 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3134 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3135 otherwise skip zero bits. */
3139 val2
= one_match
> zero_match
? ~val
: val
;
3140 i
= (val2
& mask
) != 0 ? 0 : (val2
& (mask
<< 16)) != 0 ? 16 : 32;
3143 emit_insn (gen_rtx_SET (dest
, GEN_INT (one_match
> zero_match
3144 ? (val
| ~(mask
<< i
))
3145 : (val
& (mask
<< i
)))));
3146 for (i
+= 16; i
< 64; i
+= 16)
3148 if ((val2
& (mask
<< i
)) == 0)
3151 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
3152 GEN_INT ((val
>> i
) & 0xffff)));
3159 /* Return whether imm is a 128-bit immediate which is simple enough to
3162 aarch64_mov128_immediate (rtx imm
)
3164 if (GET_CODE (imm
) == CONST_INT
)
3167 gcc_assert (CONST_WIDE_INT_NUNITS (imm
) == 2);
3169 rtx lo
= GEN_INT (CONST_WIDE_INT_ELT (imm
, 0));
3170 rtx hi
= GEN_INT (CONST_WIDE_INT_ELT (imm
, 1));
3172 return aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, DImode
)
3173 + aarch64_internal_mov_immediate (NULL_RTX
, hi
, false, DImode
) <= 4;
3177 /* Return the number of temporary registers that aarch64_add_offset_1
3178 would need to add OFFSET to a register. */
3181 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset
)
3183 return abs_hwi (offset
) < 0x1000000 ? 0 : 1;
3186 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3187 a non-polynomial OFFSET. MODE is the mode of the addition.
3188 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3189 be set and CFA adjustments added to the generated instructions.
3191 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3192 temporary if register allocation is already complete. This temporary
3193 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3194 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3195 the immediate again.
3197 Since this function may be used to adjust the stack pointer, we must
3198 ensure that it cannot cause transient stack deallocation (for example
3199 by first incrementing SP and then decrementing when adjusting by a
3200 large immediate). */
3203 aarch64_add_offset_1 (scalar_int_mode mode
, rtx dest
,
3204 rtx src
, HOST_WIDE_INT offset
, rtx temp1
,
3205 bool frame_related_p
, bool emit_move_imm
)
3207 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
3208 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
3210 HOST_WIDE_INT moffset
= abs_hwi (offset
);
3215 if (!rtx_equal_p (dest
, src
))
3217 insn
= emit_insn (gen_rtx_SET (dest
, src
));
3218 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3223 /* Single instruction adjustment. */
3224 if (aarch64_uimm12_shift (moffset
))
3226 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (offset
)));
3227 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3231 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3234 a) the offset cannot be loaded by a 16-bit move or
3235 b) there is no spare register into which we can move it. */
3236 if (moffset
< 0x1000000
3237 && ((!temp1
&& !can_create_pseudo_p ())
3238 || !aarch64_move_imm (moffset
, mode
)))
3240 HOST_WIDE_INT low_off
= moffset
& 0xfff;
3242 low_off
= offset
< 0 ? -low_off
: low_off
;
3243 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (low_off
)));
3244 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3245 insn
= emit_insn (gen_add2_insn (dest
, GEN_INT (offset
- low_off
)));
3246 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3250 /* Emit a move immediate if required and an addition/subtraction. */
3253 gcc_assert (temp1
!= NULL_RTX
|| can_create_pseudo_p ());
3254 temp1
= aarch64_force_temporary (mode
, temp1
, GEN_INT (moffset
));
3256 insn
= emit_insn (offset
< 0
3257 ? gen_sub3_insn (dest
, src
, temp1
)
3258 : gen_add3_insn (dest
, src
, temp1
));
3259 if (frame_related_p
)
3261 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3262 rtx adj
= plus_constant (mode
, src
, offset
);
3263 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, gen_rtx_SET (dest
, adj
));
3267 /* Return the number of temporary registers that aarch64_add_offset
3268 would need to move OFFSET into a register or add OFFSET to a register;
3269 ADD_P is true if we want the latter rather than the former. */
3272 aarch64_offset_temporaries (bool add_p
, poly_int64 offset
)
3274 /* This follows the same structure as aarch64_add_offset. */
3275 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
3278 unsigned int count
= 0;
3279 HOST_WIDE_INT factor
= offset
.coeffs
[1];
3280 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
3281 poly_int64
poly_offset (factor
, factor
);
3282 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
3283 /* Need one register for the ADDVL/ADDPL result. */
3285 else if (factor
!= 0)
3287 factor
= abs (factor
);
3288 if (factor
> 16 * (factor
& -factor
))
3289 /* Need one register for the CNT result and one for the multiplication
3290 factor. If necessary, the second temporary can be reused for the
3291 constant part of the offset. */
3293 /* Need one register for the CNT result (which might then
3297 return count
+ aarch64_add_offset_1_temporaries (constant
);
3300 /* If X can be represented as a poly_int64, return the number
3301 of temporaries that are required to add it to a register.
3302 Return -1 otherwise. */
3305 aarch64_add_offset_temporaries (rtx x
)
3308 if (!poly_int_rtx_p (x
, &offset
))
3310 return aarch64_offset_temporaries (true, offset
);
3313 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3314 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3315 be set and CFA adjustments added to the generated instructions.
3317 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3318 temporary if register allocation is already complete. This temporary
3319 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3320 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3321 false to avoid emitting the immediate again.
3323 TEMP2, if nonnull, is a second temporary register that doesn't
3324 overlap either DEST or REG.
3326 Since this function may be used to adjust the stack pointer, we must
3327 ensure that it cannot cause transient stack deallocation (for example
3328 by first incrementing SP and then decrementing when adjusting by a
3329 large immediate). */
3332 aarch64_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
3333 poly_int64 offset
, rtx temp1
, rtx temp2
,
3334 bool frame_related_p
, bool emit_move_imm
= true)
3336 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
3337 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
3338 gcc_assert (temp1
== NULL_RTX
3340 || !reg_overlap_mentioned_p (temp1
, dest
));
3341 gcc_assert (temp2
== NULL_RTX
|| !reg_overlap_mentioned_p (dest
, temp2
));
3343 /* Try using ADDVL or ADDPL to add the whole value. */
3344 if (src
!= const0_rtx
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
3346 rtx offset_rtx
= gen_int_mode (offset
, mode
);
3347 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
3348 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
3352 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3353 SVE vector register, over and above the minimum size of 128 bits.
3354 This is equivalent to half the value returned by CNTD with a
3355 vector shape of ALL. */
3356 HOST_WIDE_INT factor
= offset
.coeffs
[1];
3357 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
3359 /* Try using ADDVL or ADDPL to add the VG-based part. */
3360 poly_int64
poly_offset (factor
, factor
);
3361 if (src
!= const0_rtx
3362 && aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
3364 rtx offset_rtx
= gen_int_mode (poly_offset
, mode
);
3365 if (frame_related_p
)
3367 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
3368 RTX_FRAME_RELATED_P (insn
) = true;
3373 rtx addr
= gen_rtx_PLUS (mode
, src
, offset_rtx
);
3374 src
= aarch64_force_temporary (mode
, temp1
, addr
);
3379 /* Otherwise use a CNT-based sequence. */
3380 else if (factor
!= 0)
3382 /* Use a subtraction if we have a negative factor. */
3383 rtx_code code
= PLUS
;
3390 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3391 into the multiplication. */
3395 /* Use a right shift by 1. */
3399 HOST_WIDE_INT low_bit
= factor
& -factor
;
3400 if (factor
<= 16 * low_bit
)
3402 if (factor
> 16 * 8)
3404 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3405 the value with the minimum multiplier and shift it into
3407 int extra_shift
= exact_log2 (low_bit
);
3408 shift
+= extra_shift
;
3409 factor
>>= extra_shift
;
3411 val
= gen_int_mode (poly_int64 (factor
* 2, factor
* 2), mode
);
3415 /* Use CNTD, then multiply it by FACTOR. */
3416 val
= gen_int_mode (poly_int64 (2, 2), mode
);
3417 val
= aarch64_force_temporary (mode
, temp1
, val
);
3419 /* Go back to using a negative multiplication factor if we have
3420 no register from which to subtract. */
3421 if (code
== MINUS
&& src
== const0_rtx
)
3426 rtx coeff1
= gen_int_mode (factor
, mode
);
3427 coeff1
= aarch64_force_temporary (mode
, temp2
, coeff1
);
3428 val
= gen_rtx_MULT (mode
, val
, coeff1
);
3433 /* Multiply by 1 << SHIFT. */
3434 val
= aarch64_force_temporary (mode
, temp1
, val
);
3435 val
= gen_rtx_ASHIFT (mode
, val
, GEN_INT (shift
));
3437 else if (shift
== -1)
3440 val
= aarch64_force_temporary (mode
, temp1
, val
);
3441 val
= gen_rtx_ASHIFTRT (mode
, val
, const1_rtx
);
3444 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3445 if (src
!= const0_rtx
)
3447 val
= aarch64_force_temporary (mode
, temp1
, val
);
3448 val
= gen_rtx_fmt_ee (code
, mode
, src
, val
);
3450 else if (code
== MINUS
)
3452 val
= aarch64_force_temporary (mode
, temp1
, val
);
3453 val
= gen_rtx_NEG (mode
, val
);
3456 if (constant
== 0 || frame_related_p
)
3458 rtx_insn
*insn
= emit_insn (gen_rtx_SET (dest
, val
));
3459 if (frame_related_p
)
3461 RTX_FRAME_RELATED_P (insn
) = true;
3462 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
3463 gen_rtx_SET (dest
, plus_constant (Pmode
, src
,
3472 src
= aarch64_force_temporary (mode
, temp1
, val
);
3477 emit_move_imm
= true;
3480 aarch64_add_offset_1 (mode
, dest
, src
, constant
, temp1
,
3481 frame_related_p
, emit_move_imm
);
3484 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3485 than a poly_int64. */
3488 aarch64_split_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
3489 rtx offset_rtx
, rtx temp1
, rtx temp2
)
3491 aarch64_add_offset (mode
, dest
, src
, rtx_to_poly_int64 (offset_rtx
),
3492 temp1
, temp2
, false);
3495 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3496 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3497 if TEMP1 already contains abs (DELTA). */
3500 aarch64_add_sp (rtx temp1
, rtx temp2
, poly_int64 delta
, bool emit_move_imm
)
3502 aarch64_add_offset (Pmode
, stack_pointer_rtx
, stack_pointer_rtx
, delta
,
3503 temp1
, temp2
, true, emit_move_imm
);
3506 /* Subtract DELTA from the stack pointer, marking the instructions
3507 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3511 aarch64_sub_sp (rtx temp1
, rtx temp2
, poly_int64 delta
, bool frame_related_p
,
3512 bool emit_move_imm
= true)
3514 aarch64_add_offset (Pmode
, stack_pointer_rtx
, stack_pointer_rtx
, -delta
,
3515 temp1
, temp2
, frame_related_p
, emit_move_imm
);
3518 /* Set DEST to (vec_series BASE STEP). */
3521 aarch64_expand_vec_series (rtx dest
, rtx base
, rtx step
)
3523 machine_mode mode
= GET_MODE (dest
);
3524 scalar_mode inner
= GET_MODE_INNER (mode
);
3526 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3527 if (!aarch64_sve_index_immediate_p (base
))
3528 base
= force_reg (inner
, base
);
3529 if (!aarch64_sve_index_immediate_p (step
))
3530 step
= force_reg (inner
, step
);
3532 emit_set_insn (dest
, gen_rtx_VEC_SERIES (mode
, base
, step
));
3535 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
3536 register of mode MODE. Use TARGET for the result if it's nonnull
3539 The two vector modes must have the same element mode. The behavior
3540 is to duplicate architectural lane N of SRC into architectural lanes
3541 N + I * STEP of the result. On big-endian targets, architectural
3542 lane 0 of an Advanced SIMD vector is the last element of the vector
3543 in memory layout, so for big-endian targets this operation has the
3544 effect of reversing SRC before duplicating it. Callers need to
3545 account for this. */
3548 aarch64_expand_sve_dupq (rtx target
, machine_mode mode
, rtx src
)
3550 machine_mode src_mode
= GET_MODE (src
);
3551 gcc_assert (GET_MODE_INNER (mode
) == GET_MODE_INNER (src_mode
));
3552 insn_code icode
= (BYTES_BIG_ENDIAN
3553 ? code_for_aarch64_vec_duplicate_vq_be (mode
)
3554 : code_for_aarch64_vec_duplicate_vq_le (mode
));
3557 expand_operand ops
[3];
3558 create_output_operand (&ops
[i
++], target
, mode
);
3559 create_output_operand (&ops
[i
++], src
, src_mode
);
3560 if (BYTES_BIG_ENDIAN
)
3562 /* Create a PARALLEL describing the reversal of SRC. */
3563 unsigned int nelts_per_vq
= 128 / GET_MODE_UNIT_BITSIZE (mode
);
3564 rtx sel
= aarch64_gen_stepped_int_parallel (nelts_per_vq
,
3565 nelts_per_vq
- 1, -1);
3566 create_fixed_operand (&ops
[i
++], sel
);
3568 expand_insn (icode
, i
, ops
);
3569 return ops
[0].value
;
3572 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
3573 the memory image into DEST. Return true on success. */
3576 aarch64_expand_sve_ld1rq (rtx dest
, rtx src
)
3578 src
= force_const_mem (GET_MODE (src
), src
);
3582 /* Make sure that the address is legitimate. */
3583 if (!aarch64_sve_ld1rq_operand_p (src
))
3585 rtx addr
= force_reg (Pmode
, XEXP (src
, 0));
3586 src
= replace_equiv_address (src
, addr
);
3589 machine_mode mode
= GET_MODE (dest
);
3590 unsigned int elem_bytes
= GET_MODE_UNIT_SIZE (mode
);
3591 machine_mode pred_mode
= aarch64_sve_pred_mode (elem_bytes
).require ();
3592 rtx ptrue
= aarch64_ptrue_reg (pred_mode
);
3593 emit_insn (gen_aarch64_sve_ld1rq (mode
, dest
, src
, ptrue
));
3597 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
3598 SVE data mode and isn't a legitimate constant. Use TARGET for the
3599 result if convenient.
3601 The returned register can have whatever mode seems most natural
3602 given the contents of SRC. */
3605 aarch64_expand_sve_const_vector (rtx target
, rtx src
)
3607 machine_mode mode
= GET_MODE (src
);
3608 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (src
);
3609 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (src
);
3610 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
3611 unsigned int elt_bits
= GET_MODE_BITSIZE (elt_mode
);
3612 unsigned int encoded_bits
= npatterns
* nelts_per_pattern
* elt_bits
;
3614 if (nelts_per_pattern
== 1 && encoded_bits
== 128)
3616 /* The constant is a duplicated quadword but can't be narrowed
3617 beyond a quadword. Get the memory image of the first quadword
3618 as a 128-bit vector and try using LD1RQ to load it from memory.
3620 The effect for both endiannesses is to load memory lane N into
3621 architectural lanes N + I * STEP of the result. On big-endian
3622 targets, the layout of the 128-bit vector in an Advanced SIMD
3623 register would be different from its layout in an SVE register,
3624 but this 128-bit vector is a memory value only. */
3625 machine_mode vq_mode
= aarch64_vq_mode (elt_mode
).require ();
3626 rtx vq_value
= simplify_gen_subreg (vq_mode
, src
, mode
, 0);
3627 if (vq_value
&& aarch64_expand_sve_ld1rq (target
, vq_value
))
3631 if (nelts_per_pattern
== 1 && encoded_bits
< 128)
3633 /* The vector is a repeating sequence of 64 bits or fewer.
3634 See if we can load them using an Advanced SIMD move and then
3635 duplicate it to fill a vector. This is better than using a GPR
3636 move because it keeps everything in the same register file. */
3637 machine_mode vq_mode
= aarch64_vq_mode (elt_mode
).require ();
3638 rtx_vector_builder
builder (vq_mode
, npatterns
, 1);
3639 for (unsigned int i
= 0; i
< npatterns
; ++i
)
3641 /* We want memory lane N to go into architectural lane N,
3642 so reverse for big-endian targets. The DUP .Q pattern
3643 has a compensating reverse built-in. */
3644 unsigned int srci
= BYTES_BIG_ENDIAN
? npatterns
- i
- 1 : i
;
3645 builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, srci
));
3647 rtx vq_src
= builder
.build ();
3648 if (aarch64_simd_valid_immediate (vq_src
, NULL
))
3650 vq_src
= force_reg (vq_mode
, vq_src
);
3651 return aarch64_expand_sve_dupq (target
, mode
, vq_src
);
3654 /* Get an integer representation of the repeating part of Advanced
3655 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
3656 which for big-endian targets is lane-swapped wrt a normal
3657 Advanced SIMD vector. This means that for both endiannesses,
3658 memory lane N of SVE vector SRC corresponds to architectural
3659 lane N of a register holding VQ_SRC. This in turn means that
3660 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
3661 as a single 128-bit value) and thus that memory lane 0 of SRC is
3662 in the lsb of the integer. Duplicating the integer therefore
3663 ensures that memory lane N of SRC goes into architectural lane
3664 N + I * INDEX of the SVE register. */
3665 scalar_mode int_mode
= int_mode_for_size (encoded_bits
, 0).require ();
3666 rtx elt_value
= simplify_gen_subreg (int_mode
, vq_src
, vq_mode
, 0);
3669 /* Pretend that we had a vector of INT_MODE to start with. */
3670 elt_mode
= int_mode
;
3671 mode
= aarch64_full_sve_mode (int_mode
).require ();
3673 /* If the integer can be moved into a general register by a
3674 single instruction, do that and duplicate the result. */
3675 if (CONST_INT_P (elt_value
)
3676 && aarch64_move_imm (INTVAL (elt_value
), elt_mode
))
3678 elt_value
= force_reg (elt_mode
, elt_value
);
3679 return expand_vector_broadcast (mode
, elt_value
);
3682 else if (npatterns
== 1)
3683 /* We're duplicating a single value, but can't do better than
3684 force it to memory and load from there. This handles things
3685 like symbolic constants. */
3686 elt_value
= CONST_VECTOR_ENCODED_ELT (src
, 0);
3690 /* Load the element from memory if we can, otherwise move it into
3691 a register and use a DUP. */
3692 rtx op
= force_const_mem (elt_mode
, elt_value
);
3694 op
= force_reg (elt_mode
, elt_value
);
3695 return expand_vector_broadcast (mode
, op
);
3699 /* Try using INDEX. */
3701 if (const_vec_series_p (src
, &base
, &step
))
3703 aarch64_expand_vec_series (target
, base
, step
);
3707 /* From here on, it's better to force the whole constant to memory
3709 if (GET_MODE_NUNITS (mode
).is_constant ())
3712 /* Expand each pattern individually. */
3713 gcc_assert (npatterns
> 1);
3714 rtx_vector_builder builder
;
3715 auto_vec
<rtx
, 16> vectors (npatterns
);
3716 for (unsigned int i
= 0; i
< npatterns
; ++i
)
3718 builder
.new_vector (mode
, 1, nelts_per_pattern
);
3719 for (unsigned int j
= 0; j
< nelts_per_pattern
; ++j
)
3720 builder
.quick_push (CONST_VECTOR_ELT (src
, i
+ j
* npatterns
));
3721 vectors
.quick_push (force_reg (mode
, builder
.build ()));
3724 /* Use permutes to interleave the separate vectors. */
3725 while (npatterns
> 1)
3728 for (unsigned int i
= 0; i
< npatterns
; ++i
)
3730 rtx tmp
= (npatterns
== 1 ? target
: gen_reg_rtx (mode
));
3731 rtvec v
= gen_rtvec (2, vectors
[i
], vectors
[i
+ npatterns
]);
3732 emit_set_insn (tmp
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
3736 gcc_assert (vectors
[0] == target
);
3740 /* Use WHILE to set a predicate register of mode MODE in which the first
3741 VL bits are set and the rest are clear. Use TARGET for the register
3742 if it's nonnull and convenient. */
3745 aarch64_sve_move_pred_via_while (rtx target
, machine_mode mode
,
3748 rtx limit
= force_reg (DImode
, gen_int_mode (vl
, DImode
));
3749 target
= aarch64_target_reg (target
, mode
);
3750 emit_insn (gen_while_ult (DImode
, mode
, target
, const0_rtx
, limit
));
3755 aarch64_expand_sve_const_pred_1 (rtx
, rtx_vector_builder
&, bool);
3757 /* BUILDER is a constant predicate in which the index of every set bit
3758 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3759 by inverting every element at a multiple of ELT_SIZE and EORing the
3760 result with an ELT_SIZE PTRUE.
3762 Return a register that contains the constant on success, otherwise
3763 return null. Use TARGET as the register if it is nonnull and
3767 aarch64_expand_sve_const_pred_eor (rtx target
, rtx_vector_builder
&builder
,
3768 unsigned int elt_size
)
3770 /* Invert every element at a multiple of ELT_SIZE, keeping the
3772 rtx_vector_builder
inv_builder (VNx16BImode
, builder
.npatterns (),
3773 builder
.nelts_per_pattern ());
3774 for (unsigned int i
= 0; i
< builder
.encoded_nelts (); ++i
)
3775 if ((i
& (elt_size
- 1)) == 0 && INTVAL (builder
.elt (i
)) == 0)
3776 inv_builder
.quick_push (const1_rtx
);
3778 inv_builder
.quick_push (const0_rtx
);
3779 inv_builder
.finalize ();
3781 /* See if we can load the constant cheaply. */
3782 rtx inv
= aarch64_expand_sve_const_pred_1 (NULL_RTX
, inv_builder
, false);
3786 /* EOR the result with an ELT_SIZE PTRUE. */
3787 rtx mask
= aarch64_ptrue_all (elt_size
);
3788 mask
= force_reg (VNx16BImode
, mask
);
3789 target
= aarch64_target_reg (target
, VNx16BImode
);
3790 emit_insn (gen_aarch64_pred_z (XOR
, VNx16BImode
, target
, mask
, inv
, mask
));
3794 /* BUILDER is a constant predicate in which the index of every set bit
3795 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3796 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
3797 register on success, otherwise return null. Use TARGET as the register
3798 if nonnull and convenient. */
3801 aarch64_expand_sve_const_pred_trn (rtx target
, rtx_vector_builder
&builder
,
3802 unsigned int elt_size
,
3803 unsigned int permute_size
)
3805 /* We're going to split the constant into two new constants A and B,
3806 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
3807 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
3809 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
3810 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
3812 where _ indicates elements that will be discarded by the permute.
3814 First calculate the ELT_SIZEs for A and B. */
3815 unsigned int a_elt_size
= GET_MODE_SIZE (DImode
);
3816 unsigned int b_elt_size
= GET_MODE_SIZE (DImode
);
3817 for (unsigned int i
= 0; i
< builder
.encoded_nelts (); i
+= elt_size
)
3818 if (INTVAL (builder
.elt (i
)) != 0)
3820 if (i
& permute_size
)
3821 b_elt_size
|= i
- permute_size
;
3825 a_elt_size
&= -a_elt_size
;
3826 b_elt_size
&= -b_elt_size
;
3828 /* Now construct the vectors themselves. */
3829 rtx_vector_builder
a_builder (VNx16BImode
, builder
.npatterns (),
3830 builder
.nelts_per_pattern ());
3831 rtx_vector_builder
b_builder (VNx16BImode
, builder
.npatterns (),
3832 builder
.nelts_per_pattern ());
3833 unsigned int nelts
= builder
.encoded_nelts ();
3834 for (unsigned int i
= 0; i
< nelts
; ++i
)
3835 if (i
& (elt_size
- 1))
3837 a_builder
.quick_push (const0_rtx
);
3838 b_builder
.quick_push (const0_rtx
);
3840 else if ((i
& permute_size
) == 0)
3842 /* The A and B elements are significant. */
3843 a_builder
.quick_push (builder
.elt (i
));
3844 b_builder
.quick_push (builder
.elt (i
+ permute_size
));
3848 /* The A and B elements are going to be discarded, so pick whatever
3849 is likely to give a nice constant. We are targeting element
3850 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
3851 with the aim of each being a sequence of ones followed by
3852 a sequence of zeros. So:
3854 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
3855 duplicate the last X_ELT_SIZE element, to extend the
3856 current sequence of ones or zeros.
3858 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
3859 zero, so that the constant really does have X_ELT_SIZE and
3860 not a smaller size. */
3861 if (a_elt_size
> permute_size
)
3862 a_builder
.quick_push (const0_rtx
);
3864 a_builder
.quick_push (a_builder
.elt (i
- a_elt_size
));
3865 if (b_elt_size
> permute_size
)
3866 b_builder
.quick_push (const0_rtx
);
3868 b_builder
.quick_push (b_builder
.elt (i
- b_elt_size
));
3870 a_builder
.finalize ();
3871 b_builder
.finalize ();
3873 /* Try loading A into a register. */
3874 rtx_insn
*last
= get_last_insn ();
3875 rtx a
= aarch64_expand_sve_const_pred_1 (NULL_RTX
, a_builder
, false);
3879 /* Try loading B into a register. */
3881 if (a_builder
!= b_builder
)
3883 b
= aarch64_expand_sve_const_pred_1 (NULL_RTX
, b_builder
, false);
3886 delete_insns_since (last
);
3891 /* Emit the TRN1 itself. */
3892 machine_mode mode
= aarch64_sve_pred_mode (permute_size
).require ();
3893 target
= aarch64_target_reg (target
, mode
);
3894 emit_insn (gen_aarch64_sve (UNSPEC_TRN1
, mode
, target
,
3895 gen_lowpart (mode
, a
),
3896 gen_lowpart (mode
, b
)));
3900 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
3901 constant in BUILDER into an SVE predicate register. Return the register
3902 on success, otherwise return null. Use TARGET for the register if
3903 nonnull and convenient.
3905 ALLOW_RECURSE_P is true if we can use methods that would call this
3906 function recursively. */
3909 aarch64_expand_sve_const_pred_1 (rtx target
, rtx_vector_builder
&builder
,
3910 bool allow_recurse_p
)
3912 if (builder
.encoded_nelts () == 1)
3913 /* A PFALSE or a PTRUE .B ALL. */
3914 return aarch64_emit_set_immediate (target
, builder
);
3916 unsigned int elt_size
= aarch64_widest_sve_pred_elt_size (builder
);
3917 if (int vl
= aarch64_partial_ptrue_length (builder
, elt_size
))
3919 /* If we can load the constant using PTRUE, use it as-is. */
3920 machine_mode mode
= aarch64_sve_pred_mode (elt_size
).require ();
3921 if (aarch64_svpattern_for_vl (mode
, vl
) != AARCH64_NUM_SVPATTERNS
)
3922 return aarch64_emit_set_immediate (target
, builder
);
3924 /* Otherwise use WHILE to set the first VL bits. */
3925 return aarch64_sve_move_pred_via_while (target
, mode
, vl
);
3928 if (!allow_recurse_p
)
3931 /* Try inverting the vector in element size ELT_SIZE and then EORing
3932 the result with an ELT_SIZE PTRUE. */
3933 if (INTVAL (builder
.elt (0)) == 0)
3934 if (rtx res
= aarch64_expand_sve_const_pred_eor (target
, builder
,
3938 /* Try using TRN1 to permute two simpler constants. */
3939 for (unsigned int i
= elt_size
; i
<= 8; i
*= 2)
3940 if (rtx res
= aarch64_expand_sve_const_pred_trn (target
, builder
,
3947 /* Return an SVE predicate register that contains the VNx16BImode
3948 constant in BUILDER, without going through the move expanders.
3950 The returned register can have whatever mode seems most natural
3951 given the contents of BUILDER. Use TARGET for the result if
3955 aarch64_expand_sve_const_pred (rtx target
, rtx_vector_builder
&builder
)
3957 /* Try loading the constant using pure predicate operations. */
3958 if (rtx res
= aarch64_expand_sve_const_pred_1 (target
, builder
, true))
3961 /* Try forcing the constant to memory. */
3962 if (builder
.full_nelts ().is_constant ())
3963 if (rtx mem
= force_const_mem (VNx16BImode
, builder
.build ()))
3965 target
= aarch64_target_reg (target
, VNx16BImode
);
3966 emit_move_insn (target
, mem
);
3970 /* The last resort is to load the constant as an integer and then
3971 compare it against zero. Use -1 for set bits in order to increase
3972 the changes of using SVE DUPM or an Advanced SIMD byte mask. */
3973 rtx_vector_builder
int_builder (VNx16QImode
, builder
.npatterns (),
3974 builder
.nelts_per_pattern ());
3975 for (unsigned int i
= 0; i
< builder
.encoded_nelts (); ++i
)
3976 int_builder
.quick_push (INTVAL (builder
.elt (i
))
3977 ? constm1_rtx
: const0_rtx
);
3978 return aarch64_convert_sve_data_to_pred (target
, VNx16BImode
,
3979 int_builder
.build ());
3982 /* Set DEST to immediate IMM. */
3985 aarch64_expand_mov_immediate (rtx dest
, rtx imm
)
3987 machine_mode mode
= GET_MODE (dest
);
3989 /* Check on what type of symbol it is. */
3990 scalar_int_mode int_mode
;
3991 if ((GET_CODE (imm
) == SYMBOL_REF
3992 || GET_CODE (imm
) == LABEL_REF
3993 || GET_CODE (imm
) == CONST
3994 || GET_CODE (imm
) == CONST_POLY_INT
)
3995 && is_a
<scalar_int_mode
> (mode
, &int_mode
))
3999 HOST_WIDE_INT const_offset
;
4000 enum aarch64_symbol_type sty
;
4002 /* If we have (const (plus symbol offset)), separate out the offset
4003 before we start classifying the symbol. */
4004 rtx base
= strip_offset (imm
, &offset
);
4006 /* We must always add an offset involving VL separately, rather than
4007 folding it into the relocation. */
4008 if (!offset
.is_constant (&const_offset
))
4010 if (base
== const0_rtx
&& aarch64_sve_cnt_immediate_p (offset
))
4011 emit_insn (gen_rtx_SET (dest
, imm
));
4014 /* Do arithmetic on 32-bit values if the result is smaller
4016 if (partial_subreg_p (int_mode
, SImode
))
4018 /* It is invalid to do symbol calculations in modes
4019 narrower than SImode. */
4020 gcc_assert (base
== const0_rtx
);
4021 dest
= gen_lowpart (SImode
, dest
);
4024 if (base
!= const0_rtx
)
4026 base
= aarch64_force_temporary (int_mode
, dest
, base
);
4027 aarch64_add_offset (int_mode
, dest
, base
, offset
,
4028 NULL_RTX
, NULL_RTX
, false);
4031 aarch64_add_offset (int_mode
, dest
, base
, offset
,
4032 dest
, NULL_RTX
, false);
4037 sty
= aarch64_classify_symbol (base
, const_offset
);
4040 case SYMBOL_FORCE_TO_MEM
:
4041 if (const_offset
!= 0
4042 && targetm
.cannot_force_const_mem (int_mode
, imm
))
4044 gcc_assert (can_create_pseudo_p ());
4045 base
= aarch64_force_temporary (int_mode
, dest
, base
);
4046 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
4047 NULL_RTX
, NULL_RTX
, false);
4051 mem
= force_const_mem (ptr_mode
, imm
);
4054 /* If we aren't generating PC relative literals, then
4055 we need to expand the literal pool access carefully.
4056 This is something that needs to be done in a number
4057 of places, so could well live as a separate function. */
4058 if (!aarch64_pcrelative_literal_loads
)
4060 gcc_assert (can_create_pseudo_p ());
4061 base
= gen_reg_rtx (ptr_mode
);
4062 aarch64_expand_mov_immediate (base
, XEXP (mem
, 0));
4063 if (ptr_mode
!= Pmode
)
4064 base
= convert_memory_address (Pmode
, base
);
4065 mem
= gen_rtx_MEM (ptr_mode
, base
);
4068 if (int_mode
!= ptr_mode
)
4069 mem
= gen_rtx_ZERO_EXTEND (int_mode
, mem
);
4071 emit_insn (gen_rtx_SET (dest
, mem
));
4075 case SYMBOL_SMALL_TLSGD
:
4076 case SYMBOL_SMALL_TLSDESC
:
4077 case SYMBOL_SMALL_TLSIE
:
4078 case SYMBOL_SMALL_GOT_28K
:
4079 case SYMBOL_SMALL_GOT_4G
:
4080 case SYMBOL_TINY_GOT
:
4081 case SYMBOL_TINY_TLSIE
:
4082 if (const_offset
!= 0)
4084 gcc_assert(can_create_pseudo_p ());
4085 base
= aarch64_force_temporary (int_mode
, dest
, base
);
4086 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
4087 NULL_RTX
, NULL_RTX
, false);
4092 case SYMBOL_SMALL_ABSOLUTE
:
4093 case SYMBOL_TINY_ABSOLUTE
:
4094 case SYMBOL_TLSLE12
:
4095 case SYMBOL_TLSLE24
:
4096 case SYMBOL_TLSLE32
:
4097 case SYMBOL_TLSLE48
:
4098 aarch64_load_symref_appropriately (dest
, imm
, sty
);
4106 if (!CONST_INT_P (imm
))
4108 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
)
4110 /* Only the low bit of each .H, .S and .D element is defined,
4111 so we can set the upper bits to whatever we like. If the
4112 predicate is all-true in MODE, prefer to set all the undefined
4113 bits as well, so that we can share a single .B predicate for
4115 if (imm
== CONSTM1_RTX (mode
))
4116 imm
= CONSTM1_RTX (VNx16BImode
);
4118 /* All methods for constructing predicate modes wider than VNx16BI
4119 will set the upper bits of each element to zero. Expose this
4120 by moving such constants as a VNx16BI, so that all bits are
4121 significant and so that constants for different modes can be
4122 shared. The wider constant will still be available as a
4124 rtx_vector_builder builder
;
4125 if (aarch64_get_sve_pred_bits (builder
, imm
))
4127 rtx res
= aarch64_expand_sve_const_pred (dest
, builder
);
4129 emit_move_insn (dest
, gen_lowpart (mode
, res
));
4134 if (GET_CODE (imm
) == HIGH
4135 || aarch64_simd_valid_immediate (imm
, NULL
))
4137 emit_insn (gen_rtx_SET (dest
, imm
));
4141 if (GET_CODE (imm
) == CONST_VECTOR
&& aarch64_sve_data_mode_p (mode
))
4142 if (rtx res
= aarch64_expand_sve_const_vector (dest
, imm
))
4145 emit_insn (gen_aarch64_sve_reinterpret (mode
, dest
, res
));
4149 rtx mem
= force_const_mem (mode
, imm
);
4151 emit_move_insn (dest
, mem
);
4155 aarch64_internal_mov_immediate (dest
, imm
, true,
4156 as_a
<scalar_int_mode
> (mode
));
4159 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
4160 that is known to contain PTRUE. */
4163 aarch64_emit_sve_pred_move (rtx dest
, rtx pred
, rtx src
)
4165 expand_operand ops
[3];
4166 machine_mode mode
= GET_MODE (dest
);
4167 create_output_operand (&ops
[0], dest
, mode
);
4168 create_input_operand (&ops
[1], pred
, GET_MODE(pred
));
4169 create_input_operand (&ops
[2], src
, mode
);
4170 temporary_volatile_ok
v (true);
4171 expand_insn (code_for_aarch64_pred_mov (mode
), 3, ops
);
4174 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
4175 operand is in memory. In this case we need to use the predicated LD1
4176 and ST1 instead of LDR and STR, both for correctness on big-endian
4177 targets and because LD1 and ST1 support a wider range of addressing modes.
4178 PRED_MODE is the mode of the predicate.
4180 See the comment at the head of aarch64-sve.md for details about the
4181 big-endian handling. */
4184 aarch64_expand_sve_mem_move (rtx dest
, rtx src
, machine_mode pred_mode
)
4186 machine_mode mode
= GET_MODE (dest
);
4187 rtx ptrue
= aarch64_ptrue_reg (pred_mode
);
4188 if (!register_operand (src
, mode
)
4189 && !register_operand (dest
, mode
))
4191 rtx tmp
= gen_reg_rtx (mode
);
4193 aarch64_emit_sve_pred_move (tmp
, ptrue
, src
);
4195 emit_move_insn (tmp
, src
);
4198 aarch64_emit_sve_pred_move (dest
, ptrue
, src
);
4201 /* Called only on big-endian targets. See whether an SVE vector move
4202 from SRC to DEST is effectively a REV[BHW] instruction, because at
4203 least one operand is a subreg of an SVE vector that has wider or
4204 narrower elements. Return true and emit the instruction if so.
4208 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
4210 represents a VIEW_CONVERT between the following vectors, viewed
4213 R2: { [0].high, [0].low, [1].high, [1].low, ... }
4214 R1: { [0], [1], [2], [3], ... }
4216 The high part of lane X in R2 should therefore correspond to lane X*2
4217 of R1, but the register representations are:
4220 R2: ...... [1].high [1].low [0].high [0].low
4221 R1: ...... [3] [2] [1] [0]
4223 where the low part of lane X in R2 corresponds to lane X*2 in R1.
4224 We therefore need a reverse operation to swap the high and low values
4227 This is purely an optimization. Without it we would spill the
4228 subreg operand to the stack in one mode and reload it in the
4229 other mode, which has the same effect as the REV. */
4232 aarch64_maybe_expand_sve_subreg_move (rtx dest
, rtx src
)
4234 gcc_assert (BYTES_BIG_ENDIAN
);
4235 if (GET_CODE (dest
) == SUBREG
)
4236 dest
= SUBREG_REG (dest
);
4237 if (GET_CODE (src
) == SUBREG
)
4238 src
= SUBREG_REG (src
);
4240 /* The optimization handles two single SVE REGs with different element
4244 || aarch64_classify_vector_mode (GET_MODE (dest
)) != VEC_SVE_DATA
4245 || aarch64_classify_vector_mode (GET_MODE (src
)) != VEC_SVE_DATA
4246 || (GET_MODE_UNIT_SIZE (GET_MODE (dest
))
4247 == GET_MODE_UNIT_SIZE (GET_MODE (src
))))
4250 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
4251 rtx ptrue
= aarch64_ptrue_reg (VNx16BImode
);
4252 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (dest
), gen_rtvec (2, ptrue
, src
),
4254 emit_insn (gen_rtx_SET (dest
, unspec
));
4258 /* Return a copy of X with mode MODE, without changing its other
4259 attributes. Unlike gen_lowpart, this doesn't care whether the
4260 mode change is valid. */
4263 aarch64_replace_reg_mode (rtx x
, machine_mode mode
)
4265 if (GET_MODE (x
) == mode
)
4268 x
= shallow_copy_rtx (x
);
4269 set_mode_and_regno (x
, mode
, REGNO (x
));
4273 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
4277 aarch64_split_sve_subreg_move (rtx dest
, rtx ptrue
, rtx src
)
4279 /* Decide which REV operation we need. The mode with narrower elements
4280 determines the mode of the operands and the mode with the wider
4281 elements determines the reverse width. */
4282 machine_mode mode_with_wider_elts
= GET_MODE (dest
);
4283 machine_mode mode_with_narrower_elts
= GET_MODE (src
);
4284 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts
)
4285 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts
))
4286 std::swap (mode_with_wider_elts
, mode_with_narrower_elts
);
4288 unsigned int wider_bytes
= GET_MODE_UNIT_SIZE (mode_with_wider_elts
);
4289 unsigned int unspec
;
4290 if (wider_bytes
== 8)
4291 unspec
= UNSPEC_REV64
;
4292 else if (wider_bytes
== 4)
4293 unspec
= UNSPEC_REV32
;
4294 else if (wider_bytes
== 2)
4295 unspec
= UNSPEC_REV16
;
4298 machine_mode pred_mode
= aarch64_sve_pred_mode (wider_bytes
).require ();
4302 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)] UNSPEC_PRED_X))
4304 with the appropriate modes. */
4305 ptrue
= gen_lowpart (pred_mode
, ptrue
);
4306 dest
= aarch64_replace_reg_mode (dest
, mode_with_narrower_elts
);
4307 src
= aarch64_replace_reg_mode (src
, mode_with_narrower_elts
);
4308 src
= gen_rtx_UNSPEC (mode_with_narrower_elts
, gen_rtvec (1, src
), unspec
);
4309 src
= gen_rtx_UNSPEC (mode_with_narrower_elts
, gen_rtvec (2, ptrue
, src
),
4311 emit_insn (gen_rtx_SET (dest
, src
));
4315 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED
,
4316 tree exp ATTRIBUTE_UNUSED
)
4318 if (aarch64_simd_decl_p (cfun
->decl
) != aarch64_simd_decl_p (decl
))
4324 /* Implement TARGET_PASS_BY_REFERENCE. */
4327 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED
,
4330 bool named ATTRIBUTE_UNUSED
)
4333 machine_mode dummymode
;
4336 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
4337 if (mode
== BLKmode
&& type
)
4338 size
= int_size_in_bytes (type
);
4340 /* No frontends can create types with variable-sized modes, so we
4341 shouldn't be asked to pass or return them. */
4342 size
= GET_MODE_SIZE (mode
).to_constant ();
4344 /* Aggregates are passed by reference based on their size. */
4345 if (type
&& AGGREGATE_TYPE_P (type
))
4347 size
= int_size_in_bytes (type
);
4350 /* Variable sized arguments are always returned by reference. */
4354 /* Can this be a candidate to be passed in fp/simd register(s)? */
4355 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
4360 /* Arguments which are variable sized or larger than 2 registers are
4361 passed by reference unless they are a homogenous floating point
4363 return size
> 2 * UNITS_PER_WORD
;
4366 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4368 aarch64_return_in_msb (const_tree valtype
)
4370 machine_mode dummy_mode
;
4373 /* Never happens in little-endian mode. */
4374 if (!BYTES_BIG_ENDIAN
)
4377 /* Only composite types smaller than or equal to 16 bytes can
4378 be potentially returned in registers. */
4379 if (!aarch64_composite_type_p (valtype
, TYPE_MODE (valtype
))
4380 || int_size_in_bytes (valtype
) <= 0
4381 || int_size_in_bytes (valtype
) > 16)
4384 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4385 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4386 is always passed/returned in the least significant bits of fp/simd
4388 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype
), valtype
,
4389 &dummy_mode
, &dummy_int
, NULL
))
4395 /* Implement TARGET_FUNCTION_VALUE.
4396 Define how to find the value returned by a function. */
4399 aarch64_function_value (const_tree type
, const_tree func
,
4400 bool outgoing ATTRIBUTE_UNUSED
)
4405 machine_mode ag_mode
;
4407 mode
= TYPE_MODE (type
);
4408 if (INTEGRAL_TYPE_P (type
))
4409 mode
= promote_function_mode (type
, mode
, &unsignedp
, func
, 1);
4411 if (aarch64_return_in_msb (type
))
4413 HOST_WIDE_INT size
= int_size_in_bytes (type
);
4415 if (size
% UNITS_PER_WORD
!= 0)
4417 size
+= UNITS_PER_WORD
- size
% UNITS_PER_WORD
;
4418 mode
= int_mode_for_size (size
* BITS_PER_UNIT
, 0).require ();
4422 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
4423 &ag_mode
, &count
, NULL
))
4425 if (!aarch64_composite_type_p (type
, mode
))
4427 gcc_assert (count
== 1 && mode
== ag_mode
);
4428 return gen_rtx_REG (mode
, V0_REGNUM
);
4435 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (count
));
4436 for (i
= 0; i
< count
; i
++)
4438 rtx tmp
= gen_rtx_REG (ag_mode
, V0_REGNUM
+ i
);
4439 rtx offset
= gen_int_mode (i
* GET_MODE_SIZE (ag_mode
), Pmode
);
4440 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
4441 XVECEXP (par
, 0, i
) = tmp
;
4447 return gen_rtx_REG (mode
, R0_REGNUM
);
4450 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
4451 Return true if REGNO is the number of a hard register in which the values
4452 of called function may come back. */
4455 aarch64_function_value_regno_p (const unsigned int regno
)
4457 /* Maximum of 16 bytes can be returned in the general registers. Examples
4458 of 16-byte return values are: 128-bit integers and 16-byte small
4459 structures (excluding homogeneous floating-point aggregates). */
4460 if (regno
== R0_REGNUM
|| regno
== R1_REGNUM
)
4463 /* Up to four fp/simd registers can return a function value, e.g. a
4464 homogeneous floating-point aggregate having four members. */
4465 if (regno
>= V0_REGNUM
&& regno
< V0_REGNUM
+ HA_MAX_NUM_FLDS
)
4466 return TARGET_FLOAT
;
4471 /* Implement TARGET_RETURN_IN_MEMORY.
4473 If the type T of the result of a function is such that
4475 would require that arg be passed as a value in a register (or set of
4476 registers) according to the parameter passing rules, then the result
4477 is returned in the same registers as would be used for such an
4481 aarch64_return_in_memory (const_tree type
, const_tree fndecl ATTRIBUTE_UNUSED
)
4484 machine_mode ag_mode
;
4487 if (!AGGREGATE_TYPE_P (type
)
4488 && TREE_CODE (type
) != COMPLEX_TYPE
4489 && TREE_CODE (type
) != VECTOR_TYPE
)
4490 /* Simple scalar types always returned in registers. */
4493 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
),
4500 /* Types larger than 2 registers returned in memory. */
4501 size
= int_size_in_bytes (type
);
4502 return (size
< 0 || size
> 2 * UNITS_PER_WORD
);
4506 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v
, machine_mode mode
,
4507 const_tree type
, int *nregs
)
4509 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
4510 return aarch64_vfp_is_call_or_return_candidate (mode
,
4512 &pcum
->aapcs_vfp_rmode
,
4517 /* Given MODE and TYPE of a function argument, return the alignment in
4518 bits. The idea is to suppress any stronger alignment requested by
4519 the user and opt for the natural alignment (specified in AAPCS64 \S
4520 4.1). ABI_BREAK is set to true if the alignment was incorrectly
4521 calculated in versions of GCC prior to GCC-9. This is a helper
4522 function for local use only. */
4525 aarch64_function_arg_alignment (machine_mode mode
, const_tree type
,
4530 return GET_MODE_ALIGNMENT (mode
);
4532 if (integer_zerop (TYPE_SIZE (type
)))
4535 gcc_assert (TYPE_MODE (type
) == mode
);
4537 if (!AGGREGATE_TYPE_P (type
))
4538 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type
));
4540 if (TREE_CODE (type
) == ARRAY_TYPE
)
4541 return TYPE_ALIGN (TREE_TYPE (type
));
4543 unsigned int alignment
= 0;
4544 unsigned int bitfield_alignment
= 0;
4545 for (tree field
= TYPE_FIELDS (type
); field
; field
= DECL_CHAIN (field
))
4546 if (TREE_CODE (field
) == FIELD_DECL
)
4548 alignment
= std::max (alignment
, DECL_ALIGN (field
));
4549 if (DECL_BIT_FIELD_TYPE (field
))
4551 = std::max (bitfield_alignment
,
4552 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field
)));
4555 if (bitfield_alignment
> alignment
)
4558 return bitfield_alignment
;
4564 /* Layout a function argument according to the AAPCS64 rules. The rule
4565 numbers refer to the rule numbers in the AAPCS64. */
4568 aarch64_layout_arg (cumulative_args_t pcum_v
, machine_mode mode
,
4570 bool named ATTRIBUTE_UNUSED
)
4572 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
4573 int ncrn
, nvrn
, nregs
;
4574 bool allocate_ncrn
, allocate_nvrn
;
4578 /* We need to do this once per argument. */
4579 if (pcum
->aapcs_arg_processed
)
4582 pcum
->aapcs_arg_processed
= true;
4584 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
4586 size
= int_size_in_bytes (type
);
4588 /* No frontends can create types with variable-sized modes, so we
4589 shouldn't be asked to pass or return them. */
4590 size
= GET_MODE_SIZE (mode
).to_constant ();
4591 size
= ROUND_UP (size
, UNITS_PER_WORD
);
4593 allocate_ncrn
= (type
) ? !(FLOAT_TYPE_P (type
)) : !FLOAT_MODE_P (mode
);
4594 allocate_nvrn
= aarch64_vfp_is_call_candidate (pcum_v
,
4599 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
4600 The following code thus handles passing by SIMD/FP registers first. */
4602 nvrn
= pcum
->aapcs_nvrn
;
4604 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
4605 and homogenous short-vector aggregates (HVA). */
4609 aarch64_err_no_fpadvsimd (mode
);
4611 if (nvrn
+ nregs
<= NUM_FP_ARG_REGS
)
4613 pcum
->aapcs_nextnvrn
= nvrn
+ nregs
;
4614 if (!aarch64_composite_type_p (type
, mode
))
4616 gcc_assert (nregs
== 1);
4617 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
4623 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
4624 for (i
= 0; i
< nregs
; i
++)
4626 rtx tmp
= gen_rtx_REG (pcum
->aapcs_vfp_rmode
,
4627 V0_REGNUM
+ nvrn
+ i
);
4628 rtx offset
= gen_int_mode
4629 (i
* GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
), Pmode
);
4630 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
4631 XVECEXP (par
, 0, i
) = tmp
;
4633 pcum
->aapcs_reg
= par
;
4639 /* C.3 NSRN is set to 8. */
4640 pcum
->aapcs_nextnvrn
= NUM_FP_ARG_REGS
;
4645 ncrn
= pcum
->aapcs_ncrn
;
4646 nregs
= size
/ UNITS_PER_WORD
;
4648 /* C6 - C9. though the sign and zero extension semantics are
4649 handled elsewhere. This is the case where the argument fits
4650 entirely general registers. */
4651 if (allocate_ncrn
&& (ncrn
+ nregs
<= NUM_ARG_REGS
))
4653 gcc_assert (nregs
== 0 || nregs
== 1 || nregs
== 2);
4655 /* C.8 if the argument has an alignment of 16 then the NGRN is
4656 rounded up to the next even number. */
4659 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
4660 comparison is there because for > 16 * BITS_PER_UNIT
4661 alignment nregs should be > 2 and therefore it should be
4662 passed by reference rather than value. */
4663 && (aarch64_function_arg_alignment (mode
, type
, &abi_break
)
4664 == 16 * BITS_PER_UNIT
))
4666 if (abi_break
&& warn_psabi
&& currently_expanding_gimple_stmt
)
4667 inform (input_location
, "parameter passing for argument of type "
4668 "%qT changed in GCC 9.1", type
);
4670 gcc_assert (ncrn
+ nregs
<= NUM_ARG_REGS
);
4673 /* NREGS can be 0 when e.g. an empty structure is to be passed.
4674 A reg is still generated for it, but the caller should be smart
4675 enough not to use it. */
4676 if (nregs
== 0 || nregs
== 1 || GET_MODE_CLASS (mode
) == MODE_INT
)
4677 pcum
->aapcs_reg
= gen_rtx_REG (mode
, R0_REGNUM
+ ncrn
);
4683 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
4684 for (i
= 0; i
< nregs
; i
++)
4686 rtx tmp
= gen_rtx_REG (word_mode
, R0_REGNUM
+ ncrn
+ i
);
4687 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
4688 GEN_INT (i
* UNITS_PER_WORD
));
4689 XVECEXP (par
, 0, i
) = tmp
;
4691 pcum
->aapcs_reg
= par
;
4694 pcum
->aapcs_nextncrn
= ncrn
+ nregs
;
4699 pcum
->aapcs_nextncrn
= NUM_ARG_REGS
;
4701 /* The argument is passed on stack; record the needed number of words for
4702 this argument and align the total size if necessary. */
4704 pcum
->aapcs_stack_words
= size
/ UNITS_PER_WORD
;
4706 if (aarch64_function_arg_alignment (mode
, type
, &abi_break
)
4707 == 16 * BITS_PER_UNIT
)
4709 int new_size
= ROUND_UP (pcum
->aapcs_stack_size
, 16 / UNITS_PER_WORD
);
4710 if (pcum
->aapcs_stack_size
!= new_size
)
4712 if (abi_break
&& warn_psabi
&& currently_expanding_gimple_stmt
)
4713 inform (input_location
, "parameter passing for argument of type "
4714 "%qT changed in GCC 9.1", type
);
4715 pcum
->aapcs_stack_size
= new_size
;
4721 /* Implement TARGET_FUNCTION_ARG. */
4724 aarch64_function_arg (cumulative_args_t pcum_v
, machine_mode mode
,
4725 const_tree type
, bool named
)
4727 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
4728 gcc_assert (pcum
->pcs_variant
== ARM_PCS_AAPCS64
);
4730 if (mode
== VOIDmode
)
4733 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
4734 return pcum
->aapcs_reg
;
4738 aarch64_init_cumulative_args (CUMULATIVE_ARGS
*pcum
,
4739 const_tree fntype ATTRIBUTE_UNUSED
,
4740 rtx libname ATTRIBUTE_UNUSED
,
4741 const_tree fndecl ATTRIBUTE_UNUSED
,
4742 unsigned n_named ATTRIBUTE_UNUSED
)
4744 pcum
->aapcs_ncrn
= 0;
4745 pcum
->aapcs_nvrn
= 0;
4746 pcum
->aapcs_nextncrn
= 0;
4747 pcum
->aapcs_nextnvrn
= 0;
4748 pcum
->pcs_variant
= ARM_PCS_AAPCS64
;
4749 pcum
->aapcs_reg
= NULL_RTX
;
4750 pcum
->aapcs_arg_processed
= false;
4751 pcum
->aapcs_stack_words
= 0;
4752 pcum
->aapcs_stack_size
= 0;
4755 && fndecl
&& TREE_PUBLIC (fndecl
)
4756 && fntype
&& fntype
!= error_mark_node
)
4758 const_tree type
= TREE_TYPE (fntype
);
4759 machine_mode mode ATTRIBUTE_UNUSED
; /* To pass pointer as argument. */
4760 int nregs ATTRIBUTE_UNUSED
; /* Likewise. */
4761 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
), type
,
4762 &mode
, &nregs
, NULL
))
4763 aarch64_err_no_fpadvsimd (TYPE_MODE (type
));
4769 aarch64_function_arg_advance (cumulative_args_t pcum_v
,
4774 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
4775 if (pcum
->pcs_variant
== ARM_PCS_AAPCS64
)
4777 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
4778 gcc_assert ((pcum
->aapcs_reg
!= NULL_RTX
)
4779 != (pcum
->aapcs_stack_words
!= 0));
4780 pcum
->aapcs_arg_processed
= false;
4781 pcum
->aapcs_ncrn
= pcum
->aapcs_nextncrn
;
4782 pcum
->aapcs_nvrn
= pcum
->aapcs_nextnvrn
;
4783 pcum
->aapcs_stack_size
+= pcum
->aapcs_stack_words
;
4784 pcum
->aapcs_stack_words
= 0;
4785 pcum
->aapcs_reg
= NULL_RTX
;
4790 aarch64_function_arg_regno_p (unsigned regno
)
4792 return ((GP_REGNUM_P (regno
) && regno
< R0_REGNUM
+ NUM_ARG_REGS
)
4793 || (FP_REGNUM_P (regno
) && regno
< V0_REGNUM
+ NUM_FP_ARG_REGS
));
4796 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4797 PARM_BOUNDARY bits of alignment, but will be given anything up
4798 to STACK_BOUNDARY bits if the type requires it. This makes sure
4799 that both before and after the layout of each argument, the Next
4800 Stacked Argument Address (NSAA) will have a minimum alignment of
4804 aarch64_function_arg_boundary (machine_mode mode
, const_tree type
)
4807 unsigned int alignment
= aarch64_function_arg_alignment (mode
, type
,
4809 if (abi_break
& warn_psabi
)
4810 inform (input_location
, "parameter passing for argument of type "
4811 "%qT changed in GCC 9.1", type
);
4813 return MIN (MAX (alignment
, PARM_BOUNDARY
), STACK_BOUNDARY
);
4816 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4818 static fixed_size_mode
4819 aarch64_get_reg_raw_mode (int regno
)
4821 if (TARGET_SVE
&& FP_REGNUM_P (regno
))
4822 /* Don't use the SVE part of the register for __builtin_apply and
4823 __builtin_return. The SVE registers aren't used by the normal PCS,
4824 so using them there would be a waste of time. The PCS extensions
4825 for SVE types are fundamentally incompatible with the
4826 __builtin_return/__builtin_apply interface. */
4827 return as_a
<fixed_size_mode
> (V16QImode
);
4828 return default_get_reg_raw_mode (regno
);
4831 /* Implement TARGET_FUNCTION_ARG_PADDING.
4833 Small aggregate types are placed in the lowest memory address.
4835 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4837 static pad_direction
4838 aarch64_function_arg_padding (machine_mode mode
, const_tree type
)
4840 /* On little-endian targets, the least significant byte of every stack
4841 argument is passed at the lowest byte address of the stack slot. */
4842 if (!BYTES_BIG_ENDIAN
)
4845 /* Otherwise, integral, floating-point and pointer types are padded downward:
4846 the least significant byte of a stack argument is passed at the highest
4847 byte address of the stack slot. */
4849 ? (INTEGRAL_TYPE_P (type
) || SCALAR_FLOAT_TYPE_P (type
)
4850 || POINTER_TYPE_P (type
))
4851 : (SCALAR_INT_MODE_P (mode
) || SCALAR_FLOAT_MODE_P (mode
)))
4852 return PAD_DOWNWARD
;
4854 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4858 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4860 It specifies padding for the last (may also be the only)
4861 element of a block move between registers and memory. If
4862 assuming the block is in the memory, padding upward means that
4863 the last element is padded after its highest significant byte,
4864 while in downward padding, the last element is padded at the
4865 its least significant byte side.
4867 Small aggregates and small complex types are always padded
4870 We don't need to worry about homogeneous floating-point or
4871 short-vector aggregates; their move is not affected by the
4872 padding direction determined here. Regardless of endianness,
4873 each element of such an aggregate is put in the least
4874 significant bits of a fp/simd register.
4876 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4877 register has useful data, and return the opposite if the most
4878 significant byte does. */
4881 aarch64_pad_reg_upward (machine_mode mode
, const_tree type
,
4882 bool first ATTRIBUTE_UNUSED
)
4885 /* Small composite types are always padded upward. */
4886 if (BYTES_BIG_ENDIAN
&& aarch64_composite_type_p (type
, mode
))
4890 size
= int_size_in_bytes (type
);
4892 /* No frontends can create types with variable-sized modes, so we
4893 shouldn't be asked to pass or return them. */
4894 size
= GET_MODE_SIZE (mode
).to_constant ();
4895 if (size
< 2 * UNITS_PER_WORD
)
4899 /* Otherwise, use the default padding. */
4900 return !BYTES_BIG_ENDIAN
;
4903 static scalar_int_mode
4904 aarch64_libgcc_cmp_return_mode (void)
4909 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4911 /* We use the 12-bit shifted immediate arithmetic instructions so values
4912 must be multiple of (1 << 12), i.e. 4096. */
4913 #define ARITH_FACTOR 4096
4915 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4916 #error Cannot use simple address calculation for stack probing
4919 /* The pair of scratch registers used for stack probing. */
4920 #define PROBE_STACK_FIRST_REG R9_REGNUM
4921 #define PROBE_STACK_SECOND_REG R10_REGNUM
4923 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4924 inclusive. These are offsets from the current stack pointer. */
4927 aarch64_emit_probe_stack_range (HOST_WIDE_INT first
, poly_int64 poly_size
)
4930 if (!poly_size
.is_constant (&size
))
4932 sorry ("stack probes for SVE frames");
4936 rtx reg1
= gen_rtx_REG (Pmode
, PROBE_STACK_FIRST_REG
);
4938 /* See the same assertion on PROBE_INTERVAL above. */
4939 gcc_assert ((first
% ARITH_FACTOR
) == 0);
4941 /* See if we have a constant small number of probes to generate. If so,
4942 that's the easy case. */
4943 if (size
<= PROBE_INTERVAL
)
4945 const HOST_WIDE_INT base
= ROUND_UP (size
, ARITH_FACTOR
);
4947 emit_set_insn (reg1
,
4948 plus_constant (Pmode
,
4949 stack_pointer_rtx
, -(first
+ base
)));
4950 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- size
));
4953 /* The run-time loop is made up of 8 insns in the generic case while the
4954 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
4955 else if (size
<= 4 * PROBE_INTERVAL
)
4957 HOST_WIDE_INT i
, rem
;
4959 emit_set_insn (reg1
,
4960 plus_constant (Pmode
,
4962 -(first
+ PROBE_INTERVAL
)));
4963 emit_stack_probe (reg1
);
4965 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4966 it exceeds SIZE. If only two probes are needed, this will not
4967 generate any code. Then probe at FIRST + SIZE. */
4968 for (i
= 2 * PROBE_INTERVAL
; i
< size
; i
+= PROBE_INTERVAL
)
4970 emit_set_insn (reg1
,
4971 plus_constant (Pmode
, reg1
, -PROBE_INTERVAL
));
4972 emit_stack_probe (reg1
);
4975 rem
= size
- (i
- PROBE_INTERVAL
);
4978 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
4980 emit_set_insn (reg1
, plus_constant (Pmode
, reg1
, -base
));
4981 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- rem
));
4984 emit_stack_probe (plus_constant (Pmode
, reg1
, -rem
));
4987 /* Otherwise, do the same as above, but in a loop. Note that we must be
4988 extra careful with variables wrapping around because we might be at
4989 the very top (or the very bottom) of the address space and we have
4990 to be able to handle this case properly; in particular, we use an
4991 equality test for the loop condition. */
4994 rtx reg2
= gen_rtx_REG (Pmode
, PROBE_STACK_SECOND_REG
);
4996 /* Step 1: round SIZE to the previous multiple of the interval. */
4998 HOST_WIDE_INT rounded_size
= size
& -PROBE_INTERVAL
;
5001 /* Step 2: compute initial and final value of the loop counter. */
5003 /* TEST_ADDR = SP + FIRST. */
5004 emit_set_insn (reg1
,
5005 plus_constant (Pmode
, stack_pointer_rtx
, -first
));
5007 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
5008 HOST_WIDE_INT adjustment
= - (first
+ rounded_size
);
5009 if (! aarch64_uimm12_shift (adjustment
))
5011 aarch64_internal_mov_immediate (reg2
, GEN_INT (adjustment
),
5013 emit_set_insn (reg2
, gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, reg2
));
5016 emit_set_insn (reg2
,
5017 plus_constant (Pmode
, stack_pointer_rtx
, adjustment
));
5023 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
5026 while (TEST_ADDR != LAST_ADDR)
5028 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
5029 until it is equal to ROUNDED_SIZE. */
5031 emit_insn (gen_probe_stack_range (reg1
, reg1
, reg2
));
5034 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
5035 that SIZE is equal to ROUNDED_SIZE. */
5037 if (size
!= rounded_size
)
5039 HOST_WIDE_INT rem
= size
- rounded_size
;
5043 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
5045 emit_set_insn (reg2
, plus_constant (Pmode
, reg2
, -base
));
5046 emit_stack_probe (plus_constant (Pmode
, reg2
, base
- rem
));
5049 emit_stack_probe (plus_constant (Pmode
, reg2
, -rem
));
5053 /* Make sure nothing is scheduled before we are done. */
5054 emit_insn (gen_blockage ());
5057 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
5058 absolute addresses. */
5061 aarch64_output_probe_stack_range (rtx reg1
, rtx reg2
)
5063 static int labelno
= 0;
5067 ASM_GENERATE_INTERNAL_LABEL (loop_lab
, "LPSRL", labelno
++);
5070 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_lab
);
5072 HOST_WIDE_INT stack_clash_probe_interval
5073 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
);
5075 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
5077 HOST_WIDE_INT interval
;
5078 if (flag_stack_clash_protection
)
5079 interval
= stack_clash_probe_interval
;
5081 interval
= PROBE_INTERVAL
;
5083 gcc_assert (aarch64_uimm12_shift (interval
));
5084 xops
[1] = GEN_INT (interval
);
5086 output_asm_insn ("sub\t%0, %0, %1", xops
);
5088 /* If doing stack clash protection then we probe up by the ABI specified
5089 amount. We do this because we're dropping full pages at a time in the
5090 loop. But if we're doing non-stack clash probing, probe at SP 0. */
5091 if (flag_stack_clash_protection
)
5092 xops
[1] = GEN_INT (STACK_CLASH_CALLER_GUARD
);
5094 xops
[1] = CONST0_RTX (GET_MODE (xops
[1]));
5096 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
5097 by this amount for each iteration. */
5098 output_asm_insn ("str\txzr, [%0, %1]", xops
);
5100 /* Test if TEST_ADDR == LAST_ADDR. */
5102 output_asm_insn ("cmp\t%0, %1", xops
);
5105 fputs ("\tb.ne\t", asm_out_file
);
5106 assemble_name_raw (asm_out_file
, loop_lab
);
5107 fputc ('\n', asm_out_file
);
5112 /* Emit the probe loop for doing stack clash probes and stack adjustments for
5113 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
5114 of GUARD_SIZE. When a probe is emitted it is done at most
5115 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
5116 at most MIN_PROBE_THRESHOLD. By the end of this function
5117 BASE = BASE - ADJUSTMENT. */
5120 aarch64_output_probe_sve_stack_clash (rtx base
, rtx adjustment
,
5121 rtx min_probe_threshold
, rtx guard_size
)
5123 /* This function is not allowed to use any instruction generation function
5124 like gen_ and friends. If you do you'll likely ICE during CFG validation,
5125 so instead emit the code you want using output_asm_insn. */
5126 gcc_assert (flag_stack_clash_protection
);
5127 gcc_assert (CONST_INT_P (min_probe_threshold
) && CONST_INT_P (guard_size
));
5128 gcc_assert (INTVAL (guard_size
) > INTVAL (min_probe_threshold
));
5130 /* The minimum required allocation before the residual requires probing. */
5131 HOST_WIDE_INT residual_probe_guard
= INTVAL (min_probe_threshold
);
5133 /* Clamp the value down to the nearest value that can be used with a cmp. */
5134 residual_probe_guard
= aarch64_clamp_to_uimm12_shift (residual_probe_guard
);
5135 rtx probe_offset_value_rtx
= gen_int_mode (residual_probe_guard
, Pmode
);
5137 gcc_assert (INTVAL (min_probe_threshold
) >= residual_probe_guard
);
5138 gcc_assert (aarch64_uimm12_shift (residual_probe_guard
));
5140 static int labelno
= 0;
5141 char loop_start_lab
[32];
5142 char loop_end_lab
[32];
5145 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab
, "SVLPSPL", labelno
);
5146 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab
, "SVLPEND", labelno
++);
5148 /* Emit loop start label. */
5149 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_start_lab
);
5151 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
5152 xops
[0] = adjustment
;
5153 xops
[1] = probe_offset_value_rtx
;
5154 output_asm_insn ("cmp\t%0, %1", xops
);
5156 /* Branch to end if not enough adjustment to probe. */
5157 fputs ("\tb.lt\t", asm_out_file
);
5158 assemble_name_raw (asm_out_file
, loop_end_lab
);
5159 fputc ('\n', asm_out_file
);
5161 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
5163 xops
[1] = probe_offset_value_rtx
;
5164 output_asm_insn ("sub\t%0, %0, %1", xops
);
5166 /* Probe at BASE. */
5167 xops
[1] = const0_rtx
;
5168 output_asm_insn ("str\txzr, [%0, %1]", xops
);
5170 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
5171 xops
[0] = adjustment
;
5172 xops
[1] = probe_offset_value_rtx
;
5173 output_asm_insn ("sub\t%0, %0, %1", xops
);
5175 /* Branch to start if still more bytes to allocate. */
5176 fputs ("\tb\t", asm_out_file
);
5177 assemble_name_raw (asm_out_file
, loop_start_lab
);
5178 fputc ('\n', asm_out_file
);
5180 /* No probe leave. */
5181 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_end_lab
);
5183 /* BASE = BASE - ADJUSTMENT. */
5185 xops
[1] = adjustment
;
5186 output_asm_insn ("sub\t%0, %0, %1", xops
);
5190 /* Determine whether a frame chain needs to be generated. */
5192 aarch64_needs_frame_chain (void)
5194 /* Force a frame chain for EH returns so the return address is at FP+8. */
5195 if (frame_pointer_needed
|| crtl
->calls_eh_return
)
5198 /* A leaf function cannot have calls or write LR. */
5199 bool is_leaf
= crtl
->is_leaf
&& !df_regs_ever_live_p (LR_REGNUM
);
5201 /* Don't use a frame chain in leaf functions if leaf frame pointers
5203 if (flag_omit_leaf_frame_pointer
&& is_leaf
)
5206 return aarch64_use_frame_pointer
;
5209 /* Mark the registers that need to be saved by the callee and calculate
5210 the size of the callee-saved registers area and frame record (both FP
5211 and LR may be omitted). */
5213 aarch64_layout_frame (void)
5215 HOST_WIDE_INT offset
= 0;
5216 int regno
, last_fp_reg
= INVALID_REGNUM
;
5217 bool simd_function
= aarch64_simd_decl_p (cfun
->decl
);
5219 cfun
->machine
->frame
.emit_frame_chain
= aarch64_needs_frame_chain ();
5221 /* Adjust the outgoing arguments size if required. Keep it in sync with what
5222 the mid-end is doing. */
5223 crtl
->outgoing_args_size
= STACK_DYNAMIC_OFFSET (cfun
);
5225 #define SLOT_NOT_REQUIRED (-2)
5226 #define SLOT_REQUIRED (-1)
5228 cfun
->machine
->frame
.wb_candidate1
= INVALID_REGNUM
;
5229 cfun
->machine
->frame
.wb_candidate2
= INVALID_REGNUM
;
5231 /* If this is a non-leaf simd function with calls we assume that
5232 at least one of those calls is to a non-simd function and thus
5233 we must save V8 to V23 in the prologue. */
5235 if (simd_function
&& !crtl
->is_leaf
)
5237 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
5238 if (FP_SIMD_SAVED_REGNUM_P (regno
))
5239 df_set_regs_ever_live (regno
, true);
5242 /* First mark all the registers that really need to be saved... */
5243 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
5244 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
5246 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
5247 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
5249 /* ... that includes the eh data registers (if needed)... */
5250 if (crtl
->calls_eh_return
)
5251 for (regno
= 0; EH_RETURN_DATA_REGNO (regno
) != INVALID_REGNUM
; regno
++)
5252 cfun
->machine
->frame
.reg_offset
[EH_RETURN_DATA_REGNO (regno
)]
5255 /* ... and any callee saved register that dataflow says is live. */
5256 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
5257 if (df_regs_ever_live_p (regno
)
5258 && (regno
== R30_REGNUM
5259 || !call_used_regs
[regno
]))
5260 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
5262 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
5263 if (df_regs_ever_live_p (regno
)
5264 && (!call_used_regs
[regno
]
5265 || (simd_function
&& FP_SIMD_SAVED_REGNUM_P (regno
))))
5267 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
5268 last_fp_reg
= regno
;
5271 if (cfun
->machine
->frame
.emit_frame_chain
)
5273 /* FP and LR are placed in the linkage record. */
5274 cfun
->machine
->frame
.reg_offset
[R29_REGNUM
] = 0;
5275 cfun
->machine
->frame
.wb_candidate1
= R29_REGNUM
;
5276 cfun
->machine
->frame
.reg_offset
[R30_REGNUM
] = UNITS_PER_WORD
;
5277 cfun
->machine
->frame
.wb_candidate2
= R30_REGNUM
;
5278 offset
= 2 * UNITS_PER_WORD
;
5281 /* With stack-clash, LR must be saved in non-leaf functions. */
5282 gcc_assert (crtl
->is_leaf
5283 || (cfun
->machine
->frame
.reg_offset
[R30_REGNUM
]
5284 != SLOT_NOT_REQUIRED
));
5286 /* Now assign stack slots for them. */
5287 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
5288 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
5290 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
5291 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
5292 cfun
->machine
->frame
.wb_candidate1
= regno
;
5293 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
)
5294 cfun
->machine
->frame
.wb_candidate2
= regno
;
5295 offset
+= UNITS_PER_WORD
;
5298 HOST_WIDE_INT max_int_offset
= offset
;
5299 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
5300 bool has_align_gap
= offset
!= max_int_offset
;
5302 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
5303 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
5305 /* If there is an alignment gap between integer and fp callee-saves,
5306 allocate the last fp register to it if possible. */
5307 if (regno
== last_fp_reg
5310 && (offset
& 8) == 0)
5312 cfun
->machine
->frame
.reg_offset
[regno
] = max_int_offset
;
5316 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
5317 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
5318 cfun
->machine
->frame
.wb_candidate1
= regno
;
5319 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
5320 && cfun
->machine
->frame
.wb_candidate1
>= V0_REGNUM
)
5321 cfun
->machine
->frame
.wb_candidate2
= regno
;
5322 offset
+= simd_function
? UNITS_PER_VREG
: UNITS_PER_WORD
;
5325 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
5327 cfun
->machine
->frame
.saved_regs_size
= offset
;
5329 HOST_WIDE_INT varargs_and_saved_regs_size
5330 = offset
+ cfun
->machine
->frame
.saved_varargs_size
;
5332 cfun
->machine
->frame
.hard_fp_offset
5333 = aligned_upper_bound (varargs_and_saved_regs_size
5334 + get_frame_size (),
5335 STACK_BOUNDARY
/ BITS_PER_UNIT
);
5337 /* Both these values are already aligned. */
5338 gcc_assert (multiple_p (crtl
->outgoing_args_size
,
5339 STACK_BOUNDARY
/ BITS_PER_UNIT
));
5340 cfun
->machine
->frame
.frame_size
5341 = (cfun
->machine
->frame
.hard_fp_offset
5342 + crtl
->outgoing_args_size
);
5344 cfun
->machine
->frame
.locals_offset
= cfun
->machine
->frame
.saved_varargs_size
;
5346 cfun
->machine
->frame
.initial_adjust
= 0;
5347 cfun
->machine
->frame
.final_adjust
= 0;
5348 cfun
->machine
->frame
.callee_adjust
= 0;
5349 cfun
->machine
->frame
.callee_offset
= 0;
5351 HOST_WIDE_INT max_push_offset
= 0;
5352 if (cfun
->machine
->frame
.wb_candidate2
!= INVALID_REGNUM
)
5353 max_push_offset
= 512;
5354 else if (cfun
->machine
->frame
.wb_candidate1
!= INVALID_REGNUM
)
5355 max_push_offset
= 256;
5357 HOST_WIDE_INT const_size
, const_fp_offset
;
5358 if (cfun
->machine
->frame
.frame_size
.is_constant (&const_size
)
5359 && const_size
< max_push_offset
5360 && known_eq (crtl
->outgoing_args_size
, 0))
5362 /* Simple, small frame with no outgoing arguments:
5363 stp reg1, reg2, [sp, -frame_size]!
5364 stp reg3, reg4, [sp, 16] */
5365 cfun
->machine
->frame
.callee_adjust
= const_size
;
5367 else if (known_lt (crtl
->outgoing_args_size
5368 + cfun
->machine
->frame
.saved_regs_size
, 512)
5369 && !(cfun
->calls_alloca
5370 && known_lt (cfun
->machine
->frame
.hard_fp_offset
,
5373 /* Frame with small outgoing arguments:
5374 sub sp, sp, frame_size
5375 stp reg1, reg2, [sp, outgoing_args_size]
5376 stp reg3, reg4, [sp, outgoing_args_size + 16] */
5377 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.frame_size
;
5378 cfun
->machine
->frame
.callee_offset
5379 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.hard_fp_offset
;
5381 else if (cfun
->machine
->frame
.hard_fp_offset
.is_constant (&const_fp_offset
)
5382 && const_fp_offset
< max_push_offset
)
5384 /* Frame with large outgoing arguments but a small local area:
5385 stp reg1, reg2, [sp, -hard_fp_offset]!
5386 stp reg3, reg4, [sp, 16]
5387 sub sp, sp, outgoing_args_size */
5388 cfun
->machine
->frame
.callee_adjust
= const_fp_offset
;
5389 cfun
->machine
->frame
.final_adjust
5390 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.callee_adjust
;
5394 /* Frame with large local area and outgoing arguments using frame pointer:
5395 sub sp, sp, hard_fp_offset
5396 stp x29, x30, [sp, 0]
5398 stp reg3, reg4, [sp, 16]
5399 sub sp, sp, outgoing_args_size */
5400 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.hard_fp_offset
;
5401 cfun
->machine
->frame
.final_adjust
5402 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.initial_adjust
;
5405 cfun
->machine
->frame
.laid_out
= true;
5408 /* Return true if the register REGNO is saved on entry to
5409 the current function. */
5412 aarch64_register_saved_on_entry (int regno
)
5414 return cfun
->machine
->frame
.reg_offset
[regno
] >= 0;
5417 /* Return the next register up from REGNO up to LIMIT for the callee
5421 aarch64_next_callee_save (unsigned regno
, unsigned limit
)
5423 while (regno
<= limit
&& !aarch64_register_saved_on_entry (regno
))
5428 /* Push the register number REGNO of mode MODE to the stack with write-back
5429 adjusting the stack by ADJUSTMENT. */
5432 aarch64_pushwb_single_reg (machine_mode mode
, unsigned regno
,
5433 HOST_WIDE_INT adjustment
)
5435 rtx base_rtx
= stack_pointer_rtx
;
5438 reg
= gen_rtx_REG (mode
, regno
);
5439 mem
= gen_rtx_PRE_MODIFY (Pmode
, base_rtx
,
5440 plus_constant (Pmode
, base_rtx
, -adjustment
));
5441 mem
= gen_frame_mem (mode
, mem
);
5443 insn
= emit_move_insn (mem
, reg
);
5444 RTX_FRAME_RELATED_P (insn
) = 1;
5447 /* Generate and return an instruction to store the pair of registers
5448 REG and REG2 of mode MODE to location BASE with write-back adjusting
5449 the stack location BASE by ADJUSTMENT. */
5452 aarch64_gen_storewb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
5453 HOST_WIDE_INT adjustment
)
5458 return gen_storewb_pairdi_di (base
, base
, reg
, reg2
,
5459 GEN_INT (-adjustment
),
5460 GEN_INT (UNITS_PER_WORD
- adjustment
));
5462 return gen_storewb_pairdf_di (base
, base
, reg
, reg2
,
5463 GEN_INT (-adjustment
),
5464 GEN_INT (UNITS_PER_WORD
- adjustment
));
5466 return gen_storewb_pairtf_di (base
, base
, reg
, reg2
,
5467 GEN_INT (-adjustment
),
5468 GEN_INT (UNITS_PER_VREG
- adjustment
));
5474 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
5475 stack pointer by ADJUSTMENT. */
5478 aarch64_push_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
)
5481 machine_mode mode
= aarch64_reg_save_mode (cfun
->decl
, regno1
);
5483 if (regno2
== INVALID_REGNUM
)
5484 return aarch64_pushwb_single_reg (mode
, regno1
, adjustment
);
5486 rtx reg1
= gen_rtx_REG (mode
, regno1
);
5487 rtx reg2
= gen_rtx_REG (mode
, regno2
);
5489 insn
= emit_insn (aarch64_gen_storewb_pair (mode
, stack_pointer_rtx
, reg1
,
5491 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 2)) = 1;
5492 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
5493 RTX_FRAME_RELATED_P (insn
) = 1;
5496 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
5497 adjusting it by ADJUSTMENT afterwards. */
5500 aarch64_gen_loadwb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
5501 HOST_WIDE_INT adjustment
)
5506 return gen_loadwb_pairdi_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
5507 GEN_INT (UNITS_PER_WORD
));
5509 return gen_loadwb_pairdf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
5510 GEN_INT (UNITS_PER_WORD
));
5512 return gen_loadwb_pairtf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
5513 GEN_INT (UNITS_PER_VREG
));
5519 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
5520 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
5524 aarch64_pop_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
,
5527 machine_mode mode
= aarch64_reg_save_mode (cfun
->decl
, regno1
);
5528 rtx reg1
= gen_rtx_REG (mode
, regno1
);
5530 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg1
, *cfi_ops
);
5532 if (regno2
== INVALID_REGNUM
)
5534 rtx mem
= plus_constant (Pmode
, stack_pointer_rtx
, adjustment
);
5535 mem
= gen_rtx_POST_MODIFY (Pmode
, stack_pointer_rtx
, mem
);
5536 emit_move_insn (reg1
, gen_frame_mem (mode
, mem
));
5540 rtx reg2
= gen_rtx_REG (mode
, regno2
);
5541 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
5542 emit_insn (aarch64_gen_loadwb_pair (mode
, stack_pointer_rtx
, reg1
,
5547 /* Generate and return a store pair instruction of mode MODE to store
5548 register REG1 to MEM1 and register REG2 to MEM2. */
5551 aarch64_gen_store_pair (machine_mode mode
, rtx mem1
, rtx reg1
, rtx mem2
,
5557 return gen_store_pair_dw_didi (mem1
, reg1
, mem2
, reg2
);
5560 return gen_store_pair_dw_dfdf (mem1
, reg1
, mem2
, reg2
);
5563 return gen_store_pair_dw_tftf (mem1
, reg1
, mem2
, reg2
);
5570 /* Generate and regurn a load pair isntruction of mode MODE to load register
5571 REG1 from MEM1 and register REG2 from MEM2. */
5574 aarch64_gen_load_pair (machine_mode mode
, rtx reg1
, rtx mem1
, rtx reg2
,
5580 return gen_load_pair_dw_didi (reg1
, mem1
, reg2
, mem2
);
5583 return gen_load_pair_dw_dfdf (reg1
, mem1
, reg2
, mem2
);
5586 return gen_load_pair_dw_tftf (reg1
, mem1
, reg2
, mem2
);
5593 /* Return TRUE if return address signing should be enabled for the current
5594 function, otherwise return FALSE. */
5597 aarch64_return_address_signing_enabled (void)
5599 /* This function should only be called after frame laid out. */
5600 gcc_assert (cfun
->machine
->frame
.laid_out
);
5602 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
5603 if its LR is pushed onto stack. */
5604 return (aarch64_ra_sign_scope
== AARCH64_FUNCTION_ALL
5605 || (aarch64_ra_sign_scope
== AARCH64_FUNCTION_NON_LEAF
5606 && cfun
->machine
->frame
.reg_offset
[LR_REGNUM
] >= 0));
5609 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
5611 aarch64_bti_enabled (void)
5613 return (aarch64_enable_bti
== 1);
5616 /* Emit code to save the callee-saved registers from register number START
5617 to LIMIT to the stack at the location starting at offset START_OFFSET,
5618 skipping any write-back candidates if SKIP_WB is true. */
5621 aarch64_save_callee_saves (machine_mode mode
, poly_int64 start_offset
,
5622 unsigned start
, unsigned limit
, bool skip_wb
)
5628 for (regno
= aarch64_next_callee_save (start
, limit
);
5630 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
5637 && (regno
== cfun
->machine
->frame
.wb_candidate1
5638 || regno
== cfun
->machine
->frame
.wb_candidate2
))
5641 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
5644 reg
= gen_rtx_REG (mode
, regno
);
5645 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
5646 mem
= gen_frame_mem (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
5649 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
5650 offset_diff
= cfun
->machine
->frame
.reg_offset
[regno2
]
5651 - cfun
->machine
->frame
.reg_offset
[regno
];
5654 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
5655 && known_eq (GET_MODE_SIZE (mode
), offset_diff
))
5657 rtx reg2
= gen_rtx_REG (mode
, regno2
);
5660 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
5661 mem2
= gen_frame_mem (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
5663 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
,
5666 /* The first part of a frame-related parallel insn is
5667 always assumed to be relevant to the frame
5668 calculations; subsequent parts, are only
5669 frame-related if explicitly marked. */
5670 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
5674 insn
= emit_move_insn (mem
, reg
);
5676 RTX_FRAME_RELATED_P (insn
) = 1;
5680 /* Emit code to restore the callee registers of mode MODE from register
5681 number START up to and including LIMIT. Restore from the stack offset
5682 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
5683 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
5686 aarch64_restore_callee_saves (machine_mode mode
,
5687 poly_int64 start_offset
, unsigned start
,
5688 unsigned limit
, bool skip_wb
, rtx
*cfi_ops
)
5690 rtx base_rtx
= stack_pointer_rtx
;
5695 for (regno
= aarch64_next_callee_save (start
, limit
);
5697 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
5699 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
5706 && (regno
== cfun
->machine
->frame
.wb_candidate1
5707 || regno
== cfun
->machine
->frame
.wb_candidate2
))
5710 reg
= gen_rtx_REG (mode
, regno
);
5711 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
5712 mem
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
5714 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
5715 offset_diff
= cfun
->machine
->frame
.reg_offset
[regno2
]
5716 - cfun
->machine
->frame
.reg_offset
[regno
];
5719 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
5720 && known_eq (GET_MODE_SIZE (mode
), offset_diff
))
5722 rtx reg2
= gen_rtx_REG (mode
, regno2
);
5725 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
5726 mem2
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
5727 emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
5729 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
5733 emit_move_insn (reg
, mem
);
5734 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg
, *cfi_ops
);
5738 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5742 offset_4bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
5744 HOST_WIDE_INT multiple
;
5745 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
5746 && IN_RANGE (multiple
, -8, 7));
5749 /* Return true if OFFSET is a unsigned 6-bit value multiplied by the size
5753 offset_6bit_unsigned_scaled_p (machine_mode mode
, poly_int64 offset
)
5755 HOST_WIDE_INT multiple
;
5756 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
5757 && IN_RANGE (multiple
, 0, 63));
5760 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5764 aarch64_offset_7bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
5766 HOST_WIDE_INT multiple
;
5767 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
5768 && IN_RANGE (multiple
, -64, 63));
5771 /* Return true if OFFSET is a signed 9-bit value. */
5774 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED
,
5777 HOST_WIDE_INT const_offset
;
5778 return (offset
.is_constant (&const_offset
)
5779 && IN_RANGE (const_offset
, -256, 255));
5782 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5786 offset_9bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
5788 HOST_WIDE_INT multiple
;
5789 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
5790 && IN_RANGE (multiple
, -256, 255));
5793 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5797 offset_12bit_unsigned_scaled_p (machine_mode mode
, poly_int64 offset
)
5799 HOST_WIDE_INT multiple
;
5800 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
5801 && IN_RANGE (multiple
, 0, 4095));
5804 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5807 aarch64_get_separate_components (void)
5809 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
5810 bitmap_clear (components
);
5812 /* The registers we need saved to the frame. */
5813 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
5814 if (aarch64_register_saved_on_entry (regno
))
5816 poly_int64 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
5817 if (!frame_pointer_needed
)
5818 offset
+= cfun
->machine
->frame
.frame_size
5819 - cfun
->machine
->frame
.hard_fp_offset
;
5820 /* Check that we can access the stack slot of the register with one
5821 direct load with no adjustments needed. */
5822 if (offset_12bit_unsigned_scaled_p (DImode
, offset
))
5823 bitmap_set_bit (components
, regno
);
5826 /* Don't mess with the hard frame pointer. */
5827 if (frame_pointer_needed
)
5828 bitmap_clear_bit (components
, HARD_FRAME_POINTER_REGNUM
);
5830 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
5831 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
5832 /* If registers have been chosen to be stored/restored with
5833 writeback don't interfere with them to avoid having to output explicit
5834 stack adjustment instructions. */
5835 if (reg2
!= INVALID_REGNUM
)
5836 bitmap_clear_bit (components
, reg2
);
5837 if (reg1
!= INVALID_REGNUM
)
5838 bitmap_clear_bit (components
, reg1
);
5840 bitmap_clear_bit (components
, LR_REGNUM
);
5841 bitmap_clear_bit (components
, SP_REGNUM
);
5846 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5849 aarch64_components_for_bb (basic_block bb
)
5851 bitmap in
= DF_LIVE_IN (bb
);
5852 bitmap gen
= &DF_LIVE_BB_INFO (bb
)->gen
;
5853 bitmap kill
= &DF_LIVE_BB_INFO (bb
)->kill
;
5854 bool simd_function
= aarch64_simd_decl_p (cfun
->decl
);
5856 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
5857 bitmap_clear (components
);
5859 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5860 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
5861 if ((!call_used_regs
[regno
]
5862 || (simd_function
&& FP_SIMD_SAVED_REGNUM_P (regno
)))
5863 && (bitmap_bit_p (in
, regno
)
5864 || bitmap_bit_p (gen
, regno
)
5865 || bitmap_bit_p (kill
, regno
)))
5867 unsigned regno2
, offset
, offset2
;
5868 bitmap_set_bit (components
, regno
);
5870 /* If there is a callee-save at an adjacent offset, add it too
5871 to increase the use of LDP/STP. */
5872 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
5873 regno2
= ((offset
& 8) == 0) ? regno
+ 1 : regno
- 1;
5875 if (regno2
<= LAST_SAVED_REGNUM
)
5877 offset2
= cfun
->machine
->frame
.reg_offset
[regno2
];
5878 if ((offset
& ~8) == (offset2
& ~8))
5879 bitmap_set_bit (components
, regno2
);
5886 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5887 Nothing to do for aarch64. */
5890 aarch64_disqualify_components (sbitmap
, edge
, sbitmap
, bool)
5894 /* Return the next set bit in BMP from START onwards. Return the total number
5895 of bits in BMP if no set bit is found at or after START. */
5898 aarch64_get_next_set_bit (sbitmap bmp
, unsigned int start
)
5900 unsigned int nbits
= SBITMAP_SIZE (bmp
);
5904 gcc_assert (start
< nbits
);
5905 for (unsigned int i
= start
; i
< nbits
; i
++)
5906 if (bitmap_bit_p (bmp
, i
))
5912 /* Do the work for aarch64_emit_prologue_components and
5913 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5914 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5915 for these components or the epilogue sequence. That is, it determines
5916 whether we should emit stores or loads and what kind of CFA notes to attach
5917 to the insns. Otherwise the logic for the two sequences is very
5921 aarch64_process_components (sbitmap components
, bool prologue_p
)
5923 rtx ptr_reg
= gen_rtx_REG (Pmode
, frame_pointer_needed
5924 ? HARD_FRAME_POINTER_REGNUM
5925 : STACK_POINTER_REGNUM
);
5927 unsigned last_regno
= SBITMAP_SIZE (components
);
5928 unsigned regno
= aarch64_get_next_set_bit (components
, R0_REGNUM
);
5929 rtx_insn
*insn
= NULL
;
5931 while (regno
!= last_regno
)
5933 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
5934 so DFmode for the vector registers is enough. For simd functions
5935 we want to save the low 128 bits. */
5936 machine_mode mode
= aarch64_reg_save_mode (cfun
->decl
, regno
);
5938 rtx reg
= gen_rtx_REG (mode
, regno
);
5939 poly_int64 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
5940 if (!frame_pointer_needed
)
5941 offset
+= cfun
->machine
->frame
.frame_size
5942 - cfun
->machine
->frame
.hard_fp_offset
;
5943 rtx addr
= plus_constant (Pmode
, ptr_reg
, offset
);
5944 rtx mem
= gen_frame_mem (mode
, addr
);
5946 rtx set
= prologue_p
? gen_rtx_SET (mem
, reg
) : gen_rtx_SET (reg
, mem
);
5947 unsigned regno2
= aarch64_get_next_set_bit (components
, regno
+ 1);
5948 /* No more registers to handle after REGNO.
5949 Emit a single save/restore and exit. */
5950 if (regno2
== last_regno
)
5952 insn
= emit_insn (set
);
5953 RTX_FRAME_RELATED_P (insn
) = 1;
5955 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
5957 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
5961 poly_int64 offset2
= cfun
->machine
->frame
.reg_offset
[regno2
];
5962 /* The next register is not of the same class or its offset is not
5963 mergeable with the current one into a pair. */
5964 if (!satisfies_constraint_Ump (mem
)
5965 || GP_REGNUM_P (regno
) != GP_REGNUM_P (regno2
)
5966 || (aarch64_simd_decl_p (cfun
->decl
) && FP_REGNUM_P (regno
))
5967 || maybe_ne ((offset2
- cfun
->machine
->frame
.reg_offset
[regno
]),
5968 GET_MODE_SIZE (mode
)))
5970 insn
= emit_insn (set
);
5971 RTX_FRAME_RELATED_P (insn
) = 1;
5973 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
5975 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
5981 /* REGNO2 can be saved/restored in a pair with REGNO. */
5982 rtx reg2
= gen_rtx_REG (mode
, regno2
);
5983 if (!frame_pointer_needed
)
5984 offset2
+= cfun
->machine
->frame
.frame_size
5985 - cfun
->machine
->frame
.hard_fp_offset
;
5986 rtx addr2
= plus_constant (Pmode
, ptr_reg
, offset2
);
5987 rtx mem2
= gen_frame_mem (mode
, addr2
);
5988 rtx set2
= prologue_p
? gen_rtx_SET (mem2
, reg2
)
5989 : gen_rtx_SET (reg2
, mem2
);
5992 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
, reg2
));
5994 insn
= emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
5996 RTX_FRAME_RELATED_P (insn
) = 1;
5999 add_reg_note (insn
, REG_CFA_OFFSET
, set
);
6000 add_reg_note (insn
, REG_CFA_OFFSET
, set2
);
6004 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
6005 add_reg_note (insn
, REG_CFA_RESTORE
, reg2
);
6008 regno
= aarch64_get_next_set_bit (components
, regno2
+ 1);
6012 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
6015 aarch64_emit_prologue_components (sbitmap components
)
6017 aarch64_process_components (components
, true);
6020 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
6023 aarch64_emit_epilogue_components (sbitmap components
)
6025 aarch64_process_components (components
, false);
6028 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
6031 aarch64_set_handled_components (sbitmap components
)
6033 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
6034 if (bitmap_bit_p (components
, regno
))
6035 cfun
->machine
->reg_is_wrapped_separately
[regno
] = true;
6038 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
6039 determining the probe offset for alloca. */
6041 static HOST_WIDE_INT
6042 aarch64_stack_clash_protection_alloca_probe_range (void)
6044 return STACK_CLASH_CALLER_GUARD
;
6048 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
6049 registers. If POLY_SIZE is not large enough to require a probe this function
6050 will only adjust the stack. When allocating the stack space
6051 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
6052 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
6053 arguments. If we are then we ensure that any allocation larger than the ABI
6054 defined buffer needs a probe so that the invariant of having a 1KB buffer is
6057 We emit barriers after each stack adjustment to prevent optimizations from
6058 breaking the invariant that we never drop the stack more than a page. This
6059 invariant is needed to make it easier to correctly handle asynchronous
6060 events, e.g. if we were to allow the stack to be dropped by more than a page
6061 and then have multiple probes up and we take a signal somewhere in between
6062 then the signal handler doesn't know the state of the stack and can make no
6063 assumptions about which pages have been probed. */
6066 aarch64_allocate_and_probe_stack_space (rtx temp1
, rtx temp2
,
6067 poly_int64 poly_size
,
6068 bool frame_related_p
,
6069 bool final_adjustment_p
)
6071 HOST_WIDE_INT guard_size
6072 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
);
6073 HOST_WIDE_INT guard_used_by_caller
= STACK_CLASH_CALLER_GUARD
;
6074 /* When doing the final adjustment for the outgoing argument size we can't
6075 assume that LR was saved at position 0. So subtract it's offset from the
6076 ABI safe buffer so that we don't accidentally allow an adjustment that
6077 would result in an allocation larger than the ABI buffer without
6079 HOST_WIDE_INT min_probe_threshold
6080 = final_adjustment_p
6081 ? guard_used_by_caller
- cfun
->machine
->frame
.reg_offset
[LR_REGNUM
]
6082 : guard_size
- guard_used_by_caller
;
6084 poly_int64 frame_size
= cfun
->machine
->frame
.frame_size
;
6086 /* We should always have a positive probe threshold. */
6087 gcc_assert (min_probe_threshold
> 0);
6089 if (flag_stack_clash_protection
&& !final_adjustment_p
)
6091 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
6092 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
6094 if (known_eq (frame_size
, 0))
6096 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME
, false);
6098 else if (known_lt (initial_adjust
, guard_size
- guard_used_by_caller
)
6099 && known_lt (final_adjust
, guard_used_by_caller
))
6101 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME
, true);
6105 /* If SIZE is not large enough to require probing, just adjust the stack and
6107 if (known_lt (poly_size
, min_probe_threshold
)
6108 || !flag_stack_clash_protection
)
6110 aarch64_sub_sp (temp1
, temp2
, poly_size
, frame_related_p
);
6115 /* Handle the SVE non-constant case first. */
6116 if (!poly_size
.is_constant (&size
))
6120 fprintf (dump_file
, "Stack clash SVE prologue: ");
6121 print_dec (poly_size
, dump_file
);
6122 fprintf (dump_file
, " bytes, dynamic probing will be required.\n");
6125 /* First calculate the amount of bytes we're actually spilling. */
6126 aarch64_add_offset (Pmode
, temp1
, CONST0_RTX (Pmode
),
6127 poly_size
, temp1
, temp2
, false, true);
6129 rtx_insn
*insn
= get_last_insn ();
6131 if (frame_related_p
)
6133 /* This is done to provide unwinding information for the stack
6134 adjustments we're about to do, however to prevent the optimizers
6135 from removing the R11 move and leaving the CFA note (which would be
6136 very wrong) we tie the old and new stack pointer together.
6137 The tie will expand to nothing but the optimizers will not touch
6139 rtx stack_ptr_copy
= gen_rtx_REG (Pmode
, STACK_CLASH_SVE_CFA_REGNUM
);
6140 emit_move_insn (stack_ptr_copy
, stack_pointer_rtx
);
6141 emit_insn (gen_stack_tie (stack_ptr_copy
, stack_pointer_rtx
));
6143 /* We want the CFA independent of the stack pointer for the
6144 duration of the loop. */
6145 add_reg_note (insn
, REG_CFA_DEF_CFA
, stack_ptr_copy
);
6146 RTX_FRAME_RELATED_P (insn
) = 1;
6149 rtx probe_const
= gen_int_mode (min_probe_threshold
, Pmode
);
6150 rtx guard_const
= gen_int_mode (guard_size
, Pmode
);
6152 insn
= emit_insn (gen_probe_sve_stack_clash (Pmode
, stack_pointer_rtx
,
6153 stack_pointer_rtx
, temp1
,
6154 probe_const
, guard_const
));
6156 /* Now reset the CFA register if needed. */
6157 if (frame_related_p
)
6159 add_reg_note (insn
, REG_CFA_DEF_CFA
,
6160 gen_rtx_PLUS (Pmode
, stack_pointer_rtx
,
6161 gen_int_mode (poly_size
, Pmode
)));
6162 RTX_FRAME_RELATED_P (insn
) = 1;
6170 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
6171 " bytes, probing will be required.\n", size
);
6173 /* Round size to the nearest multiple of guard_size, and calculate the
6174 residual as the difference between the original size and the rounded
6176 HOST_WIDE_INT rounded_size
= ROUND_DOWN (size
, guard_size
);
6177 HOST_WIDE_INT residual
= size
- rounded_size
;
6179 /* We can handle a small number of allocations/probes inline. Otherwise
6181 if (rounded_size
<= STACK_CLASH_MAX_UNROLL_PAGES
* guard_size
)
6183 for (HOST_WIDE_INT i
= 0; i
< rounded_size
; i
+= guard_size
)
6185 aarch64_sub_sp (NULL
, temp2
, guard_size
, true);
6186 emit_stack_probe (plus_constant (Pmode
, stack_pointer_rtx
,
6187 guard_used_by_caller
));
6188 emit_insn (gen_blockage ());
6190 dump_stack_clash_frame_info (PROBE_INLINE
, size
!= rounded_size
);
6194 /* Compute the ending address. */
6195 aarch64_add_offset (Pmode
, temp1
, stack_pointer_rtx
, -rounded_size
,
6196 temp1
, NULL
, false, true);
6197 rtx_insn
*insn
= get_last_insn ();
6199 /* For the initial allocation, we don't have a frame pointer
6200 set up, so we always need CFI notes. If we're doing the
6201 final allocation, then we may have a frame pointer, in which
6202 case it is the CFA, otherwise we need CFI notes.
6204 We can determine which allocation we are doing by looking at
6205 the value of FRAME_RELATED_P since the final allocations are not
6207 if (frame_related_p
)
6209 /* We want the CFA independent of the stack pointer for the
6210 duration of the loop. */
6211 add_reg_note (insn
, REG_CFA_DEF_CFA
,
6212 plus_constant (Pmode
, temp1
, rounded_size
));
6213 RTX_FRAME_RELATED_P (insn
) = 1;
6216 /* This allocates and probes the stack. Note that this re-uses some of
6217 the existing Ada stack protection code. However we are guaranteed not
6218 to enter the non loop or residual branches of that code.
6220 The non-loop part won't be entered because if our allocation amount
6221 doesn't require a loop, the case above would handle it.
6223 The residual amount won't be entered because TEMP1 is a mutliple of
6224 the allocation size. The residual will always be 0. As such, the only
6225 part we are actually using from that code is the loop setup. The
6226 actual probing is done in aarch64_output_probe_stack_range. */
6227 insn
= emit_insn (gen_probe_stack_range (stack_pointer_rtx
,
6228 stack_pointer_rtx
, temp1
));
6230 /* Now reset the CFA register if needed. */
6231 if (frame_related_p
)
6233 add_reg_note (insn
, REG_CFA_DEF_CFA
,
6234 plus_constant (Pmode
, stack_pointer_rtx
, rounded_size
));
6235 RTX_FRAME_RELATED_P (insn
) = 1;
6238 emit_insn (gen_blockage ());
6239 dump_stack_clash_frame_info (PROBE_LOOP
, size
!= rounded_size
);
6242 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
6243 be probed. This maintains the requirement that each page is probed at
6244 least once. For initial probing we probe only if the allocation is
6245 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
6246 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
6247 GUARD_SIZE. This works that for any allocation that is large enough to
6248 trigger a probe here, we'll have at least one, and if they're not large
6249 enough for this code to emit anything for them, The page would have been
6250 probed by the saving of FP/LR either by this function or any callees. If
6251 we don't have any callees then we won't have more stack adjustments and so
6255 HOST_WIDE_INT residual_probe_offset
= guard_used_by_caller
;
6256 /* If we're doing final adjustments, and we've done any full page
6257 allocations then any residual needs to be probed. */
6258 if (final_adjustment_p
&& rounded_size
!= 0)
6259 min_probe_threshold
= 0;
6260 /* If doing a small final adjustment, we always probe at offset 0.
6261 This is done to avoid issues when LR is not at position 0 or when
6262 the final adjustment is smaller than the probing offset. */
6263 else if (final_adjustment_p
&& rounded_size
== 0)
6264 residual_probe_offset
= 0;
6266 aarch64_sub_sp (temp1
, temp2
, residual
, frame_related_p
);
6267 if (residual
>= min_probe_threshold
)
6271 "Stack clash AArch64 prologue residuals: "
6272 HOST_WIDE_INT_PRINT_DEC
" bytes, probing will be required."
6275 emit_stack_probe (plus_constant (Pmode
, stack_pointer_rtx
,
6276 residual_probe_offset
));
6277 emit_insn (gen_blockage ());
6282 /* Return 1 if the register is used by the epilogue. We need to say the
6283 return register is used, but only after epilogue generation is complete.
6284 Note that in the case of sibcalls, the values "used by the epilogue" are
6285 considered live at the start of the called function.
6287 For SIMD functions we need to return 1 for FP registers that are saved and
6288 restored by a function but are not zero in call_used_regs. If we do not do
6289 this optimizations may remove the restore of the register. */
6292 aarch64_epilogue_uses (int regno
)
6294 if (epilogue_completed
)
6296 if (regno
== LR_REGNUM
)
6298 if (aarch64_simd_decl_p (cfun
->decl
) && FP_SIMD_SAVED_REGNUM_P (regno
))
6304 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6305 is saved at BASE + OFFSET. */
6308 aarch64_add_cfa_expression (rtx_insn
*insn
, unsigned int reg
,
6309 rtx base
, poly_int64 offset
)
6311 rtx mem
= gen_frame_mem (DImode
, plus_constant (Pmode
, base
, offset
));
6312 add_reg_note (insn
, REG_CFA_EXPRESSION
,
6313 gen_rtx_SET (mem
, regno_reg_rtx
[reg
]));
6316 /* AArch64 stack frames generated by this compiler look like:
6318 +-------------------------------+
6320 | incoming stack arguments |
6322 +-------------------------------+
6323 | | <-- incoming stack pointer (aligned)
6324 | callee-allocated save area |
6325 | for register varargs |
6327 +-------------------------------+
6328 | local variables | <-- frame_pointer_rtx
6330 +-------------------------------+
6332 +-------------------------------+ |
6333 | callee-saved registers | | frame.saved_regs_size
6334 +-------------------------------+ |
6336 +-------------------------------+ |
6337 | FP' | / <- hard_frame_pointer_rtx (aligned)
6338 +-------------------------------+
6339 | dynamic allocation |
6340 +-------------------------------+
6342 +-------------------------------+
6343 | outgoing stack arguments | <-- arg_pointer
6345 +-------------------------------+
6346 | | <-- stack_pointer_rtx (aligned)
6348 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
6349 but leave frame_pointer_rtx and hard_frame_pointer_rtx
6352 By default for stack-clash we assume the guard is at least 64KB, but this
6353 value is configurable to either 4KB or 64KB. We also force the guard size to
6354 be the same as the probing interval and both values are kept in sync.
6356 With those assumptions the callee can allocate up to 63KB (or 3KB depending
6357 on the guard size) of stack space without probing.
6359 When probing is needed, we emit a probe at the start of the prologue
6360 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
6362 We have to track how much space has been allocated and the only stores
6363 to the stack we track as implicit probes are the FP/LR stores.
6365 For outgoing arguments we probe if the size is larger than 1KB, such that
6366 the ABI specified buffer is maintained for the next callee.
6368 The following registers are reserved during frame layout and should not be
6369 used for any other purpose:
6371 - r11: Used by stack clash protection when SVE is enabled.
6372 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
6373 - r14 and r15: Used for speculation tracking.
6374 - r16(IP0), r17(IP1): Used by indirect tailcalls.
6375 - r30(LR), r29(FP): Used by standard frame layout.
6377 These registers must be avoided in frame layout related code unless the
6378 explicit intention is to interact with one of the features listed above. */
6380 /* Generate the prologue instructions for entry into a function.
6381 Establish the stack frame by decreasing the stack pointer with a
6382 properly calculated size and, if necessary, create a frame record
6383 filled with the values of LR and previous frame pointer. The
6384 current FP is also set up if it is in use. */
6387 aarch64_expand_prologue (void)
6389 poly_int64 frame_size
= cfun
->machine
->frame
.frame_size
;
6390 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
6391 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
6392 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
6393 poly_int64 callee_offset
= cfun
->machine
->frame
.callee_offset
;
6394 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
6395 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
6396 bool emit_frame_chain
= cfun
->machine
->frame
.emit_frame_chain
;
6399 /* Sign return address for functions. */
6400 if (aarch64_return_address_signing_enabled ())
6402 switch (aarch64_ra_sign_key
)
6405 insn
= emit_insn (gen_paciasp ());
6408 insn
= emit_insn (gen_pacibsp ());
6413 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
6414 RTX_FRAME_RELATED_P (insn
) = 1;
6417 if (flag_stack_usage_info
)
6418 current_function_static_stack_size
= constant_lower_bound (frame_size
);
6420 if (flag_stack_check
== STATIC_BUILTIN_STACK_CHECK
)
6422 if (crtl
->is_leaf
&& !cfun
->calls_alloca
)
6424 if (maybe_gt (frame_size
, PROBE_INTERVAL
)
6425 && maybe_gt (frame_size
, get_stack_check_protect ()))
6426 aarch64_emit_probe_stack_range (get_stack_check_protect (),
6428 - get_stack_check_protect ()));
6430 else if (maybe_gt (frame_size
, 0))
6431 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size
);
6434 rtx tmp0_rtx
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
6435 rtx tmp1_rtx
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
6437 /* In theory we should never have both an initial adjustment
6438 and a callee save adjustment. Verify that is the case since the
6439 code below does not handle it for -fstack-clash-protection. */
6440 gcc_assert (known_eq (initial_adjust
, 0) || callee_adjust
== 0);
6442 /* Will only probe if the initial adjustment is larger than the guard
6443 less the amount of the guard reserved for use by the caller's
6445 aarch64_allocate_and_probe_stack_space (tmp0_rtx
, tmp1_rtx
, initial_adjust
,
6448 if (callee_adjust
!= 0)
6449 aarch64_push_regs (reg1
, reg2
, callee_adjust
);
6451 if (emit_frame_chain
)
6453 poly_int64 reg_offset
= callee_adjust
;
6454 if (callee_adjust
== 0)
6458 reg_offset
= callee_offset
;
6459 aarch64_save_callee_saves (DImode
, reg_offset
, reg1
, reg2
, false);
6461 aarch64_add_offset (Pmode
, hard_frame_pointer_rtx
,
6462 stack_pointer_rtx
, callee_offset
,
6463 tmp1_rtx
, tmp0_rtx
, frame_pointer_needed
);
6464 if (frame_pointer_needed
&& !frame_size
.is_constant ())
6466 /* Variable-sized frames need to describe the save slot
6467 address using DW_CFA_expression rather than DW_CFA_offset.
6468 This means that, without taking further action, the
6469 locations of the registers that we've already saved would
6470 remain based on the stack pointer even after we redefine
6471 the CFA based on the frame pointer. We therefore need new
6472 DW_CFA_expressions to re-express the save slots with addresses
6473 based on the frame pointer. */
6474 rtx_insn
*insn
= get_last_insn ();
6475 gcc_assert (RTX_FRAME_RELATED_P (insn
));
6477 /* Add an explicit CFA definition if this was previously
6479 if (!find_reg_note (insn
, REG_CFA_ADJUST_CFA
, NULL_RTX
))
6481 rtx src
= plus_constant (Pmode
, stack_pointer_rtx
,
6483 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
6484 gen_rtx_SET (hard_frame_pointer_rtx
, src
));
6487 /* Change the save slot expressions for the registers that
6488 we've already saved. */
6489 reg_offset
-= callee_offset
;
6490 aarch64_add_cfa_expression (insn
, reg2
, hard_frame_pointer_rtx
,
6491 reg_offset
+ UNITS_PER_WORD
);
6492 aarch64_add_cfa_expression (insn
, reg1
, hard_frame_pointer_rtx
,
6495 emit_insn (gen_stack_tie (stack_pointer_rtx
, hard_frame_pointer_rtx
));
6498 aarch64_save_callee_saves (DImode
, callee_offset
, R0_REGNUM
, R30_REGNUM
,
6499 callee_adjust
!= 0 || emit_frame_chain
);
6500 if (aarch64_simd_decl_p (cfun
->decl
))
6501 aarch64_save_callee_saves (TFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
6502 callee_adjust
!= 0 || emit_frame_chain
);
6504 aarch64_save_callee_saves (DFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
6505 callee_adjust
!= 0 || emit_frame_chain
);
6507 /* We may need to probe the final adjustment if it is larger than the guard
6508 that is assumed by the called. */
6509 aarch64_allocate_and_probe_stack_space (tmp1_rtx
, tmp0_rtx
, final_adjust
,
6510 !frame_pointer_needed
, true);
6513 /* Return TRUE if we can use a simple_return insn.
6515 This function checks whether the callee saved stack is empty, which
6516 means no restore actions are need. The pro_and_epilogue will use
6517 this to check whether shrink-wrapping opt is feasible. */
6520 aarch64_use_return_insn_p (void)
6522 if (!reload_completed
)
6528 return known_eq (cfun
->machine
->frame
.frame_size
, 0);
6531 /* Return false for non-leaf SIMD functions in order to avoid
6532 shrink-wrapping them. Doing this will lose the necessary
6533 save/restore of FP registers. */
6536 aarch64_use_simple_return_insn_p (void)
6538 if (aarch64_simd_decl_p (cfun
->decl
) && !crtl
->is_leaf
)
6544 /* Generate the epilogue instructions for returning from a function.
6545 This is almost exactly the reverse of the prolog sequence, except
6546 that we need to insert barriers to avoid scheduling loads that read
6547 from a deallocated stack, and we optimize the unwind records by
6548 emitting them all together if possible. */
6550 aarch64_expand_epilogue (bool for_sibcall
)
6552 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
6553 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
6554 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
6555 poly_int64 callee_offset
= cfun
->machine
->frame
.callee_offset
;
6556 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
6557 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
6560 /* A stack clash protection prologue may not have left EP0_REGNUM or
6561 EP1_REGNUM in a usable state. The same is true for allocations
6562 with an SVE component, since we then need both temporary registers
6563 for each allocation. For stack clash we are in a usable state if
6564 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
6565 HOST_WIDE_INT guard_size
6566 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
);
6567 HOST_WIDE_INT guard_used_by_caller
= STACK_CLASH_CALLER_GUARD
;
6569 /* We can re-use the registers when the allocation amount is smaller than
6570 guard_size - guard_used_by_caller because we won't be doing any probes
6571 then. In such situations the register should remain live with the correct
6573 bool can_inherit_p
= (initial_adjust
.is_constant ()
6574 && final_adjust
.is_constant ())
6575 && (!flag_stack_clash_protection
6576 || known_lt (initial_adjust
,
6577 guard_size
- guard_used_by_caller
));
6579 /* We need to add memory barrier to prevent read from deallocated stack. */
6581 = maybe_ne (get_frame_size ()
6582 + cfun
->machine
->frame
.saved_varargs_size
, 0);
6584 /* Emit a barrier to prevent loads from a deallocated stack. */
6585 if (maybe_gt (final_adjust
, crtl
->outgoing_args_size
)
6586 || cfun
->calls_alloca
6587 || crtl
->calls_eh_return
)
6589 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
6590 need_barrier_p
= false;
6593 /* Restore the stack pointer from the frame pointer if it may not
6594 be the same as the stack pointer. */
6595 rtx tmp0_rtx
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
6596 rtx tmp1_rtx
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
6597 if (frame_pointer_needed
6598 && (maybe_ne (final_adjust
, 0) || cfun
->calls_alloca
))
6599 /* If writeback is used when restoring callee-saves, the CFA
6600 is restored on the instruction doing the writeback. */
6601 aarch64_add_offset (Pmode
, stack_pointer_rtx
,
6602 hard_frame_pointer_rtx
, -callee_offset
,
6603 tmp1_rtx
, tmp0_rtx
, callee_adjust
== 0);
6605 /* The case where we need to re-use the register here is very rare, so
6606 avoid the complicated condition and just always emit a move if the
6607 immediate doesn't fit. */
6608 aarch64_add_sp (tmp1_rtx
, tmp0_rtx
, final_adjust
, true);
6610 aarch64_restore_callee_saves (DImode
, callee_offset
, R0_REGNUM
, R30_REGNUM
,
6611 callee_adjust
!= 0, &cfi_ops
);
6612 if (aarch64_simd_decl_p (cfun
->decl
))
6613 aarch64_restore_callee_saves (TFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
6614 callee_adjust
!= 0, &cfi_ops
);
6616 aarch64_restore_callee_saves (DFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
6617 callee_adjust
!= 0, &cfi_ops
);
6620 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
6622 if (callee_adjust
!= 0)
6623 aarch64_pop_regs (reg1
, reg2
, callee_adjust
, &cfi_ops
);
6625 if (callee_adjust
!= 0 || maybe_gt (initial_adjust
, 65536))
6627 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
6628 insn
= get_last_insn ();
6629 rtx new_cfa
= plus_constant (Pmode
, stack_pointer_rtx
, initial_adjust
);
6630 REG_NOTES (insn
) = alloc_reg_note (REG_CFA_DEF_CFA
, new_cfa
, cfi_ops
);
6631 RTX_FRAME_RELATED_P (insn
) = 1;
6635 /* Liveness of EP0_REGNUM can not be trusted across function calls either, so
6636 add restriction on emit_move optimization to leaf functions. */
6637 aarch64_add_sp (tmp0_rtx
, tmp1_rtx
, initial_adjust
,
6638 (!can_inherit_p
|| !crtl
->is_leaf
6639 || df_regs_ever_live_p (EP0_REGNUM
)));
6643 /* Emit delayed restores and reset the CFA to be SP. */
6644 insn
= get_last_insn ();
6645 cfi_ops
= alloc_reg_note (REG_CFA_DEF_CFA
, stack_pointer_rtx
, cfi_ops
);
6646 REG_NOTES (insn
) = cfi_ops
;
6647 RTX_FRAME_RELATED_P (insn
) = 1;
6650 /* We prefer to emit the combined return/authenticate instruction RETAA,
6651 however there are three cases in which we must instead emit an explicit
6652 authentication instruction.
6654 1) Sibcalls don't return in a normal way, so if we're about to call one
6655 we must authenticate.
6657 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
6658 generating code for !TARGET_ARMV8_3 we can't use it and must
6659 explicitly authenticate.
6661 3) On an eh_return path we make extra stack adjustments to update the
6662 canonical frame address to be the exception handler's CFA. We want
6663 to authenticate using the CFA of the function which calls eh_return.
6665 if (aarch64_return_address_signing_enabled ()
6666 && (for_sibcall
|| !TARGET_ARMV8_3
|| crtl
->calls_eh_return
))
6668 switch (aarch64_ra_sign_key
)
6671 insn
= emit_insn (gen_autiasp ());
6674 insn
= emit_insn (gen_autibsp ());
6679 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
6680 RTX_FRAME_RELATED_P (insn
) = 1;
6683 /* Stack adjustment for exception handler. */
6684 if (crtl
->calls_eh_return
&& !for_sibcall
)
6686 /* We need to unwind the stack by the offset computed by
6687 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
6688 to be SP; letting the CFA move during this adjustment
6689 is just as correct as retaining the CFA from the body
6690 of the function. Therefore, do nothing special. */
6691 emit_insn (gen_add2_insn (stack_pointer_rtx
, EH_RETURN_STACKADJ_RTX
));
6694 emit_use (gen_rtx_REG (DImode
, LR_REGNUM
));
6696 emit_jump_insn (ret_rtx
);
6699 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
6700 normally or return to a previous frame after unwinding.
6702 An EH return uses a single shared return sequence. The epilogue is
6703 exactly like a normal epilogue except that it has an extra input
6704 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
6705 that must be applied after the frame has been destroyed. An extra label
6706 is inserted before the epilogue which initializes this register to zero,
6707 and this is the entry point for a normal return.
6709 An actual EH return updates the return address, initializes the stack
6710 adjustment and jumps directly into the epilogue (bypassing the zeroing
6711 of the adjustment). Since the return address is typically saved on the
6712 stack when a function makes a call, the saved LR must be updated outside
6715 This poses problems as the store is generated well before the epilogue,
6716 so the offset of LR is not known yet. Also optimizations will remove the
6717 store as it appears dead, even after the epilogue is generated (as the
6718 base or offset for loading LR is different in many cases).
6720 To avoid these problems this implementation forces the frame pointer
6721 in eh_return functions so that the location of LR is fixed and known early.
6722 It also marks the store volatile, so no optimization is permitted to
6723 remove the store. */
6725 aarch64_eh_return_handler_rtx (void)
6727 rtx tmp
= gen_frame_mem (Pmode
,
6728 plus_constant (Pmode
, hard_frame_pointer_rtx
, UNITS_PER_WORD
));
6730 /* Mark the store volatile, so no optimization is permitted to remove it. */
6731 MEM_VOLATILE_P (tmp
) = true;
6735 /* Output code to add DELTA to the first argument, and then jump
6736 to FUNCTION. Used for C++ multiple inheritance. */
6738 aarch64_output_mi_thunk (FILE *file
, tree thunk ATTRIBUTE_UNUSED
,
6739 HOST_WIDE_INT delta
,
6740 HOST_WIDE_INT vcall_offset
,
6743 /* The this pointer is always in x0. Note that this differs from
6744 Arm where the this pointer maybe bumped to r1 if r0 is required
6745 to return a pointer to an aggregate. On AArch64 a result value
6746 pointer will be in x8. */
6747 int this_regno
= R0_REGNUM
;
6748 rtx this_rtx
, temp0
, temp1
, addr
, funexp
;
6750 const char *fnname
= IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk
));
6752 if (aarch64_bti_enabled ())
6753 emit_insn (gen_bti_c());
6755 reload_completed
= 1;
6756 emit_note (NOTE_INSN_PROLOGUE_END
);
6758 this_rtx
= gen_rtx_REG (Pmode
, this_regno
);
6759 temp0
= gen_rtx_REG (Pmode
, EP0_REGNUM
);
6760 temp1
= gen_rtx_REG (Pmode
, EP1_REGNUM
);
6762 if (vcall_offset
== 0)
6763 aarch64_add_offset (Pmode
, this_rtx
, this_rtx
, delta
, temp1
, temp0
, false);
6766 gcc_assert ((vcall_offset
& (POINTER_BYTES
- 1)) == 0);
6771 if (delta
>= -256 && delta
< 256)
6772 addr
= gen_rtx_PRE_MODIFY (Pmode
, this_rtx
,
6773 plus_constant (Pmode
, this_rtx
, delta
));
6775 aarch64_add_offset (Pmode
, this_rtx
, this_rtx
, delta
,
6776 temp1
, temp0
, false);
6779 if (Pmode
== ptr_mode
)
6780 aarch64_emit_move (temp0
, gen_rtx_MEM (ptr_mode
, addr
));
6782 aarch64_emit_move (temp0
,
6783 gen_rtx_ZERO_EXTEND (Pmode
,
6784 gen_rtx_MEM (ptr_mode
, addr
)));
6786 if (vcall_offset
>= -256 && vcall_offset
< 4096 * POINTER_BYTES
)
6787 addr
= plus_constant (Pmode
, temp0
, vcall_offset
);
6790 aarch64_internal_mov_immediate (temp1
, GEN_INT (vcall_offset
), true,
6792 addr
= gen_rtx_PLUS (Pmode
, temp0
, temp1
);
6795 if (Pmode
== ptr_mode
)
6796 aarch64_emit_move (temp1
, gen_rtx_MEM (ptr_mode
,addr
));
6798 aarch64_emit_move (temp1
,
6799 gen_rtx_SIGN_EXTEND (Pmode
,
6800 gen_rtx_MEM (ptr_mode
, addr
)));
6802 emit_insn (gen_add2_insn (this_rtx
, temp1
));
6805 /* Generate a tail call to the target function. */
6806 if (!TREE_USED (function
))
6808 assemble_external (function
);
6809 TREE_USED (function
) = 1;
6811 funexp
= XEXP (DECL_RTL (function
), 0);
6812 funexp
= gen_rtx_MEM (FUNCTION_MODE
, funexp
);
6813 insn
= emit_call_insn (gen_sibcall (funexp
, const0_rtx
, NULL_RTX
));
6814 SIBLING_CALL_P (insn
) = 1;
6816 insn
= get_insns ();
6817 shorten_branches (insn
);
6819 assemble_start_function (thunk
, fnname
);
6820 final_start_function (insn
, file
, 1);
6821 final (insn
, file
, 1);
6822 final_end_function ();
6823 assemble_end_function (thunk
, fnname
);
6825 /* Stop pretending to be a post-reload pass. */
6826 reload_completed
= 0;
6830 aarch64_tls_referenced_p (rtx x
)
6832 if (!TARGET_HAVE_TLS
)
6834 subrtx_iterator::array_type array
;
6835 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
6837 const_rtx x
= *iter
;
6838 if (GET_CODE (x
) == SYMBOL_REF
&& SYMBOL_REF_TLS_MODEL (x
) != 0)
6840 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6841 TLS offsets, not real symbol references. */
6842 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
6843 iter
.skip_subrtxes ();
6849 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6850 a left shift of 0 or 12 bits. */
6852 aarch64_uimm12_shift (HOST_WIDE_INT val
)
6854 return ((val
& (((HOST_WIDE_INT
) 0xfff) << 0)) == val
6855 || (val
& (((HOST_WIDE_INT
) 0xfff) << 12)) == val
6859 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
6860 that can be created with a left shift of 0 or 12. */
6861 static HOST_WIDE_INT
6862 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val
)
6864 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6865 handle correctly. */
6866 gcc_assert ((val
& 0xffffff) == val
);
6868 if (((val
& 0xfff) << 0) == val
)
6871 return val
& (0xfff << 12);
6874 /* Return true if val is an immediate that can be loaded into a
6875 register by a MOVZ instruction. */
6877 aarch64_movw_imm (HOST_WIDE_INT val
, scalar_int_mode mode
)
6879 if (GET_MODE_SIZE (mode
) > 4)
6881 if ((val
& (((HOST_WIDE_INT
) 0xffff) << 32)) == val
6882 || (val
& (((HOST_WIDE_INT
) 0xffff) << 48)) == val
)
6887 /* Ignore sign extension. */
6888 val
&= (HOST_WIDE_INT
) 0xffffffff;
6890 return ((val
& (((HOST_WIDE_INT
) 0xffff) << 0)) == val
6891 || (val
& (((HOST_WIDE_INT
) 0xffff) << 16)) == val
);
6894 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6895 64-bit (DImode) integer. */
6897 static unsigned HOST_WIDE_INT
6898 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val
, machine_mode mode
)
6900 unsigned int size
= GET_MODE_UNIT_PRECISION (mode
);
6903 val
&= (HOST_WIDE_INT_1U
<< size
) - 1;
6910 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6912 static const unsigned HOST_WIDE_INT bitmask_imm_mul
[] =
6914 0x0000000100000001ull
,
6915 0x0001000100010001ull
,
6916 0x0101010101010101ull
,
6917 0x1111111111111111ull
,
6918 0x5555555555555555ull
,
6922 /* Return true if val is a valid bitmask immediate. */
6925 aarch64_bitmask_imm (HOST_WIDE_INT val_in
, machine_mode mode
)
6927 unsigned HOST_WIDE_INT val
, tmp
, mask
, first_one
, next_one
;
6930 /* Check for a single sequence of one bits and return quickly if so.
6931 The special cases of all ones and all zeroes returns false. */
6932 val
= aarch64_replicate_bitmask_imm (val_in
, mode
);
6933 tmp
= val
+ (val
& -val
);
6935 if (tmp
== (tmp
& -tmp
))
6936 return (val
+ 1) > 1;
6938 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
6940 val
= (val
<< 32) | (val
& 0xffffffff);
6942 /* Invert if the immediate doesn't start with a zero bit - this means we
6943 only need to search for sequences of one bits. */
6947 /* Find the first set bit and set tmp to val with the first sequence of one
6948 bits removed. Return success if there is a single sequence of ones. */
6949 first_one
= val
& -val
;
6950 tmp
= val
& (val
+ first_one
);
6955 /* Find the next set bit and compute the difference in bit position. */
6956 next_one
= tmp
& -tmp
;
6957 bits
= clz_hwi (first_one
) - clz_hwi (next_one
);
6960 /* Check the bit position difference is a power of 2, and that the first
6961 sequence of one bits fits within 'bits' bits. */
6962 if ((mask
>> bits
) != 0 || bits
!= (bits
& -bits
))
6965 /* Check the sequence of one bits is repeated 64/bits times. */
6966 return val
== mask
* bitmask_imm_mul
[__builtin_clz (bits
) - 26];
6969 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
6970 Assumed precondition: VAL_IN Is not zero. */
6972 unsigned HOST_WIDE_INT
6973 aarch64_and_split_imm1 (HOST_WIDE_INT val_in
)
6975 int lowest_bit_set
= ctz_hwi (val_in
);
6976 int highest_bit_set
= floor_log2 (val_in
);
6977 gcc_assert (val_in
!= 0);
6979 return ((HOST_WIDE_INT_UC (2) << highest_bit_set
) -
6980 (HOST_WIDE_INT_1U
<< lowest_bit_set
));
6983 /* Create constant where bits outside of lowest bit set to highest bit set
6986 unsigned HOST_WIDE_INT
6987 aarch64_and_split_imm2 (HOST_WIDE_INT val_in
)
6989 return val_in
| ~aarch64_and_split_imm1 (val_in
);
6992 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
6995 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in
, machine_mode mode
)
6997 scalar_int_mode int_mode
;
6998 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
7001 if (aarch64_bitmask_imm (val_in
, int_mode
))
7004 if (aarch64_move_imm (val_in
, int_mode
))
7007 unsigned HOST_WIDE_INT imm2
= aarch64_and_split_imm2 (val_in
);
7009 return aarch64_bitmask_imm (imm2
, int_mode
);
7012 /* Return true if val is an immediate that can be loaded into a
7013 register in a single instruction. */
7015 aarch64_move_imm (HOST_WIDE_INT val
, machine_mode mode
)
7017 scalar_int_mode int_mode
;
7018 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
7021 if (aarch64_movw_imm (val
, int_mode
) || aarch64_movw_imm (~val
, int_mode
))
7023 return aarch64_bitmask_imm (val
, int_mode
);
7027 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED
, rtx x
)
7031 if (GET_CODE (x
) == HIGH
)
7034 /* There's no way to calculate VL-based values using relocations. */
7035 subrtx_iterator::array_type array
;
7036 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
7037 if (GET_CODE (*iter
) == CONST_POLY_INT
)
7040 split_const (x
, &base
, &offset
);
7041 if (GET_CODE (base
) == SYMBOL_REF
|| GET_CODE (base
) == LABEL_REF
)
7043 if (aarch64_classify_symbol (base
, INTVAL (offset
))
7044 != SYMBOL_FORCE_TO_MEM
)
7047 /* Avoid generating a 64-bit relocation in ILP32; leave
7048 to aarch64_expand_mov_immediate to handle it properly. */
7049 return mode
!= ptr_mode
;
7052 return aarch64_tls_referenced_p (x
);
7055 /* Implement TARGET_CASE_VALUES_THRESHOLD.
7056 The expansion for a table switch is quite expensive due to the number
7057 of instructions, the table lookup and hard to predict indirect jump.
7058 When optimizing for speed, and -O3 enabled, use the per-core tuning if
7059 set, otherwise use tables for > 16 cases as a tradeoff between size and
7060 performance. When optimizing for size, use the default setting. */
7063 aarch64_case_values_threshold (void)
7065 /* Use the specified limit for the number of cases before using jump
7066 tables at higher optimization levels. */
7068 && selected_cpu
->tune
->max_case_values
!= 0)
7069 return selected_cpu
->tune
->max_case_values
;
7071 return optimize_size
? default_case_values_threshold () : 17;
7074 /* Return true if register REGNO is a valid index register.
7075 STRICT_P is true if REG_OK_STRICT is in effect. */
7078 aarch64_regno_ok_for_index_p (int regno
, bool strict_p
)
7080 if (!HARD_REGISTER_NUM_P (regno
))
7088 regno
= reg_renumber
[regno
];
7090 return GP_REGNUM_P (regno
);
7093 /* Return true if register REGNO is a valid base register for mode MODE.
7094 STRICT_P is true if REG_OK_STRICT is in effect. */
7097 aarch64_regno_ok_for_base_p (int regno
, bool strict_p
)
7099 if (!HARD_REGISTER_NUM_P (regno
))
7107 regno
= reg_renumber
[regno
];
7110 /* The fake registers will be eliminated to either the stack or
7111 hard frame pointer, both of which are usually valid base registers.
7112 Reload deals with the cases where the eliminated form isn't valid. */
7113 return (GP_REGNUM_P (regno
)
7114 || regno
== SP_REGNUM
7115 || regno
== FRAME_POINTER_REGNUM
7116 || regno
== ARG_POINTER_REGNUM
);
7119 /* Return true if X is a valid base register for mode MODE.
7120 STRICT_P is true if REG_OK_STRICT is in effect. */
7123 aarch64_base_register_rtx_p (rtx x
, bool strict_p
)
7126 && GET_CODE (x
) == SUBREG
7127 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (x
))])
7130 return (REG_P (x
) && aarch64_regno_ok_for_base_p (REGNO (x
), strict_p
));
7133 /* Return true if address offset is a valid index. If it is, fill in INFO
7134 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
7137 aarch64_classify_index (struct aarch64_address_info
*info
, rtx x
,
7138 machine_mode mode
, bool strict_p
)
7140 enum aarch64_address_type type
;
7145 if ((REG_P (x
) || GET_CODE (x
) == SUBREG
)
7146 && GET_MODE (x
) == Pmode
)
7148 type
= ADDRESS_REG_REG
;
7152 /* (sign_extend:DI (reg:SI)) */
7153 else if ((GET_CODE (x
) == SIGN_EXTEND
7154 || GET_CODE (x
) == ZERO_EXTEND
)
7155 && GET_MODE (x
) == DImode
7156 && GET_MODE (XEXP (x
, 0)) == SImode
)
7158 type
= (GET_CODE (x
) == SIGN_EXTEND
)
7159 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
7160 index
= XEXP (x
, 0);
7163 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
7164 else if (GET_CODE (x
) == MULT
7165 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
7166 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
7167 && GET_MODE (XEXP (x
, 0)) == DImode
7168 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
7169 && CONST_INT_P (XEXP (x
, 1)))
7171 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
7172 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
7173 index
= XEXP (XEXP (x
, 0), 0);
7174 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
7176 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
7177 else if (GET_CODE (x
) == ASHIFT
7178 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
7179 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
7180 && GET_MODE (XEXP (x
, 0)) == DImode
7181 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
7182 && CONST_INT_P (XEXP (x
, 1)))
7184 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
7185 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
7186 index
= XEXP (XEXP (x
, 0), 0);
7187 shift
= INTVAL (XEXP (x
, 1));
7189 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
7190 else if ((GET_CODE (x
) == SIGN_EXTRACT
7191 || GET_CODE (x
) == ZERO_EXTRACT
)
7192 && GET_MODE (x
) == DImode
7193 && GET_CODE (XEXP (x
, 0)) == MULT
7194 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
7195 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
7197 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
7198 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
7199 index
= XEXP (XEXP (x
, 0), 0);
7200 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
7201 if (INTVAL (XEXP (x
, 1)) != 32 + shift
7202 || INTVAL (XEXP (x
, 2)) != 0)
7205 /* (and:DI (mult:DI (reg:DI) (const_int scale))
7206 (const_int 0xffffffff<<shift)) */
7207 else if (GET_CODE (x
) == AND
7208 && GET_MODE (x
) == DImode
7209 && GET_CODE (XEXP (x
, 0)) == MULT
7210 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
7211 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
7212 && CONST_INT_P (XEXP (x
, 1)))
7214 type
= ADDRESS_REG_UXTW
;
7215 index
= XEXP (XEXP (x
, 0), 0);
7216 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
7217 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
7220 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
7221 else if ((GET_CODE (x
) == SIGN_EXTRACT
7222 || GET_CODE (x
) == ZERO_EXTRACT
)
7223 && GET_MODE (x
) == DImode
7224 && GET_CODE (XEXP (x
, 0)) == ASHIFT
7225 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
7226 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
7228 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
7229 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
7230 index
= XEXP (XEXP (x
, 0), 0);
7231 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
7232 if (INTVAL (XEXP (x
, 1)) != 32 + shift
7233 || INTVAL (XEXP (x
, 2)) != 0)
7236 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
7237 (const_int 0xffffffff<<shift)) */
7238 else if (GET_CODE (x
) == AND
7239 && GET_MODE (x
) == DImode
7240 && GET_CODE (XEXP (x
, 0)) == ASHIFT
7241 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
7242 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
7243 && CONST_INT_P (XEXP (x
, 1)))
7245 type
= ADDRESS_REG_UXTW
;
7246 index
= XEXP (XEXP (x
, 0), 0);
7247 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
7248 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
7251 /* (mult:P (reg:P) (const_int scale)) */
7252 else if (GET_CODE (x
) == MULT
7253 && GET_MODE (x
) == Pmode
7254 && GET_MODE (XEXP (x
, 0)) == Pmode
7255 && CONST_INT_P (XEXP (x
, 1)))
7257 type
= ADDRESS_REG_REG
;
7258 index
= XEXP (x
, 0);
7259 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
7261 /* (ashift:P (reg:P) (const_int shift)) */
7262 else if (GET_CODE (x
) == ASHIFT
7263 && GET_MODE (x
) == Pmode
7264 && GET_MODE (XEXP (x
, 0)) == Pmode
7265 && CONST_INT_P (XEXP (x
, 1)))
7267 type
= ADDRESS_REG_REG
;
7268 index
= XEXP (x
, 0);
7269 shift
= INTVAL (XEXP (x
, 1));
7275 && GET_CODE (index
) == SUBREG
7276 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (index
))])
7277 index
= SUBREG_REG (index
);
7279 if (aarch64_sve_data_mode_p (mode
))
7281 if (type
!= ADDRESS_REG_REG
7282 || (1 << shift
) != GET_MODE_UNIT_SIZE (mode
))
7288 && !(IN_RANGE (shift
, 1, 3)
7289 && known_eq (1 << shift
, GET_MODE_SIZE (mode
))))
7294 && aarch64_regno_ok_for_index_p (REGNO (index
), strict_p
))
7297 info
->offset
= index
;
7298 info
->shift
= shift
;
7305 /* Return true if MODE is one of the modes for which we
7306 support LDP/STP operations. */
7309 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode
)
7311 return mode
== SImode
|| mode
== DImode
7312 || mode
== SFmode
|| mode
== DFmode
7313 || (aarch64_vector_mode_supported_p (mode
)
7314 && (known_eq (GET_MODE_SIZE (mode
), 8)
7315 || (known_eq (GET_MODE_SIZE (mode
), 16)
7316 && (aarch64_tune_params
.extra_tuning_flags
7317 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
) == 0)));
7320 /* Return true if REGNO is a virtual pointer register, or an eliminable
7321 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
7322 include stack_pointer or hard_frame_pointer. */
7324 virt_or_elim_regno_p (unsigned regno
)
7326 return ((regno
>= FIRST_VIRTUAL_REGISTER
7327 && regno
<= LAST_VIRTUAL_POINTER_REGISTER
)
7328 || regno
== FRAME_POINTER_REGNUM
7329 || regno
== ARG_POINTER_REGNUM
);
7332 /* Return true if X is a valid address of type TYPE for machine mode MODE.
7333 If it is, fill in INFO appropriately. STRICT_P is true if
7334 REG_OK_STRICT is in effect. */
7337 aarch64_classify_address (struct aarch64_address_info
*info
,
7338 rtx x
, machine_mode mode
, bool strict_p
,
7339 aarch64_addr_query_type type
)
7341 enum rtx_code code
= GET_CODE (x
);
7345 HOST_WIDE_INT const_size
;
7347 /* On BE, we use load/store pair for all large int mode load/stores.
7348 TI/TFmode may also use a load/store pair. */
7349 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
7350 bool advsimd_struct_p
= (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
));
7351 bool load_store_pair_p
= (type
== ADDR_QUERY_LDP_STP
7352 || type
== ADDR_QUERY_LDP_STP_N
7355 || (BYTES_BIG_ENDIAN
&& advsimd_struct_p
));
7357 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
7358 corresponds to the actual size of the memory being loaded/stored and the
7359 mode of the corresponding addressing mode is half of that. */
7360 if (type
== ADDR_QUERY_LDP_STP_N
7361 && known_eq (GET_MODE_SIZE (mode
), 16))
7364 bool allow_reg_index_p
= (!load_store_pair_p
7365 && (known_lt (GET_MODE_SIZE (mode
), 16)
7366 || vec_flags
== VEC_ADVSIMD
7367 || vec_flags
& VEC_SVE_DATA
));
7369 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
7370 [Rn, #offset, MUL VL]. */
7371 if ((vec_flags
& (VEC_SVE_DATA
| VEC_SVE_PRED
)) != 0
7372 && (code
!= REG
&& code
!= PLUS
))
7375 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
7377 if (advsimd_struct_p
7378 && !BYTES_BIG_ENDIAN
7379 && (code
!= POST_INC
&& code
!= REG
))
7382 gcc_checking_assert (GET_MODE (x
) == VOIDmode
7383 || SCALAR_INT_MODE_P (GET_MODE (x
)));
7389 info
->type
= ADDRESS_REG_IMM
;
7391 info
->offset
= const0_rtx
;
7392 info
->const_offset
= 0;
7393 return aarch64_base_register_rtx_p (x
, strict_p
);
7401 && virt_or_elim_regno_p (REGNO (op0
))
7402 && poly_int_rtx_p (op1
, &offset
))
7404 info
->type
= ADDRESS_REG_IMM
;
7407 info
->const_offset
= offset
;
7412 if (maybe_ne (GET_MODE_SIZE (mode
), 0)
7413 && aarch64_base_register_rtx_p (op0
, strict_p
)
7414 && poly_int_rtx_p (op1
, &offset
))
7416 info
->type
= ADDRESS_REG_IMM
;
7419 info
->const_offset
= offset
;
7421 /* TImode and TFmode values are allowed in both pairs of X
7422 registers and individual Q registers. The available
7424 X,X: 7-bit signed scaled offset
7425 Q: 9-bit signed offset
7426 We conservatively require an offset representable in either mode.
7427 When performing the check for pairs of X registers i.e. LDP/STP
7428 pass down DImode since that is the natural size of the LDP/STP
7429 instruction memory accesses. */
7430 if (mode
== TImode
|| mode
== TFmode
)
7431 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
7432 && (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
7433 || offset_12bit_unsigned_scaled_p (mode
, offset
)));
7435 /* A 7bit offset check because OImode will emit a ldp/stp
7436 instruction (only big endian will get here).
7437 For ldp/stp instructions, the offset is scaled for the size of a
7438 single element of the pair. */
7440 return aarch64_offset_7bit_signed_scaled_p (TImode
, offset
);
7442 /* Three 9/12 bit offsets checks because CImode will emit three
7443 ldr/str instructions (only big endian will get here). */
7445 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
7446 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode
,
7448 || offset_12bit_unsigned_scaled_p (V16QImode
,
7451 /* Two 7bit offsets checks because XImode will emit two ldp/stp
7452 instructions (only big endian will get here). */
7454 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
7455 && aarch64_offset_7bit_signed_scaled_p (TImode
,
7458 /* Make "m" use the LD1 offset range for SVE data modes, so
7459 that pre-RTL optimizers like ivopts will work to that
7460 instead of the wider LDR/STR range. */
7461 if (vec_flags
== VEC_SVE_DATA
)
7462 return (type
== ADDR_QUERY_M
7463 ? offset_4bit_signed_scaled_p (mode
, offset
)
7464 : offset_9bit_signed_scaled_p (mode
, offset
));
7466 if (vec_flags
== (VEC_SVE_DATA
| VEC_STRUCT
))
7468 poly_int64 end_offset
= (offset
7469 + GET_MODE_SIZE (mode
)
7470 - BYTES_PER_SVE_VECTOR
);
7471 return (type
== ADDR_QUERY_M
7472 ? offset_4bit_signed_scaled_p (mode
, offset
)
7473 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE
, offset
)
7474 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE
,
7478 if (vec_flags
== VEC_SVE_PRED
)
7479 return offset_9bit_signed_scaled_p (mode
, offset
);
7481 if (load_store_pair_p
)
7482 return ((known_eq (GET_MODE_SIZE (mode
), 4)
7483 || known_eq (GET_MODE_SIZE (mode
), 8)
7484 || known_eq (GET_MODE_SIZE (mode
), 16))
7485 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
7487 return (aarch64_offset_9bit_signed_unscaled_p (mode
, offset
)
7488 || offset_12bit_unsigned_scaled_p (mode
, offset
));
7491 if (allow_reg_index_p
)
7493 /* Look for base + (scaled/extended) index register. */
7494 if (aarch64_base_register_rtx_p (op0
, strict_p
)
7495 && aarch64_classify_index (info
, op1
, mode
, strict_p
))
7500 if (aarch64_base_register_rtx_p (op1
, strict_p
)
7501 && aarch64_classify_index (info
, op0
, mode
, strict_p
))
7514 info
->type
= ADDRESS_REG_WB
;
7515 info
->base
= XEXP (x
, 0);
7516 info
->offset
= NULL_RTX
;
7517 return aarch64_base_register_rtx_p (info
->base
, strict_p
);
7521 info
->type
= ADDRESS_REG_WB
;
7522 info
->base
= XEXP (x
, 0);
7523 if (GET_CODE (XEXP (x
, 1)) == PLUS
7524 && poly_int_rtx_p (XEXP (XEXP (x
, 1), 1), &offset
)
7525 && rtx_equal_p (XEXP (XEXP (x
, 1), 0), info
->base
)
7526 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
7528 info
->offset
= XEXP (XEXP (x
, 1), 1);
7529 info
->const_offset
= offset
;
7531 /* TImode and TFmode values are allowed in both pairs of X
7532 registers and individual Q registers. The available
7534 X,X: 7-bit signed scaled offset
7535 Q: 9-bit signed offset
7536 We conservatively require an offset representable in either mode.
7538 if (mode
== TImode
|| mode
== TFmode
)
7539 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
7540 && aarch64_offset_9bit_signed_unscaled_p (mode
, offset
));
7542 if (load_store_pair_p
)
7543 return ((known_eq (GET_MODE_SIZE (mode
), 4)
7544 || known_eq (GET_MODE_SIZE (mode
), 8)
7545 || known_eq (GET_MODE_SIZE (mode
), 16))
7546 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
7548 return aarch64_offset_9bit_signed_unscaled_p (mode
, offset
);
7555 /* load literal: pc-relative constant pool entry. Only supported
7556 for SI mode or larger. */
7557 info
->type
= ADDRESS_SYMBOLIC
;
7559 if (!load_store_pair_p
7560 && GET_MODE_SIZE (mode
).is_constant (&const_size
)
7565 split_const (x
, &sym
, &addend
);
7566 return ((GET_CODE (sym
) == LABEL_REF
7567 || (GET_CODE (sym
) == SYMBOL_REF
7568 && CONSTANT_POOL_ADDRESS_P (sym
)
7569 && aarch64_pcrelative_literal_loads
)));
7574 info
->type
= ADDRESS_LO_SUM
;
7575 info
->base
= XEXP (x
, 0);
7576 info
->offset
= XEXP (x
, 1);
7577 if (allow_reg_index_p
7578 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
7581 split_const (info
->offset
, &sym
, &offs
);
7582 if (GET_CODE (sym
) == SYMBOL_REF
7583 && (aarch64_classify_symbol (sym
, INTVAL (offs
))
7584 == SYMBOL_SMALL_ABSOLUTE
))
7586 /* The symbol and offset must be aligned to the access size. */
7589 if (CONSTANT_POOL_ADDRESS_P (sym
))
7590 align
= GET_MODE_ALIGNMENT (get_pool_mode (sym
));
7591 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym
))
7593 tree exp
= SYMBOL_REF_DECL (sym
);
7594 align
= TYPE_ALIGN (TREE_TYPE (exp
));
7595 align
= aarch64_constant_alignment (exp
, align
);
7597 else if (SYMBOL_REF_DECL (sym
))
7598 align
= DECL_ALIGN (SYMBOL_REF_DECL (sym
));
7599 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym
)
7600 && SYMBOL_REF_BLOCK (sym
) != NULL
)
7601 align
= SYMBOL_REF_BLOCK (sym
)->alignment
;
7603 align
= BITS_PER_UNIT
;
7605 poly_int64 ref_size
= GET_MODE_SIZE (mode
);
7606 if (known_eq (ref_size
, 0))
7607 ref_size
= GET_MODE_SIZE (DImode
);
7609 return (multiple_p (INTVAL (offs
), ref_size
)
7610 && multiple_p (align
/ BITS_PER_UNIT
, ref_size
));
7620 /* Return true if the address X is valid for a PRFM instruction.
7621 STRICT_P is true if we should do strict checking with
7622 aarch64_classify_address. */
7625 aarch64_address_valid_for_prefetch_p (rtx x
, bool strict_p
)
7627 struct aarch64_address_info addr
;
7629 /* PRFM accepts the same addresses as DImode... */
7630 bool res
= aarch64_classify_address (&addr
, x
, DImode
, strict_p
);
7634 /* ... except writeback forms. */
7635 return addr
.type
!= ADDRESS_REG_WB
;
7639 aarch64_symbolic_address_p (rtx x
)
7643 split_const (x
, &x
, &offset
);
7644 return GET_CODE (x
) == SYMBOL_REF
|| GET_CODE (x
) == LABEL_REF
;
7647 /* Classify the base of symbolic expression X. */
7649 enum aarch64_symbol_type
7650 aarch64_classify_symbolic_expression (rtx x
)
7654 split_const (x
, &x
, &offset
);
7655 return aarch64_classify_symbol (x
, INTVAL (offset
));
7659 /* Return TRUE if X is a legitimate address for accessing memory in
7662 aarch64_legitimate_address_hook_p (machine_mode mode
, rtx x
, bool strict_p
)
7664 struct aarch64_address_info addr
;
7666 return aarch64_classify_address (&addr
, x
, mode
, strict_p
);
7669 /* Return TRUE if X is a legitimate address of type TYPE for accessing
7670 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
7672 aarch64_legitimate_address_p (machine_mode mode
, rtx x
, bool strict_p
,
7673 aarch64_addr_query_type type
)
7675 struct aarch64_address_info addr
;
7677 return aarch64_classify_address (&addr
, x
, mode
, strict_p
, type
);
7680 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
7683 aarch64_legitimize_address_displacement (rtx
*offset1
, rtx
*offset2
,
7684 poly_int64 orig_offset
,
7688 if (GET_MODE_SIZE (mode
).is_constant (&size
))
7690 HOST_WIDE_INT const_offset
, second_offset
;
7692 /* A general SVE offset is A * VQ + B. Remove the A component from
7693 coefficient 0 in order to get the constant B. */
7694 const_offset
= orig_offset
.coeffs
[0] - orig_offset
.coeffs
[1];
7696 /* Split an out-of-range address displacement into a base and
7697 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
7698 range otherwise to increase opportunities for sharing the base
7699 address of different sizes. Unaligned accesses use the signed
7700 9-bit range, TImode/TFmode use the intersection of signed
7701 scaled 7-bit and signed 9-bit offset. */
7702 if (mode
== TImode
|| mode
== TFmode
)
7703 second_offset
= ((const_offset
+ 0x100) & 0x1f8) - 0x100;
7704 else if ((const_offset
& (size
- 1)) != 0)
7705 second_offset
= ((const_offset
+ 0x100) & 0x1ff) - 0x100;
7707 second_offset
= const_offset
& (size
< 4 ? 0xfff : 0x3ffc);
7709 if (second_offset
== 0 || known_eq (orig_offset
, second_offset
))
7712 /* Split the offset into second_offset and the rest. */
7713 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
7714 *offset2
= gen_int_mode (second_offset
, Pmode
);
7719 /* Get the mode we should use as the basis of the range. For structure
7720 modes this is the mode of one vector. */
7721 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
7722 machine_mode step_mode
7723 = (vec_flags
& VEC_STRUCT
) != 0 ? SVE_BYTE_MODE
: mode
;
7725 /* Get the "mul vl" multiplier we'd like to use. */
7726 HOST_WIDE_INT factor
= GET_MODE_SIZE (step_mode
).coeffs
[1];
7727 HOST_WIDE_INT vnum
= orig_offset
.coeffs
[1] / factor
;
7728 if (vec_flags
& VEC_SVE_DATA
)
7729 /* LDR supports a 9-bit range, but the move patterns for
7730 structure modes require all vectors to be in range of the
7731 same base. The simplest way of accomodating that while still
7732 promoting reuse of anchor points between different modes is
7733 to use an 8-bit range unconditionally. */
7734 vnum
= ((vnum
+ 128) & 255) - 128;
7736 /* Predicates are only handled singly, so we might as well use
7738 vnum
= ((vnum
+ 256) & 511) - 256;
7742 /* Convert the "mul vl" multiplier into a byte offset. */
7743 poly_int64 second_offset
= GET_MODE_SIZE (step_mode
) * vnum
;
7744 if (known_eq (second_offset
, orig_offset
))
7747 /* Split the offset into second_offset and the rest. */
7748 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
7749 *offset2
= gen_int_mode (second_offset
, Pmode
);
7754 /* Return the binary representation of floating point constant VALUE in INTVAL.
7755 If the value cannot be converted, return false without setting INTVAL.
7756 The conversion is done in the given MODE. */
7758 aarch64_reinterpret_float_as_int (rtx value
, unsigned HOST_WIDE_INT
*intval
)
7761 /* We make a general exception for 0. */
7762 if (aarch64_float_const_zero_rtx_p (value
))
7768 scalar_float_mode mode
;
7769 if (GET_CODE (value
) != CONST_DOUBLE
7770 || !is_a
<scalar_float_mode
> (GET_MODE (value
), &mode
)
7771 || GET_MODE_BITSIZE (mode
) > HOST_BITS_PER_WIDE_INT
7772 /* Only support up to DF mode. */
7773 || GET_MODE_BITSIZE (mode
) > GET_MODE_BITSIZE (DFmode
))
7776 unsigned HOST_WIDE_INT ival
= 0;
7779 real_to_target (res
,
7780 CONST_DOUBLE_REAL_VALUE (value
),
7781 REAL_MODE_FORMAT (mode
));
7785 int order
= BYTES_BIG_ENDIAN
? 1 : 0;
7786 ival
= zext_hwi (res
[order
], 32);
7787 ival
|= (zext_hwi (res
[1 - order
], 32) << 32);
7790 ival
= zext_hwi (res
[0], 32);
7796 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7797 single MOV(+MOVK) followed by an FMOV. */
7799 aarch64_float_const_rtx_p (rtx x
)
7801 machine_mode mode
= GET_MODE (x
);
7802 if (mode
== VOIDmode
)
7805 /* Determine whether it's cheaper to write float constants as
7806 mov/movk pairs over ldr/adrp pairs. */
7807 unsigned HOST_WIDE_INT ival
;
7809 if (GET_CODE (x
) == CONST_DOUBLE
7810 && SCALAR_FLOAT_MODE_P (mode
)
7811 && aarch64_reinterpret_float_as_int (x
, &ival
))
7813 scalar_int_mode imode
= (mode
== HFmode
7815 : int_mode_for_mode (mode
).require ());
7816 int num_instr
= aarch64_internal_mov_immediate
7817 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
7818 return num_instr
< 3;
7824 /* Return TRUE if rtx X is immediate constant 0.0 */
7826 aarch64_float_const_zero_rtx_p (rtx x
)
7828 if (GET_MODE (x
) == VOIDmode
)
7831 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x
)))
7832 return !HONOR_SIGNED_ZEROS (GET_MODE (x
));
7833 return real_equal (CONST_DOUBLE_REAL_VALUE (x
), &dconst0
);
7836 /* Return TRUE if rtx X is immediate constant that fits in a single
7837 MOVI immediate operation. */
7839 aarch64_can_const_movi_rtx_p (rtx x
, machine_mode mode
)
7845 scalar_int_mode imode
;
7846 unsigned HOST_WIDE_INT ival
;
7848 if (GET_CODE (x
) == CONST_DOUBLE
7849 && SCALAR_FLOAT_MODE_P (mode
))
7851 if (!aarch64_reinterpret_float_as_int (x
, &ival
))
7854 /* We make a general exception for 0. */
7855 if (aarch64_float_const_zero_rtx_p (x
))
7858 imode
= int_mode_for_mode (mode
).require ();
7860 else if (GET_CODE (x
) == CONST_INT
7861 && is_a
<scalar_int_mode
> (mode
, &imode
))
7866 /* use a 64 bit mode for everything except for DI/DF mode, where we use
7867 a 128 bit vector mode. */
7868 int width
= GET_MODE_BITSIZE (imode
) == 64 ? 128 : 64;
7870 vmode
= aarch64_simd_container_mode (imode
, width
);
7871 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, ival
);
7873 return aarch64_simd_valid_immediate (v_op
, NULL
);
7877 /* Return the fixed registers used for condition codes. */
7880 aarch64_fixed_condition_code_regs (unsigned int *p1
, unsigned int *p2
)
7883 *p2
= INVALID_REGNUM
;
7887 /* This function is used by the call expanders of the machine description.
7888 RESULT is the register in which the result is returned. It's NULL for
7889 "call" and "sibcall".
7890 MEM is the location of the function call.
7891 SIBCALL indicates whether this function call is normal call or sibling call.
7892 It will generate different pattern accordingly. */
7895 aarch64_expand_call (rtx result
, rtx mem
, bool sibcall
)
7897 rtx call
, callee
, tmp
;
7901 gcc_assert (MEM_P (mem
));
7902 callee
= XEXP (mem
, 0);
7903 mode
= GET_MODE (callee
);
7904 gcc_assert (mode
== Pmode
);
7906 /* Decide if we should generate indirect calls by loading the
7907 address of the callee into a register before performing
7908 the branch-and-link. */
7909 if (SYMBOL_REF_P (callee
)
7910 ? (aarch64_is_long_call_p (callee
)
7911 || aarch64_is_noplt_call_p (callee
))
7913 XEXP (mem
, 0) = force_reg (mode
, callee
);
7915 call
= gen_rtx_CALL (VOIDmode
, mem
, const0_rtx
);
7917 if (result
!= NULL_RTX
)
7918 call
= gen_rtx_SET (result
, call
);
7923 tmp
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (Pmode
, LR_REGNUM
));
7925 vec
= gen_rtvec (2, call
, tmp
);
7926 call
= gen_rtx_PARALLEL (VOIDmode
, vec
);
7928 aarch64_emit_call_insn (call
);
7931 /* Emit call insn with PAT and do aarch64-specific handling. */
7934 aarch64_emit_call_insn (rtx pat
)
7936 rtx insn
= emit_call_insn (pat
);
7938 rtx
*fusage
= &CALL_INSN_FUNCTION_USAGE (insn
);
7939 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP0_REGNUM
));
7940 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP1_REGNUM
));
7944 aarch64_select_cc_mode (RTX_CODE code
, rtx x
, rtx y
)
7946 machine_mode mode_x
= GET_MODE (x
);
7947 rtx_code code_x
= GET_CODE (x
);
7949 /* All floating point compares return CCFP if it is an equality
7950 comparison, and CCFPE otherwise. */
7951 if (GET_MODE_CLASS (mode_x
) == MODE_FLOAT
)
7978 /* Equality comparisons of short modes against zero can be performed
7979 using the TST instruction with the appropriate bitmask. */
7980 if (y
== const0_rtx
&& (REG_P (x
) || SUBREG_P (x
))
7981 && (code
== EQ
|| code
== NE
)
7982 && (mode_x
== HImode
|| mode_x
== QImode
))
7985 /* Similarly, comparisons of zero_extends from shorter modes can
7986 be performed using an ANDS with an immediate mask. */
7987 if (y
== const0_rtx
&& code_x
== ZERO_EXTEND
7988 && (mode_x
== SImode
|| mode_x
== DImode
)
7989 && (GET_MODE (XEXP (x
, 0)) == HImode
|| GET_MODE (XEXP (x
, 0)) == QImode
)
7990 && (code
== EQ
|| code
== NE
))
7993 if ((mode_x
== SImode
|| mode_x
== DImode
)
7995 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
)
7996 && (code_x
== PLUS
|| code_x
== MINUS
|| code_x
== AND
7998 || (code_x
== ZERO_EXTRACT
&& CONST_INT_P (XEXP (x
, 1))
7999 && CONST_INT_P (XEXP (x
, 2)))))
8002 /* A compare with a shifted operand. Because of canonicalization,
8003 the comparison will have to be swapped when we emit the assembly
8005 if ((mode_x
== SImode
|| mode_x
== DImode
)
8006 && (REG_P (y
) || GET_CODE (y
) == SUBREG
|| y
== const0_rtx
)
8007 && (code_x
== ASHIFT
|| code_x
== ASHIFTRT
8008 || code_x
== LSHIFTRT
8009 || code_x
== ZERO_EXTEND
|| code_x
== SIGN_EXTEND
))
8012 /* Similarly for a negated operand, but we can only do this for
8014 if ((mode_x
== SImode
|| mode_x
== DImode
)
8015 && (REG_P (y
) || GET_CODE (y
) == SUBREG
)
8016 && (code
== EQ
|| code
== NE
)
8020 /* A test for unsigned overflow from an addition. */
8021 if ((mode_x
== DImode
|| mode_x
== TImode
)
8022 && (code
== LTU
|| code
== GEU
)
8024 && rtx_equal_p (XEXP (x
, 0), y
))
8027 /* A test for unsigned overflow from an add with carry. */
8028 if ((mode_x
== DImode
|| mode_x
== TImode
)
8029 && (code
== LTU
|| code
== GEU
)
8031 && CONST_SCALAR_INT_P (y
)
8032 && (rtx_mode_t (y
, mode_x
)
8033 == (wi::shwi (1, mode_x
)
8034 << (GET_MODE_BITSIZE (mode_x
).to_constant () / 2))))
8037 /* A test for signed overflow. */
8038 if ((mode_x
== DImode
|| mode_x
== TImode
)
8041 && GET_CODE (y
) == SIGN_EXTEND
)
8044 /* For everything else, return CCmode. */
8049 aarch64_get_condition_code_1 (machine_mode
, enum rtx_code
);
8052 aarch64_get_condition_code (rtx x
)
8054 machine_mode mode
= GET_MODE (XEXP (x
, 0));
8055 enum rtx_code comp_code
= GET_CODE (x
);
8057 if (GET_MODE_CLASS (mode
) != MODE_CC
)
8058 mode
= SELECT_CC_MODE (comp_code
, XEXP (x
, 0), XEXP (x
, 1));
8059 return aarch64_get_condition_code_1 (mode
, comp_code
);
8063 aarch64_get_condition_code_1 (machine_mode mode
, enum rtx_code comp_code
)
8071 case GE
: return AARCH64_GE
;
8072 case GT
: return AARCH64_GT
;
8073 case LE
: return AARCH64_LS
;
8074 case LT
: return AARCH64_MI
;
8075 case NE
: return AARCH64_NE
;
8076 case EQ
: return AARCH64_EQ
;
8077 case ORDERED
: return AARCH64_VC
;
8078 case UNORDERED
: return AARCH64_VS
;
8079 case UNLT
: return AARCH64_LT
;
8080 case UNLE
: return AARCH64_LE
;
8081 case UNGT
: return AARCH64_HI
;
8082 case UNGE
: return AARCH64_PL
;
8090 case NE
: return AARCH64_NE
;
8091 case EQ
: return AARCH64_EQ
;
8092 case GE
: return AARCH64_GE
;
8093 case GT
: return AARCH64_GT
;
8094 case LE
: return AARCH64_LE
;
8095 case LT
: return AARCH64_LT
;
8096 case GEU
: return AARCH64_CS
;
8097 case GTU
: return AARCH64_HI
;
8098 case LEU
: return AARCH64_LS
;
8099 case LTU
: return AARCH64_CC
;
8107 case NE
: return AARCH64_NE
;
8108 case EQ
: return AARCH64_EQ
;
8109 case GE
: return AARCH64_LE
;
8110 case GT
: return AARCH64_LT
;
8111 case LE
: return AARCH64_GE
;
8112 case LT
: return AARCH64_GT
;
8113 case GEU
: return AARCH64_LS
;
8114 case GTU
: return AARCH64_CC
;
8115 case LEU
: return AARCH64_CS
;
8116 case LTU
: return AARCH64_HI
;
8124 case NE
: return AARCH64_NE
; /* = any */
8125 case EQ
: return AARCH64_EQ
; /* = none */
8126 case GE
: return AARCH64_PL
; /* = nfrst */
8127 case LT
: return AARCH64_MI
; /* = first */
8128 case GEU
: return AARCH64_CS
; /* = nlast */
8129 case GTU
: return AARCH64_HI
; /* = pmore */
8130 case LEU
: return AARCH64_LS
; /* = plast */
8131 case LTU
: return AARCH64_CC
; /* = last */
8139 case NE
: return AARCH64_NE
;
8140 case EQ
: return AARCH64_EQ
;
8141 case GE
: return AARCH64_PL
;
8142 case LT
: return AARCH64_MI
;
8150 case NE
: return AARCH64_NE
;
8151 case EQ
: return AARCH64_EQ
;
8159 case LTU
: return AARCH64_CS
;
8160 case GEU
: return AARCH64_CC
;
8168 case GEU
: return AARCH64_CS
;
8169 case LTU
: return AARCH64_CC
;
8177 case NE
: return AARCH64_VS
;
8178 case EQ
: return AARCH64_VC
;
8191 aarch64_const_vec_all_same_in_range_p (rtx x
,
8192 HOST_WIDE_INT minval
,
8193 HOST_WIDE_INT maxval
)
8196 return (const_vec_duplicate_p (x
, &elt
)
8197 && CONST_INT_P (elt
)
8198 && IN_RANGE (INTVAL (elt
), minval
, maxval
));
8202 aarch64_const_vec_all_same_int_p (rtx x
, HOST_WIDE_INT val
)
8204 return aarch64_const_vec_all_same_in_range_p (x
, val
, val
);
8207 /* Return true if VEC is a constant in which every element is in the range
8208 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
8211 aarch64_const_vec_all_in_range_p (rtx vec
,
8212 HOST_WIDE_INT minval
,
8213 HOST_WIDE_INT maxval
)
8215 if (GET_CODE (vec
) != CONST_VECTOR
8216 || GET_MODE_CLASS (GET_MODE (vec
)) != MODE_VECTOR_INT
)
8220 if (!CONST_VECTOR_STEPPED_P (vec
))
8221 nunits
= const_vector_encoded_nelts (vec
);
8222 else if (!CONST_VECTOR_NUNITS (vec
).is_constant (&nunits
))
8225 for (int i
= 0; i
< nunits
; i
++)
8227 rtx vec_elem
= CONST_VECTOR_ELT (vec
, i
);
8228 if (!CONST_INT_P (vec_elem
)
8229 || !IN_RANGE (INTVAL (vec_elem
), minval
, maxval
))
/* Bit positions of the N/Z/C/V flags within an NZCV immediate.  */
#define AARCH64_CC_V 1
#define AARCH64_CC_C (1 << 1)
#define AARCH64_CC_Z (1 << 2)
#define AARCH64_CC_N (1 << 3)

/* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
static const int aarch64_nzcv_codes[] =
{
  0,		/* EQ, Z == 1.  */
  AARCH64_CC_Z,	/* NE, Z == 0.  */
  0,		/* CS, C == 1.  */
  AARCH64_CC_C,	/* CC, C == 0.  */
  0,		/* MI, N == 1.  */
  AARCH64_CC_N, /* PL, N == 0.  */
  0,		/* VS, V == 1.  */
  AARCH64_CC_V, /* VC, V == 0.  */
  0,		/* HI, C ==1 && Z == 0.  */
  AARCH64_CC_C,	/* LS, !(C == 1 && Z == 0).  */
  AARCH64_CC_V,	/* GE, N == V.  */
  0,		/* LT, N != V.  */
  AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
  0,		/* LE, !(Z == 0 && N == V).  */
};
8262 /* Print floating-point vector immediate operand X to F, negating it
8263 first if NEGATE is true. Return true on success, false if it isn't
8264 a constant we can handle. */
8267 aarch64_print_vector_float_operand (FILE *f
, rtx x
, bool negate
)
8271 if (!const_vec_duplicate_p (x
, &elt
))
8274 REAL_VALUE_TYPE r
= *CONST_DOUBLE_REAL_VALUE (elt
);
8276 r
= real_value_negate (&r
);
8278 /* We only handle the SVE single-bit immediates here. */
8279 if (real_equal (&r
, &dconst0
))
8280 asm_fprintf (f
, "0.0");
8281 else if (real_equal (&r
, &dconst1
))
8282 asm_fprintf (f
, "1.0");
8283 else if (real_equal (&r
, &dconsthalf
))
8284 asm_fprintf (f
, "0.5");
8291 /* Return the equivalent letter for size. */
8293 sizetochar (int size
)
8297 case 64: return 'd';
8298 case 32: return 's';
8299 case 16: return 'h';
8300 case 8 : return 'b';
8301 default: gcc_unreachable ();
8305 /* Print operand X to file F in a target specific manner according to CODE.
8306 The acceptable formatting commands given by CODE are:
8307 'c': An integer or symbol address without a preceding #
8309 'C': Take the duplicated element in a vector constant
8310 and print it in hex.
8311 'D': Take the duplicated element in a vector constant
8312 and print it as an unsigned integer, in decimal.
8313 'e': Print the sign/zero-extend size as a character 8->b,
8315 'p': Prints N such that 2^N == X (X must be power of 2 and
8317 'P': Print the number of non-zero bits in X (a const_int).
8318 'H': Print the higher numbered register of a pair (TImode)
8320 'm': Print a condition (eq, ne, etc).
8321 'M': Same as 'm', but invert condition.
8322 'N': Take the duplicated element in a vector constant
8323 and print the negative of it in decimal.
8324 'b/h/s/d/q': Print a scalar FP/SIMD register name.
8325 'S/T/U/V': Print a FP/SIMD register name for a register list.
8326 The register printed is the FP/SIMD register name
8327 of X + 0/1/2/3 for S/T/U/V.
8328 'R': Print a scalar FP/SIMD register name + 1.
8329 'X': Print bottom 16 bits of integer constant in hex.
8330 'w/x': Print a general register name or the zero register
8332 '0': Print a normal operand, if it's a general register,
8333 then we assume DImode.
8334 'k': Print NZCV for conditional compare instructions.
8335 'A': Output address constant representing the first
8336 argument of X, specifying a relocation offset
8338 'L': Output constant address specified by X
8339 with a relocation offset if appropriate.
8340 'G': Prints address of X, specifying a PC relative
8341 relocation mode if appropriate.
8342 'y': Output address of LDP or STP - this is used for
8343 some LDP/STPs which don't use a PARALLEL in their
8344 pattern (so the mode needs to be adjusted).
8345 'z': Output address of a typical LDP or STP. */
8348 aarch64_print_operand (FILE *f
, rtx x
, int code
)
8354 switch (GET_CODE (x
))
8357 fprintf (f
, HOST_WIDE_INT_PRINT_DEC
, INTVAL (x
));
8361 output_addr_const (f
, x
);
8365 if (GET_CODE (XEXP (x
, 0)) == PLUS
8366 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
)
8368 output_addr_const (f
, x
);
8374 output_operand_lossage ("unsupported operand for code '%c'", code
);
8382 if (!CONST_INT_P (x
)
8383 || (n
= exact_log2 (INTVAL (x
) & ~7)) <= 0)
8385 output_operand_lossage ("invalid operand for '%%%c'", code
);
8401 output_operand_lossage ("invalid operand for '%%%c'", code
);
8411 if (!CONST_INT_P (x
) || (n
= exact_log2 (INTVAL (x
))) < 0)
8413 output_operand_lossage ("invalid operand for '%%%c'", code
);
8417 asm_fprintf (f
, "%d", n
);
8422 if (!CONST_INT_P (x
))
8424 output_operand_lossage ("invalid operand for '%%%c'", code
);
8428 asm_fprintf (f
, "%u", popcount_hwi (INTVAL (x
)));
8432 if (x
== const0_rtx
)
8434 asm_fprintf (f
, "xzr");
8438 if (!REG_P (x
) || !GP_REGNUM_P (REGNO (x
) + 1))
8440 output_operand_lossage ("invalid operand for '%%%c'", code
);
8444 asm_fprintf (f
, "%s", reg_names
[REGNO (x
) + 1]);
8451 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
8452 if (x
== const_true_rtx
)
8459 if (!COMPARISON_P (x
))
8461 output_operand_lossage ("invalid operand for '%%%c'", code
);
8465 cond_code
= aarch64_get_condition_code (x
);
8466 gcc_assert (cond_code
>= 0);
8468 cond_code
= AARCH64_INVERSE_CONDITION_CODE (cond_code
);
8469 if (GET_MODE (XEXP (x
, 0)) == CC_NZCmode
)
8470 fputs (aarch64_sve_condition_codes
[cond_code
], f
);
8472 fputs (aarch64_condition_codes
[cond_code
], f
);
8477 if (!const_vec_duplicate_p (x
, &elt
))
8479 output_operand_lossage ("invalid vector constant");
8483 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
8484 asm_fprintf (f
, "%wd", -INTVAL (elt
));
8485 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
8486 && aarch64_print_vector_float_operand (f
, x
, true))
8490 output_operand_lossage ("invalid vector constant");
8500 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
8502 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
8505 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - V0_REGNUM
);
8512 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
8514 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
8517 asm_fprintf (f
, "%c%d",
8518 aarch64_sve_data_mode_p (GET_MODE (x
)) ? 'z' : 'v',
8519 REGNO (x
) - V0_REGNUM
+ (code
- 'S'));
8523 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
8525 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
8528 asm_fprintf (f
, "q%d", REGNO (x
) - V0_REGNUM
+ 1);
8532 if (!CONST_INT_P (x
))
8534 output_operand_lossage ("invalid operand for '%%%c'", code
);
8537 asm_fprintf (f
, "0x%wx", UINTVAL (x
) & 0xffff);
8542 /* Print a replicated constant in hex. */
8543 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
8545 output_operand_lossage ("invalid operand for '%%%c'", code
);
8548 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
8549 asm_fprintf (f
, "0x%wx", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
8555 /* Print a replicated constant in decimal, treating it as
8557 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
8559 output_operand_lossage ("invalid operand for '%%%c'", code
);
8562 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
8563 asm_fprintf (f
, "%wd", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
8570 || (CONST_DOUBLE_P (x
) && aarch64_float_const_zero_rtx_p (x
)))
8572 asm_fprintf (f
, "%czr", code
);
8576 if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
8578 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - R0_REGNUM
);
8582 if (REG_P (x
) && REGNO (x
) == SP_REGNUM
)
8584 asm_fprintf (f
, "%ssp", code
== 'w' ? "w" : "");
8593 output_operand_lossage ("missing operand");
8597 switch (GET_CODE (x
))
8600 if (aarch64_sve_data_mode_p (GET_MODE (x
)))
8602 if (REG_NREGS (x
) == 1)
8603 asm_fprintf (f
, "z%d", REGNO (x
) - V0_REGNUM
);
8607 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x
)));
8608 asm_fprintf (f
, "{z%d.%c - z%d.%c}",
8609 REGNO (x
) - V0_REGNUM
, suffix
,
8610 END_REGNO (x
) - V0_REGNUM
- 1, suffix
);
8614 asm_fprintf (f
, "%s", reg_names
[REGNO (x
)]);
8618 output_address (GET_MODE (x
), XEXP (x
, 0));
8623 output_addr_const (asm_out_file
, x
);
8627 asm_fprintf (f
, "%wd", INTVAL (x
));
8631 if (!VECTOR_MODE_P (GET_MODE (x
)))
8633 output_addr_const (asm_out_file
, x
);
8639 if (!const_vec_duplicate_p (x
, &elt
))
8641 output_operand_lossage ("invalid vector constant");
8645 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
8646 asm_fprintf (f
, "%wd", INTVAL (elt
));
8647 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
8648 && aarch64_print_vector_float_operand (f
, x
, false))
8652 output_operand_lossage ("invalid vector constant");
8658 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
8659 be getting CONST_DOUBLEs holding integers. */
8660 gcc_assert (GET_MODE (x
) != VOIDmode
);
8661 if (aarch64_float_const_zero_rtx_p (x
))
8666 else if (aarch64_float_const_representable_p (x
))
8669 char float_buf
[buf_size
] = {'\0'};
8670 real_to_decimal_for_mode (float_buf
,
8671 CONST_DOUBLE_REAL_VALUE (x
),
8674 asm_fprintf (asm_out_file
, "%s", float_buf
);
8678 output_operand_lossage ("invalid constant");
8681 output_operand_lossage ("invalid operand");
8687 if (GET_CODE (x
) == HIGH
)
8690 switch (aarch64_classify_symbolic_expression (x
))
8692 case SYMBOL_SMALL_GOT_4G
:
8693 asm_fprintf (asm_out_file
, ":got:");
8696 case SYMBOL_SMALL_TLSGD
:
8697 asm_fprintf (asm_out_file
, ":tlsgd:");
8700 case SYMBOL_SMALL_TLSDESC
:
8701 asm_fprintf (asm_out_file
, ":tlsdesc:");
8704 case SYMBOL_SMALL_TLSIE
:
8705 asm_fprintf (asm_out_file
, ":gottprel:");
8708 case SYMBOL_TLSLE24
:
8709 asm_fprintf (asm_out_file
, ":tprel:");
8712 case SYMBOL_TINY_GOT
:
8719 output_addr_const (asm_out_file
, x
);
8723 switch (aarch64_classify_symbolic_expression (x
))
8725 case SYMBOL_SMALL_GOT_4G
:
8726 asm_fprintf (asm_out_file
, ":lo12:");
8729 case SYMBOL_SMALL_TLSGD
:
8730 asm_fprintf (asm_out_file
, ":tlsgd_lo12:");
8733 case SYMBOL_SMALL_TLSDESC
:
8734 asm_fprintf (asm_out_file
, ":tlsdesc_lo12:");
8737 case SYMBOL_SMALL_TLSIE
:
8738 asm_fprintf (asm_out_file
, ":gottprel_lo12:");
8741 case SYMBOL_TLSLE12
:
8742 asm_fprintf (asm_out_file
, ":tprel_lo12:");
8745 case SYMBOL_TLSLE24
:
8746 asm_fprintf (asm_out_file
, ":tprel_lo12_nc:");
8749 case SYMBOL_TINY_GOT
:
8750 asm_fprintf (asm_out_file
, ":got:");
8753 case SYMBOL_TINY_TLSIE
:
8754 asm_fprintf (asm_out_file
, ":gottprel:");
8760 output_addr_const (asm_out_file
, x
);
8764 switch (aarch64_classify_symbolic_expression (x
))
8766 case SYMBOL_TLSLE24
:
8767 asm_fprintf (asm_out_file
, ":tprel_hi12:");
8772 output_addr_const (asm_out_file
, x
);
8777 HOST_WIDE_INT cond_code
;
8779 if (!CONST_INT_P (x
))
8781 output_operand_lossage ("invalid operand for '%%%c'", code
);
8785 cond_code
= INTVAL (x
);
8786 gcc_assert (cond_code
>= 0 && cond_code
<= AARCH64_NV
);
8787 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
]);
8794 machine_mode mode
= GET_MODE (x
);
8796 if (GET_CODE (x
) != MEM
8797 || (code
== 'y' && maybe_ne (GET_MODE_SIZE (mode
), 16)))
8799 output_operand_lossage ("invalid operand for '%%%c'", code
);
8803 if (!aarch64_print_address_internal (f
, mode
, XEXP (x
, 0),
8805 ? ADDR_QUERY_LDP_STP_N
8806 : ADDR_QUERY_LDP_STP
))
8807 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
8812 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
8817 /* Print address 'x' of a memory access with mode 'mode'.
8818 'op' is the context required by aarch64_classify_address. It can either be
8819 MEM for a normal memory access or PARALLEL for LDP/STP. */
8821 aarch64_print_address_internal (FILE *f
, machine_mode mode
, rtx x
,
8822 aarch64_addr_query_type type
)
8824 struct aarch64_address_info addr
;
8827 /* Check all addresses are Pmode - including ILP32. */
8828 if (GET_MODE (x
) != Pmode
8829 && (!CONST_INT_P (x
)
8830 || trunc_int_for_mode (INTVAL (x
), Pmode
) != INTVAL (x
)))
8832 output_operand_lossage ("invalid address mode");
8836 if (aarch64_classify_address (&addr
, x
, mode
, true, type
))
8839 case ADDRESS_REG_IMM
:
8840 if (known_eq (addr
.const_offset
, 0))
8841 asm_fprintf (f
, "[%s]", reg_names
[REGNO (addr
.base
)]);
8842 else if (aarch64_sve_data_mode_p (mode
))
8845 = exact_div (addr
.const_offset
,
8846 BYTES_PER_SVE_VECTOR
).to_constant ();
8847 asm_fprintf (f
, "[%s, #%wd, mul vl]",
8848 reg_names
[REGNO (addr
.base
)], vnum
);
8850 else if (aarch64_sve_pred_mode_p (mode
))
8853 = exact_div (addr
.const_offset
,
8854 BYTES_PER_SVE_PRED
).to_constant ();
8855 asm_fprintf (f
, "[%s, #%wd, mul vl]",
8856 reg_names
[REGNO (addr
.base
)], vnum
);
8859 asm_fprintf (f
, "[%s, %wd]", reg_names
[REGNO (addr
.base
)],
8860 INTVAL (addr
.offset
));
8863 case ADDRESS_REG_REG
:
8864 if (addr
.shift
== 0)
8865 asm_fprintf (f
, "[%s, %s]", reg_names
[REGNO (addr
.base
)],
8866 reg_names
[REGNO (addr
.offset
)]);
8868 asm_fprintf (f
, "[%s, %s, lsl %u]", reg_names
[REGNO (addr
.base
)],
8869 reg_names
[REGNO (addr
.offset
)], addr
.shift
);
8872 case ADDRESS_REG_UXTW
:
8873 if (addr
.shift
== 0)
8874 asm_fprintf (f
, "[%s, w%d, uxtw]", reg_names
[REGNO (addr
.base
)],
8875 REGNO (addr
.offset
) - R0_REGNUM
);
8877 asm_fprintf (f
, "[%s, w%d, uxtw %u]", reg_names
[REGNO (addr
.base
)],
8878 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
8881 case ADDRESS_REG_SXTW
:
8882 if (addr
.shift
== 0)
8883 asm_fprintf (f
, "[%s, w%d, sxtw]", reg_names
[REGNO (addr
.base
)],
8884 REGNO (addr
.offset
) - R0_REGNUM
);
8886 asm_fprintf (f
, "[%s, w%d, sxtw %u]", reg_names
[REGNO (addr
.base
)],
8887 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
8890 case ADDRESS_REG_WB
:
8891 /* Writeback is only supported for fixed-width modes. */
8892 size
= GET_MODE_SIZE (mode
).to_constant ();
8893 switch (GET_CODE (x
))
8896 asm_fprintf (f
, "[%s, %d]!", reg_names
[REGNO (addr
.base
)], size
);
8899 asm_fprintf (f
, "[%s], %d", reg_names
[REGNO (addr
.base
)], size
);
8902 asm_fprintf (f
, "[%s, -%d]!", reg_names
[REGNO (addr
.base
)], size
);
8905 asm_fprintf (f
, "[%s], -%d", reg_names
[REGNO (addr
.base
)], size
);
8908 asm_fprintf (f
, "[%s, %wd]!", reg_names
[REGNO (addr
.base
)],
8909 INTVAL (addr
.offset
));
8912 asm_fprintf (f
, "[%s], %wd", reg_names
[REGNO (addr
.base
)],
8913 INTVAL (addr
.offset
));
8920 case ADDRESS_LO_SUM
:
8921 asm_fprintf (f
, "[%s, #:lo12:", reg_names
[REGNO (addr
.base
)]);
8922 output_addr_const (f
, addr
.offset
);
8923 asm_fprintf (f
, "]");
8926 case ADDRESS_SYMBOLIC
:
8927 output_addr_const (f
, x
);
8934 /* Print address 'x' of a memory access with mode 'mode'. */
8936 aarch64_print_operand_address (FILE *f
, machine_mode mode
, rtx x
)
8938 if (!aarch64_print_address_internal (f
, mode
, x
, ADDR_QUERY_ANY
))
8939 output_addr_const (f
, x
);
8943 aarch64_label_mentioned_p (rtx x
)
8948 if (GET_CODE (x
) == LABEL_REF
)
8951 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8952 referencing instruction, but they are constant offsets, not
8954 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
8957 fmt
= GET_RTX_FORMAT (GET_CODE (x
));
8958 for (i
= GET_RTX_LENGTH (GET_CODE (x
)) - 1; i
>= 0; i
--)
8964 for (j
= XVECLEN (x
, i
) - 1; j
>= 0; j
--)
8965 if (aarch64_label_mentioned_p (XVECEXP (x
, i
, j
)))
8968 else if (fmt
[i
] == 'e' && aarch64_label_mentioned_p (XEXP (x
, i
)))
8975 /* Implement REGNO_REG_CLASS. */
8978 aarch64_regno_regclass (unsigned regno
)
8980 if (GP_REGNUM_P (regno
))
8981 return GENERAL_REGS
;
8983 if (regno
== SP_REGNUM
)
8986 if (regno
== FRAME_POINTER_REGNUM
8987 || regno
== ARG_POINTER_REGNUM
)
8988 return POINTER_REGS
;
8990 if (FP_REGNUM_P (regno
))
8991 return (FP_LO8_REGNUM_P (regno
) ? FP_LO8_REGS
8992 : FP_LO_REGNUM_P (regno
) ? FP_LO_REGS
: FP_REGS
);
8994 if (PR_REGNUM_P (regno
))
8995 return PR_LO_REGNUM_P (regno
) ? PR_LO_REGS
: PR_HI_REGS
;
/* OFFSET is an address offset for mode MODE, which has SIZE bytes.
   If OFFSET is out of range, return an offset of an anchor point
   that is in range.  Return 0 otherwise.  */

static HOST_WIDE_INT
aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
		       machine_mode mode)
{
  /* Does it look like we'll need a 16-byte load/store-pair operation?  */
  if (size > 16)
    return (offset + 0x400) & ~0x7f0;

  /* For offsets that aren't a multiple of the access size, the limit is
     -256...255.  */
  if (offset & (size - 1))
    {
      /* BLKmode typically uses LDP of X-registers.  */
      if (mode == BLKmode)
	return (offset + 512) & ~0x3ff;
      return (offset + 0x100) & ~0x1ff;
    }

  /* Small negative offsets are supported.  */
  if (IN_RANGE (offset, -256, 0))
    return 0;

  /* TImode/TFmode use LDP/STP, which has a smaller immediate range.  */
  if (mode == TImode || mode == TFmode)
    return (offset + 0x100) & ~0x1ff;

  /* Use 12-bit offset by access size.  */
  return offset & (~0xfff * size);
}
/* Implement TARGET_LEGITIMIZE_ADDRESS.  Rewrite X (an address for a
   MODE access) into a form the hardware addressing modes can handle,
   returning the (possibly unchanged) address.  */
static rtx
aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
{
  /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
     where mask is selected by alignment and size of the offset.
     We try to pick as large a range for the offset as possible to
     maximize the chance of a CSE.  However, for aligned addresses
     we limit the range to 4k so that structures with different sized
     elements are likely to use the same base.  We need to be careful
     not to split a CONST for some forms of address expression, otherwise
     it will generate sub-optimal code.  */

  if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
    {
      rtx base = XEXP (x, 0);
      rtx offset_rtx = XEXP (x, 1);
      HOST_WIDE_INT offset = INTVAL (offset_rtx);

      if (GET_CODE (base) == PLUS)
	{
	  rtx op0 = XEXP (base, 0);
	  rtx op1 = XEXP (base, 1);

	  /* Force any scaling into a temp for CSE.  */
	  op0 = force_reg (Pmode, op0);
	  op1 = force_reg (Pmode, op1);

	  /* Let the pointer register be in op0.  */
	  if (REG_POINTER (op1))
	    std::swap (op0, op1);

	  /* If the pointer is virtual or frame related, then we know that
	     virtual register instantiation or register elimination is going
	     to apply a second constant.  We want the two constants folded
	     together easily.  Therefore, emit as (OP0 + CONST) + OP1.  */
	  if (virt_or_elim_regno_p (REGNO (op0)))
	    {
	      base = expand_binop (Pmode, add_optab, op0, offset_rtx,
				   NULL_RTX, true, OPTAB_DIRECT);
	      return gen_rtx_PLUS (Pmode, base, op1);
	    }

	  /* Otherwise, in order to encourage CSE (and thence loop strength
	     reduce) scaled addresses, emit as (OP0 + OP1) + CONST.  */
	  base = expand_binop (Pmode, add_optab, op0, op1,
			       NULL_RTX, true, OPTAB_DIRECT);
	  x = gen_rtx_PLUS (Pmode, base, offset_rtx);
	}

      HOST_WIDE_INT size;
      /* Only constant-sized accesses get anchor-point splitting;
	 variable-length SVE modes are left untouched here.  */
      if (GET_MODE_SIZE (mode).is_constant (&size))
	{
	  HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
							     mode);
	  if (base_offset != 0)
	    {
	      base = plus_constant (Pmode, base, base_offset);
	      base = force_operand (base, NULL_RTX);
	      return plus_constant (Pmode, base, offset - base_offset);
	    }
	}
    }

  return x;
}
/* Implement TARGET_SECONDARY_RELOAD.  Return the class needed as an
   intermediate step when copying X (of mode MODE) into or out of a
   register of class RCLASS, possibly setting SRI->icode to a reload
   pattern that performs the copy via a scratch.  */
static reg_class_t
aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
			  reg_class_t rclass,
			  machine_mode mode,
			  secondary_reload_info *sri)
{
  /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
     directly by the *aarch64_sve_mov<mode>_be move pattern.  See the
     comment at the head of aarch64-sve.md for more details about the
     big-endian handling.  */
  if (BYTES_BIG_ENDIAN
      && reg_class_subset_p (rclass, FP_REGS)
      && !((REG_P (x) && HARD_REGISTER_P (x))
	   || aarch64_simd_valid_immediate (x, NULL))
      && aarch64_sve_data_mode_p (mode))
    {
      sri->icode = CODE_FOR_aarch64_sve_reload_be;
      return NO_REGS;
    }

  /* If we have to disable direct literal pool loads and stores because the
     function is too big, then we need a scratch register.
     NOTE(review): MEM_P (x) and GET_CODE (x) == SYMBOL_REF cannot both
     hold for the same rtx, so this condition looks unsatisfiable —
     verify against upstream whether XEXP (x, 0) was intended.  */
  if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
      && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
	  || targetm.vector_mode_supported_p (GET_MODE (x)))
      && !aarch64_pcrelative_literal_loads)
    {
      sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
      return NO_REGS;
    }

  /* Without the TARGET_SIMD instructions we cannot move a Q register
     to a Q register directly.  We need a scratch.  */
  if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
      && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
      && reg_class_subset_p (rclass, FP_REGS))
    {
      sri->icode = code_for_aarch64_reload_mov (mode);
      return NO_REGS;
    }

  /* A TFmode or TImode memory access should be handled via an FP_REGS
     because AArch64 has richer addressing modes for LDR/STR instructions
     than LDP/STP instructions.  */
  if (TARGET_FLOAT && rclass == GENERAL_REGS
      && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
    return FP_REGS;

  /* 128-bit constants cannot be loaded directly into FP registers;
     route them through the general registers instead.  */
  if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
      return GENERAL_REGS;

  return NO_REGS;
}
/* Implement CAN_ELIMINATE.  Return true if register FROM may be
   eliminated in favor of register TO in this function.  */
bool
aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
{
  /* Only the soft arg/frame pointers are ever elimination sources.  */
  gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);

  /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
     can only eliminate to HARD_FRAME_POINTER_REGNUM.  */
  if (frame_pointer_needed)
    return to == HARD_FRAME_POINTER_REGNUM;
  return true;
}
/* Implement INITIAL_ELIMINATION_OFFSET.  Return the distance (in the
   direction of stack growth) between eliminated register FROM and its
   replacement TO, read from the current function's frame layout.  */
poly_int64
aarch64_initial_elimination_offset (unsigned from, unsigned to)
{
  if (to == HARD_FRAME_POINTER_REGNUM)
    {
      if (from == ARG_POINTER_REGNUM)
	return cfun->machine->frame.hard_fp_offset;

      if (from == FRAME_POINTER_REGNUM)
	return cfun->machine->frame.hard_fp_offset
	       - cfun->machine->frame.locals_offset;
    }

  if (to == STACK_POINTER_REGNUM)
    {
      if (from == FRAME_POINTER_REGNUM)
	return cfun->machine->frame.frame_size
	       - cfun->machine->frame.locals_offset;
    }

  /* Remaining case: ARG_POINTER_REGNUM -> STACK_POINTER_REGNUM, i.e.
     the full frame size.  */
  return cfun->machine->frame.frame_size;
}
/* Implement RETURN_ADDR_RTX.  We do not support moving back to a
   previous frame: only COUNT == 0 (the current frame) is handled.  */

rtx
aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
{
  if (count != 0)
    return const0_rtx;
  /* The return address is the incoming value of the link register.  */
  return get_hard_reg_initial_val (Pmode, LR_REGNUM);
}
/* Implement TARGET_ASM_TRAMPOLINE_TEMPLATE.  Emit the assembly skeleton
   of a nested-function trampoline: load the target address and static
   chain from trailing literal words, then branch to the target.  */
static void
aarch64_asm_trampoline_template (FILE *f)
{
  /* Byte offsets of the two literal words relative to each LDR.  */
  int offset1 = 16;
  int offset2 = 20;

  if (aarch64_bti_enabled ())
    {
      /* Landing pad for indirect branches when BTI is in force; it
	 replaces the padding word, shifting the literals up by 4.  */
      asm_fprintf (f, "\thint\t34 // bti c\n");
      offset1 -= 4;
      offset2 -= 4;
    }

  if (TARGET_ILP32)
    {
      /* ILP32: pointers are 32 bits, so load W registers.  */
      asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
      asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
		   offset2);
    }
  else
    {
      asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
      asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
		   offset2);
    }
  asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);

  /* The trampoline needs an extra padding instruction.  In case if BTI is
     enabled the padding instruction is replaced by the BTI instruction at
     the beginning.  */
  if (!aarch64_bti_enabled ())
    assemble_aligned_integer (4, const0_rtx);

  /* Placeholder words for the function address and static chain,
     filled in by aarch64_trampoline_init.  */
  assemble_aligned_integer (POINTER_BYTES, const0_rtx);
  assemble_aligned_integer (POINTER_BYTES, const0_rtx);
}
/* Implement TARGET_TRAMPOLINE_INIT.  Copy the trampoline template into
   M_TRAMP, store FNDECL's address and CHAIN_VALUE into its literal
   slots, then flush the instruction cache over the trampoline.  */
static void
aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
{
  rtx fnaddr, mem, a_tramp;
  /* Size in bytes of the code portion emitted by
     aarch64_asm_trampoline_template; the literals follow it.  */
  const int tramp_code_sz = 16;

  /* Don't need to copy the trailing D-words, we fill those in below.  */
  emit_block_move (m_tramp, assemble_trampoline_template (),
		   GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
  mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
  fnaddr = XEXP (DECL_RTL (fndecl), 0);
  if (GET_MODE (fnaddr) != ptr_mode)
    fnaddr = convert_memory_address (ptr_mode, fnaddr);
  emit_move_insn (mem, fnaddr);

  mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
  emit_move_insn (mem, chain_value);

  /* XXX We should really define a "clear_cache" pattern and use
     gen_clear_cache().  */
  a_tramp = XEXP (m_tramp, 0);
  emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
		     LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
		     plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
		     ptr_mode);
}
/* Implement TARGET_CLASS_MAX_NREGS.  Return the maximum number of
   registers of class REGCLASS needed to hold a value of mode MODE.  */
static unsigned char
aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
{
  /* ??? Logically we should only need to provide a value when
     HARD_REGNO_MODE_OK says that at least one register in REGCLASS
     can hold MODE, but at the moment we need to handle all modes.
     Just ignore any runtime parts for registers that can't store them.  */
  HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
  unsigned int nregs;
  switch (regclass)
    {
    case TAILCALL_ADDR_REGS:
    case POINTER_REGS:
    case GENERAL_REGS:
    case ALL_REGS:
    case POINTER_AND_FP_REGS:
    case FP_REGS:
    case FP_LO_REGS:
    case FP_LO8_REGS:
      /* SVE data modes occupy a whole number of SVE vector registers.  */
      if (aarch64_sve_data_mode_p (mode)
	  && constant_multiple_p (GET_MODE_SIZE (mode),
				  BYTES_PER_SVE_VECTOR, &nregs))
	return nregs;
      return (aarch64_vector_data_mode_p (mode)
	      ? CEIL (lowest_size, UNITS_PER_VREG)
	      : CEIL (lowest_size, UNITS_PER_WORD));
    case STACK_REG:
    case PR_REGS:
    case PR_LO_REGS:
    case PR_HI_REGS:
      return 1;

    case NO_REGS:
      return 0;

    default:
      break;
    }
  gcc_unreachable ();
}
/* Implement TARGET_PREFERRED_RELOAD_CLASS.  Return the class actually
   to use when X needs reloading into a register of class REGCLASS.  */
static reg_class_t
aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
{
  /* POINTER_REGS includes SP, which cannot be a reload destination;
     narrow to the general registers.  */
  if (regclass == POINTER_REGS)
    return GENERAL_REGS;

  if (regclass == STACK_REG)
    {
      if (REG_P (x)
	  && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
	  return regclass;

      return NO_REGS;
    }

  /* Register elimination can result in a request for
     SP+constant->FP_REGS.  We cannot support such operations which
     use SP as source and an FP_REG as destination, so reject out
     right now.  */
  if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
    {
      rtx lhs = XEXP (x, 0);

      /* Look through a possible SUBREG introduced by ILP32.  */
      if (GET_CODE (lhs) == SUBREG)
	lhs = SUBREG_REG (lhs);

      gcc_assert (REG_P (lhs));
      gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
				      POINTER_REGS));
      return NO_REGS;
    }

  return regclass;
}
/* Output the reference to label NAME to file F, applying the target's
   user-label prefix (%U).  */
void
aarch64_asm_output_labelref (FILE* f, const char *name)
{
  asm_fprintf (f, "%U%s", name);
}
/* Emit SYMBOL as a static constructor.  Default-priority constructors
   go to the common .init_array; prioritized ones get their own
   .init_array.NNNNN section so the linker orders them.  */
static void
aarch64_elf_asm_constructor (rtx symbol, int priority)
{
  if (priority == DEFAULT_INIT_PRIORITY)
    default_ctor_section_asm_out_constructor (symbol, priority);
  else
    {
      section *s;
      /* While priority is known to be in range [0, 65535], so 18 bytes
	 would be enough, the compiler might not know that.  To avoid
	 -Wformat-truncation false positive, use a larger size.  */
      char buf[23];
      snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
      s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
      switch_to_section (s);
      assemble_align (POINTER_SIZE);
      assemble_aligned_integer (POINTER_BYTES, symbol);
    }
}
/* Emit SYMBOL as a static destructor; mirror of
   aarch64_elf_asm_constructor using .fini_array sections.  */
static void
aarch64_elf_asm_destructor (rtx symbol, int priority)
{
  if (priority == DEFAULT_INIT_PRIORITY)
    default_dtor_section_asm_out_destructor (symbol, priority);
  else
    {
      section *s;
      /* While priority is known to be in range [0, 65535], so 18 bytes
	 would be enough, the compiler might not know that.  To avoid
	 -Wformat-truncation false positive, use a larger size.  */
      char buf[23];
      snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
      s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
      switch_to_section (s);
      assemble_align (POINTER_SIZE);
      assemble_aligned_integer (POINTER_BYTES, symbol);
    }
}
/* Output the assembly for a casesi jump-table dispatch: load the table
   entry indexed by operand 1, scale it, and branch.  OPERANDS are the
   operands of the casesi pattern; operand 2 is the table label insn.  */
const char *
aarch64_output_casesi (rtx *operands)
{
  char label[100];
  char buf[144];
  rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
  int index;
  /* Load/add pairs indexed by log2 of the table element size
     (1, 2, 4 and 8 bytes).  */
  static const char *const patterns[4][2] =
  {
    {
      "ldrb\t%w3, [%0,%w1,uxtw]",
      "add\t%3, %4, %w3, sxtb #2"
    },
    {
      "ldrh\t%w3, [%0,%w1,uxtw #1]",
      "add\t%3, %4, %w3, sxth #2"
    },
    {
      "ldr\t%w3, [%0,%w1,uxtw #2]",
      "add\t%3, %4, %w3, sxtw #2"
    },
    /* We assume that DImode is only generated when not optimizing and
       that we don't really need 64-bit address offsets.  That would
       imply an object file with 8GB of code in a single function!  */
    {
      "ldr\t%w3, [%0,%w1,uxtw #2]",
      "add\t%3, %4, %w3, sxtw #2"
    }
  };

  gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);

  scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
  index = exact_log2 (GET_MODE_SIZE (mode));

  gcc_assert (index >= 0 && index <= 3);

  /* Need to implement table size reduction, by changing the code below.  */
  output_asm_insn (patterns[index][0], operands);
  ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
  snprintf (buf, sizeof (buf),
	    "adr\t%%4, %s", targetm.strip_name_encoding (label));
  output_asm_insn (buf, operands);
  output_asm_insn (patterns[index][1], operands);
  output_asm_insn ("br\t%3", operands);
  assemble_label (asm_out_file, label);
  return "";
}
9437 /* Return size in bits of an arithmetic operand which is shifted/scaled and
9438 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
9442 aarch64_uxt_size (int shift
, HOST_WIDE_INT mask
)
9444 if (shift
>= 0 && shift
<= 3)
9447 for (size
= 8; size
<= 32; size
*= 2)
9449 HOST_WIDE_INT bits
= ((HOST_WIDE_INT
)1U << size
) - 1;
9450 if (mask
== bits
<< shift
)
/* Constant pools are per function only when PC relative
   literal loads are true or we are in the large memory
   model.  */

static inline bool
aarch64_can_use_per_function_literal_pools_p (void)
{
  return (aarch64_pcrelative_literal_loads
	  || aarch64_cmodel == AARCH64_CMODEL_LARGE);
}
/* Implement TARGET_USE_BLOCKS_FOR_CONSTANT_P.  */
static bool
aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
{
  /* We can't use blocks for constants when we're using a per-function
     literal pool.  */
  return !aarch64_can_use_per_function_literal_pools_p ();
}
/* Select appropriate section for constants depending
   on where we place literal pools.  */

static section *
aarch64_select_rtx_section (machine_mode mode,
			    rtx x,
			    unsigned HOST_WIDE_INT align)
{
  /* With per-function pools the constant lives next to its function.  */
  if (aarch64_can_use_per_function_literal_pools_p ())
    return function_section (current_function_decl);

  return default_elf_select_rtx_section (mode, x, align);
}
/* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
void
aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
				  HOST_WIDE_INT offset)
{
  /* When using per-function literal pools, we must ensure that any code
     section is aligned to the minimal instruction length, lest we get
     errors from the assembler re "unaligned instructions".  */
  if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
    ASM_OUTPUT_ALIGN (f, 2);
}
/* Helper function for rtx cost calculation.  Strip a shift expression
   from X.  Returns the inner operand if successful, or the original
   expression on failure.  */
static rtx
aarch64_strip_shift (rtx x)
{
  rtx op = x;

  /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
     we can convert both to ROR during final output.  */
  if ((GET_CODE (op) == ASHIFT
       || GET_CODE (op) == ASHIFTRT
       || GET_CODE (op) == LSHIFTRT
       || GET_CODE (op) == ROTATERT
       || GET_CODE (op) == ROTATE)
      && CONST_INT_P (XEXP (op, 1)))
    return XEXP (op, 0);

  /* A multiply by a power of two is a shift in disguise; the unsigned
     compare also rejects exact_log2's -1 "not a power of two" result.  */
  if (GET_CODE (op) == MULT
      && CONST_INT_P (XEXP (op, 1))
      && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
    return XEXP (op, 0);

  return x;
}
/* Helper function for rtx cost calculation.  Strip an extend
   expression from X.  Returns the inner operand if successful, or the
   original expression on failure.  We deal with a number of possible
   canonicalization variations here.  If STRIP_SHIFT is true, then
   we can strip off a shift also.  */
static rtx
aarch64_strip_extend (rtx x, bool strip_shift)
{
  scalar_int_mode mode;
  rtx op = x;

  if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
    return op;

  /* Zero and sign extraction of a widened value.  */
  if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
      && XEXP (op, 2) == const0_rtx
      && GET_CODE (XEXP (op, 0)) == MULT
      && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
					 XEXP (op, 1)))
    return XEXP (XEXP (op, 0), 0);

  /* It can also be represented (for zero-extend) as an AND with an
     immediate.  */
  if (GET_CODE (op) == AND
      && GET_CODE (XEXP (op, 0)) == MULT
      && CONST_INT_P (XEXP (XEXP (op, 0), 1))
      && CONST_INT_P (XEXP (op, 1))
      && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
			   INTVAL (XEXP (op, 1))) != 0)
    return XEXP (XEXP (op, 0), 0);

  /* Now handle extended register, as this may also have an optional
     left shift by 1..4.  */
  if (strip_shift
      && GET_CODE (op) == ASHIFT
      && CONST_INT_P (XEXP (op, 1))
      && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
    op = XEXP (op, 0);

  if (GET_CODE (op) == ZERO_EXTEND
      || GET_CODE (op) == SIGN_EXTEND)
    op = XEXP (op, 0);

  /* Only return the stripped form if something was actually removed.  */
  if (op != x)
    return op;

  return x;
}
9580 /* Return true iff CODE is a shift supported in combination
9581 with arithmetic instructions. */
9584 aarch64_shift_p (enum rtx_code code
)
9586 return code
== ASHIFT
|| code
== ASHIFTRT
|| code
== LSHIFTRT
;
/* Return true iff X is a cheap shift without a sign extend.  */

static bool
aarch64_cheap_mult_shift_p (rtx x)
{
  rtx op0, op1;

  op0 = XEXP (x, 0);
  op1 = XEXP (x, 1);

  /* Only relevant on cores that advertise cheap shift/extend fusion.  */
  if (!(aarch64_tune_params.extra_tuning_flags
	& AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
    return false;

  /* Sign-extended operands are never cheap here.  */
  if (GET_CODE (op0) == SIGN_EXTEND)
    return false;

  /* An explicit left shift by an immediate of at most 4.  */
  if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
      && UINTVAL (op1) <= 4)
    return true;

  if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
    return false;

  /* A multiply by 2, 4, 8 or 16 is the same small shift.  */
  HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));

  if (l2 > 0 && l2 <= 4)
    return true;

  return false;
}
/* Helper function for rtx cost calculation.  Calculate the cost of
   a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
   Return the calculated cost of the expression, recursing manually in to
   operands where needed.  */

static int
aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
{
  rtx op0, op1;
  const struct cpu_cost_table *extra_cost
    = aarch64_tune_params.insn_extra_cost;
  int cost = 0;
  /* OUTER being PLUS/MINUS means X may fuse into MADD/MSUB/FMA forms.  */
  bool compound_p = (outer == PLUS || outer == MINUS);
  machine_mode mode = GET_MODE (x);

  gcc_checking_assert (code == MULT);

  op0 = XEXP (x, 0);
  op1 = XEXP (x, 1);

  /* Cost vector multiplies per element.  */
  if (VECTOR_MODE_P (mode))
    mode = GET_MODE_INNER (mode);

  /* Integer multiply/fma.  */
  if (GET_MODE_CLASS (mode) == MODE_INT)
    {
      /* The multiply will be canonicalized as a shift, cost it as such.  */
      if (aarch64_shift_p (GET_CODE (x))
	  || (CONST_INT_P (op1)
	      && exact_log2 (INTVAL (op1)) > 0))
	{
	  bool is_extend = GET_CODE (op0) == ZERO_EXTEND
	                   || GET_CODE (op0) == SIGN_EXTEND;
	  if (speed)
	    {
	      if (compound_p)
	        {
		  /* If the shift is considered cheap,
		     then don't add any cost. */
		  if (aarch64_cheap_mult_shift_p (x))
		    ;
		  else if (REG_P (op1))
		    /* ARITH + shift-by-register.  */
		    cost += extra_cost->alu.arith_shift_reg;
		  else if (is_extend)
		    /* ARITH + extended register.  We don't have a cost field
		       for ARITH+EXTEND+SHIFT, so use extend_arith here.  */
		    cost += extra_cost->alu.extend_arith;
		  else
		    /* ARITH + shift-by-immediate.  */
		    cost += extra_cost->alu.arith_shift;
		}
	      else
		/* LSL (immediate).  */
		cost += extra_cost->alu.shift;
	    }

	  /* Strip extends as we will have costed them in the case above.  */
	  if (is_extend)
	    op0 = aarch64_strip_extend (op0, true);

	  cost += rtx_cost (op0, VOIDmode, code, 0, speed);

	  return cost;
	}

      /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's a
	 compound and let the below cases handle it.  After all, MNEG is a
	 special-case alias of MSUB.  */
      if (GET_CODE (op0) == NEG)
	{
	  op0 = XEXP (op0, 0);
	  compound_p = true;
	}

      /* Integer multiplies or FMAs have zero/sign extending variants.  */
      if ((GET_CODE (op0) == ZERO_EXTEND
	   && GET_CODE (op1) == ZERO_EXTEND)
	  || (GET_CODE (op0) == SIGN_EXTEND
	      && GET_CODE (op1) == SIGN_EXTEND))
	{
	  cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
	  cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);

	  if (speed)
	    {
	      if (compound_p)
		/* SMADDL/UMADDL/UMSUBL/SMSUBL.  */
		cost += extra_cost->mult[0].extend_add;
	      else
		/* MUL/SMULL/UMULL.  */
		cost += extra_cost->mult[0].extend;
	    }

	  return cost;
	}

      /* This is either an integer multiply or a MADD.  In both cases
	 we want to recurse and cost the operands.  */
      cost += rtx_cost (op0, mode, MULT, 0, speed);
      cost += rtx_cost (op1, mode, MULT, 1, speed);

      if (speed)
	{
	  if (compound_p)
	    /* MADD/MSUB.  */
	    cost += extra_cost->mult[mode == DImode].add;
	  else
	    /* MUL.  */
	    cost += extra_cost->mult[mode == DImode].simple;
	}

      return cost;
    }
  else
    {
      if (speed)
	{
	  /* Floating-point FMA/FMUL can also support negations of the
	     operands, unless the rounding mode is upward or downward in
	     which case FNMUL is different than FMUL with operand negation.  */
	  bool neg0 = GET_CODE (op0) == NEG;
	  bool neg1 = GET_CODE (op1) == NEG;
	  if (compound_p || !flag_rounding_math || (neg0 && neg1))
	    {
	      if (neg0)
		op0 = XEXP (op0, 0);
	      if (neg1)
		op1 = XEXP (op1, 0);
	    }

	  if (compound_p)
	    /* FMADD/FNMADD/FNMSUB/FMSUB.  */
	    cost += extra_cost->fp[mode == DFmode].fma;
	  else
	    /* FMUL/FNMUL.  */
	    cost += extra_cost->fp[mode == DFmode].mult;
	}

      cost += rtx_cost (op0, mode, MULT, 0, speed);
      cost += rtx_cost (op1, mode, MULT, 1, speed);
      return cost;
    }
}
/* Implement TARGET_ADDRESS_COST.  Return the relative cost of address X
   for an access of mode MODE, using the tuning target's address cost
   table.  */
static int
aarch64_address_cost (rtx x,
		      machine_mode mode,
		      addr_space_t as ATTRIBUTE_UNUSED,
		      bool speed)
{
  enum rtx_code c = GET_CODE (x);
  const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
  struct aarch64_address_info info;
  int cost = 0;
  info.shift = 0;

  if (!aarch64_classify_address (&info, x, mode, false))
    {
      if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
	{
	  /* This is a CONST or SYMBOL ref which will be split
	     in a different way depending on the code model in use.
	     Cost it through the generic infrastructure.  */
	  int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
	  /* Divide through by the cost of one instruction to
	     bring it to the same units as the address costs.  */
	  cost_symbol_ref /= COSTS_N_INSNS (1);
	  /* The cost is then the cost of preparing the address,
	     followed by an immediate (possibly 0) offset.  */
	  return cost_symbol_ref + addr_cost->imm_offset;
	}
      else
	{
	  /* This is most likely a jump table from a case
	     statement.  */
	  return addr_cost->register_offset;
	}
    }

  switch (info.type)
    {
      case ADDRESS_LO_SUM:
      case ADDRESS_SYMBOLIC:
      case ADDRESS_REG_IMM:
	cost += addr_cost->imm_offset;
	break;

      case ADDRESS_REG_WB:
	if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
	  cost += addr_cost->pre_modify;
	else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
	  cost += addr_cost->post_modify;
	else
	  gcc_unreachable ();

	break;

      case ADDRESS_REG_REG:
	cost += addr_cost->register_offset;
	break;

      case ADDRESS_REG_SXTW:
	cost += addr_cost->register_sextend;
	break;

      case ADDRESS_REG_UXTW:
	cost += addr_cost->register_zextend;
	break;

      default:
	gcc_unreachable ();
    }

  if (info.shift > 0)
    {
      /* For the sake of calculating the cost of the shifted register
	 component, we can treat same sized modes in the same way.  */
      if (known_eq (GET_MODE_BITSIZE (mode), 16))
	cost += addr_cost->addr_scale_costs.hi;
      else if (known_eq (GET_MODE_BITSIZE (mode), 32))
	cost += addr_cost->addr_scale_costs.si;
      else if (known_eq (GET_MODE_BITSIZE (mode), 64))
	cost += addr_cost->addr_scale_costs.di;
      else
	/* We can't tell, or this is a 128-bit vector.  */
	cost += addr_cost->addr_scale_costs.ti;
    }

  return cost;
}
/* Return the cost of a branch.  If SPEED_P is true then the compiler is
   optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
   to be taken.  */

static int
aarch64_branch_cost (bool speed_p, bool predictable_p)
{
  /* When optimizing for speed, use the cost of unpredictable branches.  */
  const struct cpu_branch_cost *branch_costs =
    aarch64_tune_params.branch_costs;

  if (!speed_p || predictable_p)
    return branch_costs->predictable;
  else
    return branch_costs->unpredictable;
}
/* Return true if the RTX X in mode MODE is a zero or sign extract
   usable in an ADD or SUB (extended register) instruction.  */
static bool
aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
{
  /* Catch add with a sign extract.
     This is add_<optab><mode>_multp2.  */
  if (GET_CODE (x) == SIGN_EXTRACT
      || GET_CODE (x) == ZERO_EXTRACT)
    {
      rtx op0 = XEXP (x, 0);
      rtx op1 = XEXP (x, 1);
      rtx op2 = XEXP (x, 2);

      if (GET_CODE (op0) == MULT
	  && CONST_INT_P (op1)
	  && op2 == const0_rtx
	  && CONST_INT_P (XEXP (op0, 1))
	  && aarch64_is_extend_from_extract (mode,
					     XEXP (op0, 1),
					     op1))
	return true;
    }
  /* The simple case <ARITH>, XD, XN, XM, [us]xt.
     No shift.  */
  else if (GET_CODE (x) == SIGN_EXTEND
	   || GET_CODE (x) == ZERO_EXTEND)
    return REG_P (XEXP (x, 0));

  return false;
}
/* Return true if U is one of the UNSPEC codes that map to an FRINT
   rounding instruction.
   NOTE(review): the body of this function was lost in extraction and is
   reconstructed from the visible signature plus upstream GCC — confirm
   the exact UNSPEC list against the original file.  */
static bool
aarch64_frint_unspec_p (unsigned int u)
{
  switch (u)
    {
      case UNSPEC_FRINTZ:
      case UNSPEC_FRINTI:
      case UNSPEC_FRINTM:
      case UNSPEC_FRINTA:
      case UNSPEC_FRINTN:
      case UNSPEC_FRINTX:
      case UNSPEC_FRINTP:
	return true;

      default:
	return false;
    }
}
/* Return true iff X is an rtx that will match an extr instruction
   i.e. as described in the *extr<mode>5_insn family of patterns.
   OP0 and OP1 will be set to the operands of the shifts involved
   on success and will be NULL_RTX otherwise.  */

static bool
aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
{
  rtx op0, op1;
  scalar_int_mode mode;
  if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
    return false;

  *res_op0 = NULL_RTX;
  *res_op1 = NULL_RTX;

  if (GET_CODE (x) != IOR)
    return false;

  op0 = XEXP (x, 0);
  op1 = XEXP (x, 1);

  /* EXTR is an IOR of a left shift and a right shift of the same value.  */
  if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
      || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
    {
      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
      if (GET_CODE (op1) == ASHIFT)
	std::swap (op0, op1);

      if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
	return false;

      unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
      unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));

      /* The two shift amounts must partition the register width.  */
      if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
	  && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
	{
	  *res_op0 = XEXP (op0, 0);
	  *res_op1 = XEXP (op1, 0);
	  return true;
	}
    }

  return false;
}
/* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
   storing it in *COST.  Result is true if the total cost of the operation
   has now been calculated.  */
static bool
aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
{
  rtx inner;
  rtx comparator;
  enum rtx_code cmpcode;

  /* Decompose the condition into the value tested and what it is
     compared against; a bare value means "compare against zero".  */
  if (COMPARISON_P (op0))
    {
      inner = XEXP (op0, 0);
      comparator = XEXP (op0, 1);
      cmpcode = GET_CODE (op0);
    }
  else
    {
      inner = op0;
      comparator = const0_rtx;
      cmpcode = NE;
    }

  if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
    {
      /* Conditional branch.  */
      if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
	return true;
      else
	{
	  if (cmpcode == NE || cmpcode == EQ)
	    {
	      if (comparator == const0_rtx)
		{
		  /* TBZ/TBNZ/CBZ/CBNZ.  */
		  if (GET_CODE (inner) == ZERO_EXTRACT)
		    /* TBZ/TBNZ.  */
		    *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
				       ZERO_EXTRACT, 0, speed);
		  else
		    /* CBZ/CBNZ.  */
		    *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);

		  return true;
		}
	    }
	  else if (cmpcode == LT || cmpcode == GE)
	    {
	      /* TBZ/TBNZ on the sign bit.  */
	      if (comparator == const0_rtx)
		return true;
	    }
	}
    }
  else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
    {
      /* CCMP.  */
      if (GET_CODE (op1) == COMPARE)
	{
	  /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0.  */
	  if (XEXP (op1, 1) == const0_rtx)
	    *cost += 1;
	  if (speed)
	    {
	      machine_mode mode = GET_MODE (XEXP (op1, 0));
	      const struct cpu_cost_table *extra_cost
		= aarch64_tune_params.insn_extra_cost;

	      if (GET_MODE_CLASS (mode) == MODE_INT)
		*cost += extra_cost->alu.arith;
	      else
		*cost += extra_cost->fp[mode == DFmode].compare;
	    }
	  return true;
	}

      /* It's a conditional operation based on the status flags,
	 so it must be some flavor of CSEL.  */

      /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
      if (GET_CODE (op1) == NEG
	  || GET_CODE (op1) == NOT
	  || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
	op1 = XEXP (op1, 0);
      else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
	{
	  /* CSEL with zero-extension (*cmovdi_insn_uxtw).  */
	  op1 = XEXP (op1, 0);
	  op2 = XEXP (op2, 0);
	}

      *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
      *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
      return true;
    }

  /* We don't know what this is, cost all operands.  */
  return false;
}
/* Check whether X is a bitfield operation of the form shift + extend that
   maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
   operand to which the bitfield operation is applied.  Otherwise return
   NULL_RTX.  */

static rtx
aarch64_extend_bitfield_pattern_p (rtx x)
{
  rtx_code outer_code = GET_CODE (x);
  machine_mode outer_mode = GET_MODE (x);

  /* NOTE(review): this rejects only when X is neither an extend NOR has
     SI/DI mode (all-&& condition) — verify against upstream that the
     intended predicate is not (!extend || !SI/DI-mode).  */
  if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
      && outer_mode != SImode && outer_mode != DImode)
    return NULL_RTX;

  rtx inner = XEXP (x, 0);
  rtx_code inner_code = GET_CODE (inner);
  machine_mode inner_mode = GET_MODE (inner);
  rtx op = NULL_RTX;

  switch (inner_code)
    {
      case ASHIFT:
	/* [SU]BFIZ: either extend of a shifted QI/HI value.  */
	if (CONST_INT_P (XEXP (inner, 1))
	    && (inner_mode == QImode || inner_mode == HImode))
	  op = XEXP (inner, 0);
	break;
      case LSHIFTRT:
	/* UBFX: zero-extend of a logical right shift.  */
	if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
	    && (inner_mode == QImode || inner_mode == HImode))
	  op = XEXP (inner, 0);
	break;
      case ASHIFTRT:
	/* SBFX: sign-extend of an arithmetic right shift.  */
	if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
	    && (inner_mode == QImode || inner_mode == HImode))
	  op = XEXP (inner, 0);
	break;
      default:
	break;
    }

  return op;
}
/* Return true if the mask and a shift amount from an RTX of the form
   (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
   mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */

bool
aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
				    rtx shft_amnt)
{
  /* Both operands must be immediates, the shift must stay inside the
     register, the shifted-down mask must be contiguous from bit 0
     (power-of-two minus one), and the mask must not cover any bits
     below the shift amount.  */
  return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
	 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
	 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
	 && (INTVAL (mask)
	     & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
}
/* Return true if the masks and a shift amount from an RTX of the form
   ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
   a BFI instruction of mode MODE.  See *arch64_bfi patterns.  */

bool
aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
				   unsigned HOST_WIDE_INT mask1,
				   unsigned HOST_WIDE_INT shft_amnt,
				   unsigned HOST_WIDE_INT mask2)
{
  unsigned HOST_WIDE_INT t;

  /* Verify that there is no overlap in what bits are set in the two masks.  */
  if (mask1 != ~mask2)
    return false;

  /* Verify that mask2 is not all zeros or ones.  */
  if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
    return false;

  /* The shift amount should always be less than the mode size.  */
  gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));

  /* Verify that the mask being shifted is contiguous and would be in the
     least significant bits after shifting by shft_amnt.  Adding the low
     bit of the field turns a contiguous run into a single power of two,
     which the t == (t & -t) test detects.  */
  t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
  return (t == (t & -t));
}
10160 /* Calculate the cost of calculating X, storing it in *COST. Result
10161 is true if the total cost of the operation has now been calculated. */
10163 aarch64_rtx_costs (rtx x
, machine_mode mode
, int outer ATTRIBUTE_UNUSED
,
10164 int param ATTRIBUTE_UNUSED
, int *cost
, bool speed
)
10167 const struct cpu_cost_table
*extra_cost
10168 = aarch64_tune_params
.insn_extra_cost
;
10169 int code
= GET_CODE (x
);
10170 scalar_int_mode int_mode
;
10172 /* By default, assume that everything has equivalent cost to the
10173 cheapest instruction. Any additional costs are applied as a delta
10174 above this default. */
10175 *cost
= COSTS_N_INSNS (1);
10180 /* The cost depends entirely on the operands to SET. */
10182 op0
= SET_DEST (x
);
10185 switch (GET_CODE (op0
))
10190 rtx address
= XEXP (op0
, 0);
10191 if (VECTOR_MODE_P (mode
))
10192 *cost
+= extra_cost
->ldst
.storev
;
10193 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10194 *cost
+= extra_cost
->ldst
.store
;
10195 else if (mode
== SFmode
)
10196 *cost
+= extra_cost
->ldst
.storef
;
10197 else if (mode
== DFmode
)
10198 *cost
+= extra_cost
->ldst
.stored
;
10201 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
10205 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
10209 if (! REG_P (SUBREG_REG (op0
)))
10210 *cost
+= rtx_cost (SUBREG_REG (op0
), VOIDmode
, SET
, 0, speed
);
10212 /* Fall through. */
10214 /* The cost is one per vector-register copied. */
10215 if (VECTOR_MODE_P (GET_MODE (op0
)) && REG_P (op1
))
10217 int nregs
= aarch64_hard_regno_nregs (V0_REGNUM
, GET_MODE (op0
));
10218 *cost
= COSTS_N_INSNS (nregs
);
10220 /* const0_rtx is in general free, but we will use an
10221 instruction to set a register to 0. */
10222 else if (REG_P (op1
) || op1
== const0_rtx
)
10224 /* The cost is 1 per register copied. */
10225 int nregs
= aarch64_hard_regno_nregs (R0_REGNUM
, GET_MODE (op0
));
10226 *cost
= COSTS_N_INSNS (nregs
);
10229 /* Cost is just the cost of the RHS of the set. */
10230 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
10235 /* Bit-field insertion. Strip any redundant widening of
10236 the RHS to meet the width of the target. */
10237 if (GET_CODE (op1
) == SUBREG
)
10238 op1
= SUBREG_REG (op1
);
10239 if ((GET_CODE (op1
) == ZERO_EXTEND
10240 || GET_CODE (op1
) == SIGN_EXTEND
)
10241 && CONST_INT_P (XEXP (op0
, 1))
10242 && is_a
<scalar_int_mode
> (GET_MODE (XEXP (op1
, 0)), &int_mode
)
10243 && GET_MODE_BITSIZE (int_mode
) >= INTVAL (XEXP (op0
, 1)))
10244 op1
= XEXP (op1
, 0);
10246 if (CONST_INT_P (op1
))
10248 /* MOV immediate is assumed to always be cheap. */
10249 *cost
= COSTS_N_INSNS (1);
10255 *cost
+= extra_cost
->alu
.bfi
;
10256 *cost
+= rtx_cost (op1
, VOIDmode
, (enum rtx_code
) code
, 1, speed
);
10262 /* We can't make sense of this, assume default cost. */
10263 *cost
= COSTS_N_INSNS (1);
10269 /* If an instruction can incorporate a constant within the
10270 instruction, the instruction's expression avoids calling
10271 rtx_cost() on the constant. If rtx_cost() is called on a
10272 constant, then it is usually because the constant must be
10273 moved into a register by one or more instructions.
10275 The exception is constant 0, which can be expressed
10276 as XZR/WZR and is therefore free. The exception to this is
10277 if we have (set (reg) (const0_rtx)) in which case we must cost
10278 the move. However, we can catch that when we cost the SET, so
10279 we don't need to consider that here. */
10280 if (x
== const0_rtx
)
10284 /* To an approximation, building any other constant is
10285 proportionally expensive to the number of instructions
10286 required to build that constant. This is true whether we
10287 are compiling for SPEED or otherwise. */
10288 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
10289 int_mode
= word_mode
;
10290 *cost
= COSTS_N_INSNS (aarch64_internal_mov_immediate
10291 (NULL_RTX
, x
, false, int_mode
));
10297 /* First determine number of instructions to do the move
10298 as an integer constant. */
10299 if (!aarch64_float_const_representable_p (x
)
10300 && !aarch64_can_const_movi_rtx_p (x
, mode
)
10301 && aarch64_float_const_rtx_p (x
))
10303 unsigned HOST_WIDE_INT ival
;
10304 bool succeed
= aarch64_reinterpret_float_as_int (x
, &ival
);
10305 gcc_assert (succeed
);
10307 scalar_int_mode imode
= (mode
== HFmode
10309 : int_mode_for_mode (mode
).require ());
10310 int ncost
= aarch64_internal_mov_immediate
10311 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
10312 *cost
+= COSTS_N_INSNS (ncost
);
10318 /* mov[df,sf]_aarch64. */
10319 if (aarch64_float_const_representable_p (x
))
10320 /* FMOV (scalar immediate). */
10321 *cost
+= extra_cost
->fp
[mode
== DFmode
].fpconst
;
10322 else if (!aarch64_float_const_zero_rtx_p (x
))
10324 /* This will be a load from memory. */
10325 if (mode
== DFmode
)
10326 *cost
+= extra_cost
->ldst
.loadd
;
10328 *cost
+= extra_cost
->ldst
.loadf
;
10331 /* Otherwise this is +0.0. We get this using MOVI d0, #0
10332 or MOV v0.s[0], wzr - neither of which are modeled by the
10333 cost tables. Just use the default cost. */
10343 /* For loads we want the base cost of a load, plus an
10344 approximation for the additional cost of the addressing
10346 rtx address
= XEXP (x
, 0);
10347 if (VECTOR_MODE_P (mode
))
10348 *cost
+= extra_cost
->ldst
.loadv
;
10349 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10350 *cost
+= extra_cost
->ldst
.load
;
10351 else if (mode
== SFmode
)
10352 *cost
+= extra_cost
->ldst
.loadf
;
10353 else if (mode
== DFmode
)
10354 *cost
+= extra_cost
->ldst
.loadd
;
10357 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
10366 if (VECTOR_MODE_P (mode
))
10371 *cost
+= extra_cost
->vect
.alu
;
10376 if (GET_MODE_CLASS (mode
) == MODE_INT
)
10378 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
10379 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
10382 *cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, NEG
, 0, speed
);
10386 /* Cost this as SUB wzr, X. */
10387 op0
= CONST0_RTX (mode
);
10392 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
10394 /* Support (neg(fma...)) as a single instruction only if
10395 sign of zeros is unimportant. This matches the decision
10396 making in aarch64.md. */
10397 if (GET_CODE (op0
) == FMA
&& !HONOR_SIGNED_ZEROS (GET_MODE (op0
)))
10400 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
10403 if (GET_CODE (op0
) == MULT
)
10406 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
10411 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
10421 if (VECTOR_MODE_P (mode
))
10422 *cost
+= extra_cost
->vect
.alu
;
10424 *cost
+= extra_cost
->alu
.clz
;
10433 if (op1
== const0_rtx
10434 && GET_CODE (op0
) == AND
)
10437 mode
= GET_MODE (op0
);
10441 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
)
10443 /* TODO: A write to the CC flags possibly costs extra, this
10444 needs encoding in the cost tables. */
10446 mode
= GET_MODE (op0
);
10448 if (GET_CODE (op0
) == AND
)
10454 if (GET_CODE (op0
) == PLUS
)
10456 /* ADDS (and CMN alias). */
10461 if (GET_CODE (op0
) == MINUS
)
10468 if (GET_CODE (op0
) == ZERO_EXTRACT
&& op1
== const0_rtx
10469 && GET_MODE (x
) == CC_NZmode
&& CONST_INT_P (XEXP (op0
, 1))
10470 && CONST_INT_P (XEXP (op0
, 2)))
10472 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
10473 Handle it here directly rather than going to cost_logic
10474 since we know the immediate generated for the TST is valid
10475 so we can avoid creating an intermediate rtx for it only
10476 for costing purposes. */
10478 *cost
+= extra_cost
->alu
.logical
;
10480 *cost
+= rtx_cost (XEXP (op0
, 0), GET_MODE (op0
),
10481 ZERO_EXTRACT
, 0, speed
);
10485 if (GET_CODE (op1
) == NEG
)
10489 *cost
+= extra_cost
->alu
.arith
;
10491 *cost
+= rtx_cost (op0
, mode
, COMPARE
, 0, speed
);
10492 *cost
+= rtx_cost (XEXP (op1
, 0), mode
, NEG
, 1, speed
);
10498 Compare can freely swap the order of operands, and
10499 canonicalization puts the more complex operation first.
10500 But the integer MINUS logic expects the shift/extend
10501 operation in op1. */
10503 || (GET_CODE (op0
) == SUBREG
&& REG_P (SUBREG_REG (op0
)))))
10511 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_FLOAT
)
10515 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
10517 if (CONST_DOUBLE_P (op1
) && aarch64_float_const_zero_rtx_p (op1
))
10519 *cost
+= rtx_cost (op0
, VOIDmode
, COMPARE
, 0, speed
);
10520 /* FCMP supports constant 0.0 for no extra cost. */
10526 if (VECTOR_MODE_P (mode
))
10528 /* Vector compare. */
10530 *cost
+= extra_cost
->vect
.alu
;
10532 if (aarch64_float_const_zero_rtx_p (op1
))
10534 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
10548 *cost
+= rtx_cost (op0
, mode
, MINUS
, 0, speed
);
10550 /* Detect valid immediates. */
10551 if ((GET_MODE_CLASS (mode
) == MODE_INT
10552 || (GET_MODE_CLASS (mode
) == MODE_CC
10553 && GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
))
10554 && CONST_INT_P (op1
)
10555 && aarch64_uimm12_shift (INTVAL (op1
)))
10558 /* SUB(S) (immediate). */
10559 *cost
+= extra_cost
->alu
.arith
;
10563 /* Look for SUB (extended register). */
10564 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
10565 && aarch64_rtx_arith_op_extract_p (op1
, int_mode
))
10568 *cost
+= extra_cost
->alu
.extend_arith
;
10570 op1
= aarch64_strip_extend (op1
, true);
10571 *cost
+= rtx_cost (op1
, VOIDmode
,
10572 (enum rtx_code
) GET_CODE (op1
), 0, speed
);
10576 rtx new_op1
= aarch64_strip_extend (op1
, false);
10578 /* Cost this as an FMA-alike operation. */
10579 if ((GET_CODE (new_op1
) == MULT
10580 || aarch64_shift_p (GET_CODE (new_op1
)))
10581 && code
!= COMPARE
)
10583 *cost
+= aarch64_rtx_mult_cost (new_op1
, MULT
,
10584 (enum rtx_code
) code
,
10589 *cost
+= rtx_cost (new_op1
, VOIDmode
, MINUS
, 1, speed
);
10593 if (VECTOR_MODE_P (mode
))
10596 *cost
+= extra_cost
->vect
.alu
;
10598 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10601 *cost
+= extra_cost
->alu
.arith
;
10603 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
10606 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
10620 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
10621 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
10624 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, PLUS
, 0, speed
);
10625 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
10629 if (GET_MODE_CLASS (mode
) == MODE_INT
10630 && ((CONST_INT_P (op1
) && aarch64_uimm12_shift (INTVAL (op1
)))
10631 || aarch64_sve_addvl_addpl_immediate (op1
, mode
)))
10633 *cost
+= rtx_cost (op0
, mode
, PLUS
, 0, speed
);
10636 /* ADD (immediate). */
10637 *cost
+= extra_cost
->alu
.arith
;
10641 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
10643 /* Look for ADD (extended register). */
10644 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
10645 && aarch64_rtx_arith_op_extract_p (op0
, int_mode
))
10648 *cost
+= extra_cost
->alu
.extend_arith
;
10650 op0
= aarch64_strip_extend (op0
, true);
10651 *cost
+= rtx_cost (op0
, VOIDmode
,
10652 (enum rtx_code
) GET_CODE (op0
), 0, speed
);
10656 /* Strip any extend, leave shifts behind as we will
10657 cost them through mult_cost. */
10658 new_op0
= aarch64_strip_extend (op0
, false);
10660 if (GET_CODE (new_op0
) == MULT
10661 || aarch64_shift_p (GET_CODE (new_op0
)))
10663 *cost
+= aarch64_rtx_mult_cost (new_op0
, MULT
, PLUS
,
10668 *cost
+= rtx_cost (new_op0
, VOIDmode
, PLUS
, 0, speed
);
10672 if (VECTOR_MODE_P (mode
))
10675 *cost
+= extra_cost
->vect
.alu
;
10677 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
10680 *cost
+= extra_cost
->alu
.arith
;
10682 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
10685 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
10692 *cost
= COSTS_N_INSNS (1);
10696 if (VECTOR_MODE_P (mode
))
10697 *cost
+= extra_cost
->vect
.alu
;
10699 *cost
+= extra_cost
->alu
.rev
;
10704 if (aarch_rev16_p (x
))
10706 *cost
= COSTS_N_INSNS (1);
10710 if (VECTOR_MODE_P (mode
))
10711 *cost
+= extra_cost
->vect
.alu
;
10713 *cost
+= extra_cost
->alu
.rev
;
10718 if (aarch64_extr_rtx_p (x
, &op0
, &op1
))
10720 *cost
+= rtx_cost (op0
, mode
, IOR
, 0, speed
);
10721 *cost
+= rtx_cost (op1
, mode
, IOR
, 1, speed
);
10723 *cost
+= extra_cost
->alu
.shift
;
10727 /* Fall through. */
10734 if (VECTOR_MODE_P (mode
))
10737 *cost
+= extra_cost
->vect
.alu
;
10742 && GET_CODE (op0
) == MULT
10743 && CONST_INT_P (XEXP (op0
, 1))
10744 && CONST_INT_P (op1
)
10745 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0
, 1))),
10746 INTVAL (op1
)) != 0)
10748 /* This is a UBFM/SBFM. */
10749 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, ZERO_EXTRACT
, 0, speed
);
10751 *cost
+= extra_cost
->alu
.bfx
;
10755 if (is_int_mode (mode
, &int_mode
))
10757 if (CONST_INT_P (op1
))
10759 /* We have a mask + shift version of a UBFIZ
10760 i.e. the *andim_ashift<mode>_bfiz pattern. */
10761 if (GET_CODE (op0
) == ASHIFT
10762 && aarch64_mask_and_shift_for_ubfiz_p (int_mode
, op1
,
10765 *cost
+= rtx_cost (XEXP (op0
, 0), int_mode
,
10766 (enum rtx_code
) code
, 0, speed
);
10768 *cost
+= extra_cost
->alu
.bfx
;
10772 else if (aarch64_bitmask_imm (INTVAL (op1
), int_mode
))
10774 /* We possibly get the immediate for free, this is not
10776 *cost
+= rtx_cost (op0
, int_mode
,
10777 (enum rtx_code
) code
, 0, speed
);
10779 *cost
+= extra_cost
->alu
.logical
;
10788 /* Handle ORN, EON, or BIC. */
10789 if (GET_CODE (op0
) == NOT
)
10790 op0
= XEXP (op0
, 0);
10792 new_op0
= aarch64_strip_shift (op0
);
10794 /* If we had a shift on op0 then this is a logical-shift-
10795 by-register/immediate operation. Otherwise, this is just
10796 a logical operation. */
10799 if (new_op0
!= op0
)
10801 /* Shift by immediate. */
10802 if (CONST_INT_P (XEXP (op0
, 1)))
10803 *cost
+= extra_cost
->alu
.log_shift
;
10805 *cost
+= extra_cost
->alu
.log_shift_reg
;
10808 *cost
+= extra_cost
->alu
.logical
;
10811 /* In both cases we want to cost both operands. */
10812 *cost
+= rtx_cost (new_op0
, int_mode
, (enum rtx_code
) code
,
10814 *cost
+= rtx_cost (op1
, int_mode
, (enum rtx_code
) code
,
10824 op0
= aarch64_strip_shift (x
);
10826 if (VECTOR_MODE_P (mode
))
10829 *cost
+= extra_cost
->vect
.alu
;
10833 /* MVN-shifted-reg. */
10836 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
10839 *cost
+= extra_cost
->alu
.log_shift
;
10843 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10844 Handle the second form here taking care that 'a' in the above can
10846 else if (GET_CODE (op0
) == XOR
)
10848 rtx newop0
= XEXP (op0
, 0);
10849 rtx newop1
= XEXP (op0
, 1);
10850 rtx op0_stripped
= aarch64_strip_shift (newop0
);
10852 *cost
+= rtx_cost (newop1
, mode
, (enum rtx_code
) code
, 1, speed
);
10853 *cost
+= rtx_cost (op0_stripped
, mode
, XOR
, 0, speed
);
10857 if (op0_stripped
!= newop0
)
10858 *cost
+= extra_cost
->alu
.log_shift
;
10860 *cost
+= extra_cost
->alu
.logical
;
10867 *cost
+= extra_cost
->alu
.logical
;
10874 /* If a value is written in SI mode, then zero extended to DI
10875 mode, the operation will in general be free as a write to
10876 a 'w' register implicitly zeroes the upper bits of an 'x'
10877 register. However, if this is
10879 (set (reg) (zero_extend (reg)))
10881 we must cost the explicit register move. */
10883 && GET_MODE (op0
) == SImode
10886 int op_cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, 0, speed
);
10888 /* If OP_COST is non-zero, then the cost of the zero extend
10889 is effectively the cost of the inner operation. Otherwise
10890 we have a MOV instruction and we take the cost from the MOV
10891 itself. This is true independently of whether we are
10892 optimizing for space or time. */
10898 else if (MEM_P (op0
))
10900 /* All loads can zero extend to any size for free. */
10901 *cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, param
, speed
);
10905 op0
= aarch64_extend_bitfield_pattern_p (x
);
10908 *cost
+= rtx_cost (op0
, mode
, ZERO_EXTEND
, 0, speed
);
10910 *cost
+= extra_cost
->alu
.bfx
;
10916 if (VECTOR_MODE_P (mode
))
10919 *cost
+= extra_cost
->vect
.alu
;
10923 /* We generate an AND instead of UXTB/UXTH. */
10924 *cost
+= extra_cost
->alu
.logical
;
10930 if (MEM_P (XEXP (x
, 0)))
10935 rtx address
= XEXP (XEXP (x
, 0), 0);
10936 *cost
+= extra_cost
->ldst
.load_sign_extend
;
10939 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
10945 op0
= aarch64_extend_bitfield_pattern_p (x
);
10948 *cost
+= rtx_cost (op0
, mode
, SIGN_EXTEND
, 0, speed
);
10950 *cost
+= extra_cost
->alu
.bfx
;
10956 if (VECTOR_MODE_P (mode
))
10957 *cost
+= extra_cost
->vect
.alu
;
10959 *cost
+= extra_cost
->alu
.extend
;
10967 if (CONST_INT_P (op1
))
10971 if (VECTOR_MODE_P (mode
))
10973 /* Vector shift (immediate). */
10974 *cost
+= extra_cost
->vect
.alu
;
10978 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
10980 *cost
+= extra_cost
->alu
.shift
;
10984 /* We can incorporate zero/sign extend for free. */
10985 if (GET_CODE (op0
) == ZERO_EXTEND
10986 || GET_CODE (op0
) == SIGN_EXTEND
)
10987 op0
= XEXP (op0
, 0);
10989 *cost
+= rtx_cost (op0
, VOIDmode
, ASHIFT
, 0, speed
);
10994 if (VECTOR_MODE_P (mode
))
10997 /* Vector shift (register). */
10998 *cost
+= extra_cost
->vect
.alu
;
11004 *cost
+= extra_cost
->alu
.shift_reg
;
11006 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
11007 && CONST_INT_P (XEXP (op1
, 1))
11008 && known_eq (INTVAL (XEXP (op1
, 1)),
11009 GET_MODE_BITSIZE (mode
) - 1))
11011 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
11012 /* We already demanded XEXP (op1, 0) to be REG_P, so
11013 don't recurse into it. */
11017 return false; /* All arguments need to be in registers. */
11027 if (CONST_INT_P (op1
))
11029 /* ASR (immediate) and friends. */
11032 if (VECTOR_MODE_P (mode
))
11033 *cost
+= extra_cost
->vect
.alu
;
11035 *cost
+= extra_cost
->alu
.shift
;
11038 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
11043 if (VECTOR_MODE_P (mode
))
11046 /* Vector shift (register). */
11047 *cost
+= extra_cost
->vect
.alu
;
11052 /* ASR (register) and friends. */
11053 *cost
+= extra_cost
->alu
.shift_reg
;
11055 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
11056 && CONST_INT_P (XEXP (op1
, 1))
11057 && known_eq (INTVAL (XEXP (op1
, 1)),
11058 GET_MODE_BITSIZE (mode
) - 1))
11060 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
11061 /* We already demanded XEXP (op1, 0) to be REG_P, so
11062 don't recurse into it. */
11066 return false; /* All arguments need to be in registers. */
11071 if (aarch64_cmodel
== AARCH64_CMODEL_LARGE
11072 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
)
11076 *cost
+= extra_cost
->ldst
.load
;
11078 else if (aarch64_cmodel
== AARCH64_CMODEL_SMALL
11079 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_PIC
)
11081 /* ADRP, followed by ADD. */
11082 *cost
+= COSTS_N_INSNS (1);
11084 *cost
+= 2 * extra_cost
->alu
.arith
;
11086 else if (aarch64_cmodel
== AARCH64_CMODEL_TINY
11087 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
11091 *cost
+= extra_cost
->alu
.arith
;
11096 /* One extra load instruction, after accessing the GOT. */
11097 *cost
+= COSTS_N_INSNS (1);
11099 *cost
+= extra_cost
->ldst
.load
;
11105 /* ADRP/ADD (immediate). */
11107 *cost
+= extra_cost
->alu
.arith
;
11115 if (VECTOR_MODE_P (mode
))
11116 *cost
+= extra_cost
->vect
.alu
;
11118 *cost
+= extra_cost
->alu
.bfx
;
11121 /* We can trust that the immediates used will be correct (there
11122 are no by-register forms), so we need only cost op0. */
11123 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (enum rtx_code
) code
, 0, speed
);
11127 *cost
+= aarch64_rtx_mult_cost (x
, MULT
, 0, speed
);
11128 /* aarch64_rtx_mult_cost always handles recursion to its
11133 /* We can expand signed mod by power of 2 using a NEGS, two parallel
11134 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
11135 an unconditional negate. This case should only ever be reached through
11136 the set_smod_pow2_cheap check in expmed.c. */
11137 if (CONST_INT_P (XEXP (x
, 1))
11138 && exact_log2 (INTVAL (XEXP (x
, 1))) > 0
11139 && (mode
== SImode
|| mode
== DImode
))
11141 /* We expand to 4 instructions. Reset the baseline. */
11142 *cost
= COSTS_N_INSNS (4);
11145 *cost
+= 2 * extra_cost
->alu
.logical
11146 + 2 * extra_cost
->alu
.arith
;
11151 /* Fall-through. */
11155 /* Slighly prefer UMOD over SMOD. */
11156 if (VECTOR_MODE_P (mode
))
11157 *cost
+= extra_cost
->vect
.alu
;
11158 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
11159 *cost
+= (extra_cost
->mult
[mode
== DImode
].add
11160 + extra_cost
->mult
[mode
== DImode
].idiv
11161 + (code
== MOD
? 1 : 0));
11163 return false; /* All arguments need to be in registers. */
11170 if (VECTOR_MODE_P (mode
))
11171 *cost
+= extra_cost
->vect
.alu
;
11172 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
11173 /* There is no integer SQRT, so only DIV and UDIV can get
11175 *cost
+= (extra_cost
->mult
[mode
== DImode
].idiv
11176 /* Slighly prefer UDIV over SDIV. */
11177 + (code
== DIV
? 1 : 0));
11179 *cost
+= extra_cost
->fp
[mode
== DFmode
].div
;
11181 return false; /* All arguments need to be in registers. */
11184 return aarch64_if_then_else_costs (XEXP (x
, 0), XEXP (x
, 1),
11185 XEXP (x
, 2), cost
, speed
);
11198 return false; /* All arguments must be in registers. */
11207 if (VECTOR_MODE_P (mode
))
11208 *cost
+= extra_cost
->vect
.alu
;
11210 *cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
11213 /* FMSUB, FNMADD, and FNMSUB are free. */
11214 if (GET_CODE (op0
) == NEG
)
11215 op0
= XEXP (op0
, 0);
11217 if (GET_CODE (op2
) == NEG
)
11218 op2
= XEXP (op2
, 0);
11220 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
11221 and the by-element operand as operand 0. */
11222 if (GET_CODE (op1
) == NEG
)
11223 op1
= XEXP (op1
, 0);
11225 /* Catch vector-by-element operations. The by-element operand can
11226 either be (vec_duplicate (vec_select (x))) or just
11227 (vec_select (x)), depending on whether we are multiplying by
11228 a vector or a scalar.
11230 Canonicalization is not very good in these cases, FMA4 will put the
11231 by-element operand as operand 0, FNMA4 will have it as operand 1. */
11232 if (GET_CODE (op0
) == VEC_DUPLICATE
)
11233 op0
= XEXP (op0
, 0);
11234 else if (GET_CODE (op1
) == VEC_DUPLICATE
)
11235 op1
= XEXP (op1
, 0);
11237 if (GET_CODE (op0
) == VEC_SELECT
)
11238 op0
= XEXP (op0
, 0);
11239 else if (GET_CODE (op1
) == VEC_SELECT
)
11240 op1
= XEXP (op1
, 0);
11242 /* If the remaining parameters are not registers,
11243 get the cost to put them into registers. */
11244 *cost
+= rtx_cost (op0
, mode
, FMA
, 0, speed
);
11245 *cost
+= rtx_cost (op1
, mode
, FMA
, 1, speed
);
11246 *cost
+= rtx_cost (op2
, mode
, FMA
, 2, speed
);
11250 case UNSIGNED_FLOAT
:
11252 *cost
+= extra_cost
->fp
[mode
== DFmode
].fromint
;
11258 if (VECTOR_MODE_P (mode
))
11260 /*Vector truncate. */
11261 *cost
+= extra_cost
->vect
.alu
;
11264 *cost
+= extra_cost
->fp
[mode
== DFmode
].widen
;
11268 case FLOAT_TRUNCATE
:
11271 if (VECTOR_MODE_P (mode
))
11273 /*Vector conversion. */
11274 *cost
+= extra_cost
->vect
.alu
;
11277 *cost
+= extra_cost
->fp
[mode
== DFmode
].narrow
;
11284 /* Strip the rounding part. They will all be implemented
11285 by the fcvt* family of instructions anyway. */
11286 if (GET_CODE (x
) == UNSPEC
)
11288 unsigned int uns_code
= XINT (x
, 1);
11290 if (uns_code
== UNSPEC_FRINTA
11291 || uns_code
== UNSPEC_FRINTM
11292 || uns_code
== UNSPEC_FRINTN
11293 || uns_code
== UNSPEC_FRINTP
11294 || uns_code
== UNSPEC_FRINTZ
)
11295 x
= XVECEXP (x
, 0, 0);
11300 if (VECTOR_MODE_P (mode
))
11301 *cost
+= extra_cost
->vect
.alu
;
11303 *cost
+= extra_cost
->fp
[GET_MODE (x
) == DFmode
].toint
;
11306 /* We can combine fmul by a power of 2 followed by a fcvt into a single
11307 fixed-point fcvt. */
11308 if (GET_CODE (x
) == MULT
11309 && ((VECTOR_MODE_P (mode
)
11310 && aarch64_vec_fpconst_pow_of_2 (XEXP (x
, 1)) > 0)
11311 || aarch64_fpconst_pow_of_2 (XEXP (x
, 1)) > 0))
11313 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (rtx_code
) code
,
11318 *cost
+= rtx_cost (x
, VOIDmode
, (enum rtx_code
) code
, 0, speed
);
11322 if (VECTOR_MODE_P (mode
))
11324 /* ABS (vector). */
11326 *cost
+= extra_cost
->vect
.alu
;
11328 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
11332 /* FABD, which is analogous to FADD. */
11333 if (GET_CODE (op0
) == MINUS
)
11335 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, MINUS
, 0, speed
);
11336 *cost
+= rtx_cost (XEXP (op0
, 1), mode
, MINUS
, 1, speed
);
11338 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
11342 /* Simple FABS is analogous to FNEG. */
11344 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
11348 /* Integer ABS will either be split to
11349 two arithmetic instructions, or will be an ABS
11350 (scalar), which we don't model. */
11351 *cost
= COSTS_N_INSNS (2);
11353 *cost
+= 2 * extra_cost
->alu
.arith
;
11361 if (VECTOR_MODE_P (mode
))
11362 *cost
+= extra_cost
->vect
.alu
;
11365 /* FMAXNM/FMINNM/FMAX/FMIN.
11366 TODO: This may not be accurate for all implementations, but
11367 we do not model this in the cost tables. */
11368 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
11374 /* The floating point round to integer frint* instructions. */
11375 if (aarch64_frint_unspec_p (XINT (x
, 1)))
11378 *cost
+= extra_cost
->fp
[mode
== DFmode
].roundint
;
11383 if (XINT (x
, 1) == UNSPEC_RBIT
)
11386 *cost
+= extra_cost
->alu
.rev
;
11394 /* Decompose <su>muldi3_highpart. */
11395 if (/* (truncate:DI */
11398 && GET_MODE (XEXP (x
, 0)) == TImode
11399 && GET_CODE (XEXP (x
, 0)) == LSHIFTRT
11401 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == MULT
11402 /* (ANY_EXTEND:TI (reg:DI))
11403 (ANY_EXTEND:TI (reg:DI))) */
11404 && ((GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == ZERO_EXTEND
11405 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == ZERO_EXTEND
)
11406 || (GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == SIGN_EXTEND
11407 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == SIGN_EXTEND
))
11408 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0)) == DImode
11409 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0)) == DImode
11410 /* (const_int 64) */
11411 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
11412 && UINTVAL (XEXP (XEXP (x
, 0), 1)) == 64)
11416 *cost
+= extra_cost
->mult
[mode
== DImode
].extend
;
11417 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0),
11418 mode
, MULT
, 0, speed
);
11419 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0),
11420 mode
, MULT
, 1, speed
);
11424 /* Fall through. */
11430 && flag_aarch64_verbose_cost
)
11431 fprintf (dump_file
,
11432 "\nFailed to cost RTX. Assuming default cost.\n");
11437 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
11438 calculated for X. This cost is stored in *COST. Returns true
11439 if the total cost of X was calculated. */
11441 aarch64_rtx_costs_wrapper (rtx x
, machine_mode mode
, int outer
,
11442 int param
, int *cost
, bool speed
)
11444 bool result
= aarch64_rtx_costs (x
, mode
, outer
, param
, cost
, speed
);
11447 && flag_aarch64_verbose_cost
)
11449 print_rtl_single (dump_file
, x
);
11450 fprintf (dump_file
, "\n%s cost: %d (%s)\n",
11451 speed
? "Hot" : "Cold",
11452 *cost
, result
? "final" : "partial");
11459 aarch64_register_move_cost (machine_mode mode
,
11460 reg_class_t from_i
, reg_class_t to_i
)
11462 enum reg_class from
= (enum reg_class
) from_i
;
11463 enum reg_class to
= (enum reg_class
) to_i
;
11464 const struct cpu_regmove_cost
*regmove_cost
11465 = aarch64_tune_params
.regmove_cost
;
11467 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
11468 if (to
== TAILCALL_ADDR_REGS
|| to
== POINTER_REGS
)
11471 if (from
== TAILCALL_ADDR_REGS
|| from
== POINTER_REGS
)
11472 from
= GENERAL_REGS
;
11474 /* Moving between GPR and stack cost is the same as GP2GP. */
11475 if ((from
== GENERAL_REGS
&& to
== STACK_REG
)
11476 || (to
== GENERAL_REGS
&& from
== STACK_REG
))
11477 return regmove_cost
->GP2GP
;
11479 /* To/From the stack register, we move via the gprs. */
11480 if (to
== STACK_REG
|| from
== STACK_REG
)
11481 return aarch64_register_move_cost (mode
, from
, GENERAL_REGS
)
11482 + aarch64_register_move_cost (mode
, GENERAL_REGS
, to
);
11484 if (known_eq (GET_MODE_SIZE (mode
), 16))
11486 /* 128-bit operations on general registers require 2 instructions. */
11487 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
11488 return regmove_cost
->GP2GP
* 2;
11489 else if (from
== GENERAL_REGS
)
11490 return regmove_cost
->GP2FP
* 2;
11491 else if (to
== GENERAL_REGS
)
11492 return regmove_cost
->FP2GP
* 2;
11494 /* When AdvSIMD instructions are disabled it is not possible to move
11495 a 128-bit value directly between Q registers. This is handled in
11496 secondary reload. A general register is used as a scratch to move
11497 the upper DI value and the lower DI value is moved directly,
11498 hence the cost is the sum of three moves. */
11500 return regmove_cost
->GP2FP
+ regmove_cost
->FP2GP
+ regmove_cost
->FP2FP
;
11502 return regmove_cost
->FP2FP
;
11505 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
11506 return regmove_cost
->GP2GP
;
11507 else if (from
== GENERAL_REGS
)
11508 return regmove_cost
->GP2FP
;
11509 else if (to
== GENERAL_REGS
)
11510 return regmove_cost
->FP2GP
;
11512 return regmove_cost
->FP2FP
;
11516 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED
,
11517 reg_class_t rclass ATTRIBUTE_UNUSED
,
11518 bool in ATTRIBUTE_UNUSED
)
11520 return aarch64_tune_params
.memmov_cost
;
11523 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
11524 to optimize 1.0/sqrt. */
11527 use_rsqrt_p (machine_mode mode
)
11529 return (!flag_trapping_math
11530 && flag_unsafe_math_optimizations
11531 && ((aarch64_tune_params
.approx_modes
->recip_sqrt
11532 & AARCH64_APPROX_MODE (mode
))
11533 || flag_mrecip_low_precision_sqrt
));
11536 /* Function to decide when to use the approximate reciprocal square root
11540 aarch64_builtin_reciprocal (tree fndecl
)
11542 machine_mode mode
= TYPE_MODE (TREE_TYPE (fndecl
));
11544 if (!use_rsqrt_p (mode
))
11546 return aarch64_builtin_rsqrt (DECL_MD_FUNCTION_CODE (fndecl
));
11549 /* Emit instruction sequence to compute either the approximate square root
11550 or its approximate reciprocal, depending on the flag RECP, and return
11551 whether the sequence was emitted or not. */
11554 aarch64_emit_approx_sqrt (rtx dst
, rtx src
, bool recp
)
11556 machine_mode mode
= GET_MODE (dst
);
11558 if (GET_MODE_INNER (mode
) == HFmode
)
11560 gcc_assert (!recp
);
11566 if (!(flag_mlow_precision_sqrt
11567 || (aarch64_tune_params
.approx_modes
->sqrt
11568 & AARCH64_APPROX_MODE (mode
))))
11571 if (flag_finite_math_only
11572 || flag_trapping_math
11573 || !flag_unsafe_math_optimizations
11574 || optimize_function_for_size_p (cfun
))
11578 /* Caller assumes we cannot fail. */
11579 gcc_assert (use_rsqrt_p (mode
));
11581 machine_mode mmsk
= mode_for_int_vector (mode
).require ();
11582 rtx xmsk
= gen_reg_rtx (mmsk
);
11584 /* When calculating the approximate square root, compare the
11585 argument with 0.0 and create a mask. */
11586 emit_insn (gen_rtx_SET (xmsk
,
11588 gen_rtx_EQ (mmsk
, src
,
11589 CONST0_RTX (mode
)))));
11591 /* Estimate the approximate reciprocal square root. */
11592 rtx xdst
= gen_reg_rtx (mode
);
11593 emit_insn (gen_aarch64_rsqrte (mode
, xdst
, src
));
11595 /* Iterate over the series twice for SF and thrice for DF. */
11596 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
11598 /* Optionally iterate over the series once less for faster performance
11599 while sacrificing the accuracy. */
11600 if ((recp
&& flag_mrecip_low_precision_sqrt
)
11601 || (!recp
&& flag_mlow_precision_sqrt
))
11604 /* Iterate over the series to calculate the approximate reciprocal square
11606 rtx x1
= gen_reg_rtx (mode
);
11607 while (iterations
--)
11609 rtx x2
= gen_reg_rtx (mode
);
11610 emit_set_insn (x2
, gen_rtx_MULT (mode
, xdst
, xdst
));
11612 emit_insn (gen_aarch64_rsqrts (mode
, x1
, src
, x2
));
11614 if (iterations
> 0)
11615 emit_set_insn (xdst
, gen_rtx_MULT (mode
, xdst
, x1
));
11620 /* Qualify the approximate reciprocal square root when the argument is
11621 0.0 by squashing the intermediary result to 0.0. */
11622 rtx xtmp
= gen_reg_rtx (mmsk
);
11623 emit_set_insn (xtmp
, gen_rtx_AND (mmsk
, gen_rtx_NOT (mmsk
, xmsk
),
11624 gen_rtx_SUBREG (mmsk
, xdst
, 0)));
11625 emit_move_insn (xdst
, gen_rtx_SUBREG (mode
, xtmp
, 0));
11627 /* Calculate the approximate square root. */
11628 emit_set_insn (xdst
, gen_rtx_MULT (mode
, xdst
, src
));
11631 /* Finalize the approximation. */
11632 emit_set_insn (dst
, gen_rtx_MULT (mode
, xdst
, x1
));
11637 /* Emit the instruction sequence to compute the approximation for the division
11638 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
11641 aarch64_emit_approx_div (rtx quo
, rtx num
, rtx den
)
11643 machine_mode mode
= GET_MODE (quo
);
11645 if (GET_MODE_INNER (mode
) == HFmode
)
11648 bool use_approx_division_p
= (flag_mlow_precision_div
11649 || (aarch64_tune_params
.approx_modes
->division
11650 & AARCH64_APPROX_MODE (mode
)));
11652 if (!flag_finite_math_only
11653 || flag_trapping_math
11654 || !flag_unsafe_math_optimizations
11655 || optimize_function_for_size_p (cfun
)
11656 || !use_approx_division_p
)
11659 if (!TARGET_SIMD
&& VECTOR_MODE_P (mode
))
11662 /* Estimate the approximate reciprocal. */
11663 rtx xrcp
= gen_reg_rtx (mode
);
11664 emit_insn (gen_aarch64_frecpe (mode
, xrcp
, den
));
11666 /* Iterate over the series twice for SF and thrice for DF. */
11667 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
11669 /* Optionally iterate over the series once less for faster performance,
11670 while sacrificing the accuracy. */
11671 if (flag_mlow_precision_div
)
11674 /* Iterate over the series to calculate the approximate reciprocal. */
11675 rtx xtmp
= gen_reg_rtx (mode
);
11676 while (iterations
--)
11678 emit_insn (gen_aarch64_frecps (mode
, xtmp
, xrcp
, den
));
11680 if (iterations
> 0)
11681 emit_set_insn (xrcp
, gen_rtx_MULT (mode
, xrcp
, xtmp
));
11684 if (num
!= CONST1_RTX (mode
))
11686 /* As the approximate reciprocal of DEN is already calculated, only
11687 calculate the approximate division when NUM is not 1.0. */
11688 rtx xnum
= force_reg (mode
, num
);
11689 emit_set_insn (xrcp
, gen_rtx_MULT (mode
, xrcp
, xnum
));
11692 /* Finalize the approximation. */
11693 emit_set_insn (quo
, gen_rtx_MULT (mode
, xrcp
, xtmp
));
11697 /* Return the number of instructions that can be issued per cycle. */
11699 aarch64_sched_issue_rate (void)
11701 return aarch64_tune_params
.issue_rate
;
11705 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
11707 int issue_rate
= aarch64_sched_issue_rate ();
11709 return issue_rate
> 1 && !sched_fusion
? issue_rate
: 0;
11713 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
11714 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
11715 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
11718 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn
*insn
,
11721 return autopref_multipass_dfa_lookahead_guard (insn
, ready_index
);
11725 /* Vectorizer cost model target hooks. */
11727 /* Implement targetm.vectorize.builtin_vectorization_cost. */
11729 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost
,
11731 int misalign ATTRIBUTE_UNUSED
)
11734 const cpu_vector_cost
*costs
= aarch64_tune_params
.vec_costs
;
11737 if (vectype
!= NULL
)
11738 fp
= FLOAT_TYPE_P (vectype
);
11740 switch (type_of_cost
)
11743 return fp
? costs
->scalar_fp_stmt_cost
: costs
->scalar_int_stmt_cost
;
11746 return costs
->scalar_load_cost
;
11749 return costs
->scalar_store_cost
;
11752 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
11755 return costs
->vec_align_load_cost
;
11758 return costs
->vec_store_cost
;
11760 case vec_to_scalar
:
11761 return costs
->vec_to_scalar_cost
;
11763 case scalar_to_vec
:
11764 return costs
->scalar_to_vec_cost
;
11766 case unaligned_load
:
11767 case vector_gather_load
:
11768 return costs
->vec_unalign_load_cost
;
11770 case unaligned_store
:
11771 case vector_scatter_store
:
11772 return costs
->vec_unalign_store_cost
;
11774 case cond_branch_taken
:
11775 return costs
->cond_taken_branch_cost
;
11777 case cond_branch_not_taken
:
11778 return costs
->cond_not_taken_branch_cost
;
11781 return costs
->vec_permute_cost
;
11783 case vec_promote_demote
:
11784 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
11786 case vec_construct
:
11787 elements
= estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype
));
11788 return elements
/ 2 + 1;
11791 gcc_unreachable ();
11795 /* Implement targetm.vectorize.add_stmt_cost. */
11797 aarch64_add_stmt_cost (void *data
, int count
, enum vect_cost_for_stmt kind
,
11798 struct _stmt_vec_info
*stmt_info
, int misalign
,
11799 enum vect_cost_model_location where
)
11801 unsigned *cost
= (unsigned *) data
;
11802 unsigned retval
= 0;
11804 if (flag_vect_cost_model
)
11806 tree vectype
= stmt_info
? stmt_vectype (stmt_info
) : NULL_TREE
;
11808 aarch64_builtin_vectorization_cost (kind
, vectype
, misalign
);
11810 /* Statements in an inner loop relative to the loop being
11811 vectorized are weighted more heavily. The value here is
11812 arbitrary and could potentially be improved with analysis. */
11813 if (where
== vect_body
&& stmt_info
&& stmt_in_inner_loop_p (stmt_info
))
11814 count
*= 50; /* FIXME */
11816 retval
= (unsigned) (count
* stmt_cost
);
11817 cost
[where
] += retval
;
11823 static void initialize_aarch64_code_model (struct gcc_options
*);
11825 /* Parse the TO_PARSE string and put the architecture struct that it
11826 selects into RES and the architectural features into ISA_FLAGS.
11827 Return an aarch64_parse_opt_result describing the parse result.
11828 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11829 When the TO_PARSE string contains an invalid extension,
11830 a copy of the string is created and stored to INVALID_EXTENSION. */
11832 static enum aarch64_parse_opt_result
11833 aarch64_parse_arch (const char *to_parse
, const struct processor
**res
,
11834 uint64_t *isa_flags
, std::string
*invalid_extension
)
11837 const struct processor
*arch
;
11840 ext
= strchr (to_parse
, '+');
11843 len
= ext
- to_parse
;
11845 len
= strlen (to_parse
);
11848 return AARCH64_PARSE_MISSING_ARG
;
11851 /* Loop through the list of supported ARCHes to find a match. */
11852 for (arch
= all_architectures
; arch
->name
!= NULL
; arch
++)
11854 if (strlen (arch
->name
) == len
11855 && strncmp (arch
->name
, to_parse
, len
) == 0)
11857 uint64_t isa_temp
= arch
->flags
;
11861 /* TO_PARSE string contains at least one extension. */
11862 enum aarch64_parse_opt_result ext_res
11863 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
11865 if (ext_res
!= AARCH64_PARSE_OK
)
11868 /* Extension parsing was successful. Confirm the result
11869 arch and ISA flags. */
11871 *isa_flags
= isa_temp
;
11872 return AARCH64_PARSE_OK
;
11876 /* ARCH name not found in list. */
11877 return AARCH64_PARSE_INVALID_ARG
;
11880 /* Parse the TO_PARSE string and put the result tuning in RES and the
11881 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
11882 describing the parse result. If there is an error parsing, RES and
11883 ISA_FLAGS are left unchanged.
11884 When the TO_PARSE string contains an invalid extension,
11885 a copy of the string is created and stored to INVALID_EXTENSION. */
11887 static enum aarch64_parse_opt_result
11888 aarch64_parse_cpu (const char *to_parse
, const struct processor
**res
,
11889 uint64_t *isa_flags
, std::string
*invalid_extension
)
11892 const struct processor
*cpu
;
11895 ext
= strchr (to_parse
, '+');
11898 len
= ext
- to_parse
;
11900 len
= strlen (to_parse
);
11903 return AARCH64_PARSE_MISSING_ARG
;
11906 /* Loop through the list of supported CPUs to find a match. */
11907 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
11909 if (strlen (cpu
->name
) == len
&& strncmp (cpu
->name
, to_parse
, len
) == 0)
11911 uint64_t isa_temp
= cpu
->flags
;
11916 /* TO_PARSE string contains at least one extension. */
11917 enum aarch64_parse_opt_result ext_res
11918 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
11920 if (ext_res
!= AARCH64_PARSE_OK
)
11923 /* Extension parsing was successfull. Confirm the result
11924 cpu and ISA flags. */
11926 *isa_flags
= isa_temp
;
11927 return AARCH64_PARSE_OK
;
11931 /* CPU name not found in list. */
11932 return AARCH64_PARSE_INVALID_ARG
;
11935 /* Parse the TO_PARSE string and put the cpu it selects into RES.
11936 Return an aarch64_parse_opt_result describing the parse result.
11937 If the parsing fails the RES does not change. */
11939 static enum aarch64_parse_opt_result
11940 aarch64_parse_tune (const char *to_parse
, const struct processor
**res
)
11942 const struct processor
*cpu
;
11944 /* Loop through the list of supported CPUs to find a match. */
11945 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
11947 if (strcmp (cpu
->name
, to_parse
) == 0)
11950 return AARCH64_PARSE_OK
;
11954 /* CPU name not found in list. */
11955 return AARCH64_PARSE_INVALID_ARG
;
11958 /* Parse TOKEN, which has length LENGTH to see if it is an option
11959 described in FLAG. If it is, return the index bit for that fusion type.
11960 If not, error (printing OPTION_NAME) and return zero. */
11962 static unsigned int
11963 aarch64_parse_one_option_token (const char *token
,
11965 const struct aarch64_flag_desc
*flag
,
11966 const char *option_name
)
11968 for (; flag
->name
!= NULL
; flag
++)
11970 if (length
== strlen (flag
->name
)
11971 && !strncmp (flag
->name
, token
, length
))
11975 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name
, token
);
11979 /* Parse OPTION which is a comma-separated list of flags to enable.
11980 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
11981 default state we inherit from the CPU tuning structures. OPTION_NAME
11982 gives the top-level option we are parsing in the -moverride string,
11983 for use in error messages. */
11985 static unsigned int
11986 aarch64_parse_boolean_options (const char *option
,
11987 const struct aarch64_flag_desc
*flags
,
11988 unsigned int initial_state
,
11989 const char *option_name
)
11991 const char separator
= '.';
11992 const char* specs
= option
;
11993 const char* ntoken
= option
;
11994 unsigned int found_flags
= initial_state
;
11996 while ((ntoken
= strchr (specs
, separator
)))
11998 size_t token_length
= ntoken
- specs
;
11999 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
12003 /* If we find "none" (or, for simplicity's sake, an error) anywhere
12004 in the token stream, reset the supported operations. So:
12006 adrp+add.cmp+branch.none.adrp+add
12008 would have the result of turning on only adrp+add fusion. */
12012 found_flags
|= token_ops
;
12016 /* We ended with a comma, print something. */
12019 error ("%s string ill-formed\n", option_name
);
12023 /* We still have one more token to parse. */
12024 size_t token_length
= strlen (specs
);
12025 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
12032 found_flags
|= token_ops
;
12033 return found_flags
;
12036 /* Support for overriding instruction fusion. */
12039 aarch64_parse_fuse_string (const char *fuse_string
,
12040 struct tune_params
*tune
)
12042 tune
->fusible_ops
= aarch64_parse_boolean_options (fuse_string
,
12043 aarch64_fusible_pairs
,
12048 /* Support for overriding other tuning flags. */
12051 aarch64_parse_tune_string (const char *tune_string
,
12052 struct tune_params
*tune
)
12054 tune
->extra_tuning_flags
12055 = aarch64_parse_boolean_options (tune_string
,
12056 aarch64_tuning_flags
,
12057 tune
->extra_tuning_flags
,
12061 /* Parse the sve_width tuning moverride string in TUNE_STRING.
12062 Accept the valid SVE vector widths allowed by
12063 aarch64_sve_vector_bits_enum and use it to override sve_width
12067 aarch64_parse_sve_width_string (const char *tune_string
,
12068 struct tune_params
*tune
)
12072 int n
= sscanf (tune_string
, "%d", &width
);
12075 error ("invalid format for sve_width");
12087 error ("invalid sve_width value: %d", width
);
12089 tune
->sve_width
= (enum aarch64_sve_vector_bits_enum
) width
;
12092 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
12093 we understand. If it is, extract the option string and handoff to
12094 the appropriate function. */
12097 aarch64_parse_one_override_token (const char* token
,
12099 struct tune_params
*tune
)
12101 const struct aarch64_tuning_override_function
*fn
12102 = aarch64_tuning_override_functions
;
12104 const char *option_part
= strchr (token
, '=');
12107 error ("tuning string missing in option (%s)", token
);
12111 /* Get the length of the option name. */
12112 length
= option_part
- token
;
12113 /* Skip the '=' to get to the option string. */
12116 for (; fn
->name
!= NULL
; fn
++)
12118 if (!strncmp (fn
->name
, token
, length
))
12120 fn
->parse_override (option_part
, tune
);
12125 error ("unknown tuning option (%s)",token
);
12129 /* A checking mechanism for the implementation of the tls size. */
12132 initialize_aarch64_tls_size (struct gcc_options
*opts
)
12134 if (aarch64_tls_size
== 0)
12135 aarch64_tls_size
= 24;
12137 switch (opts
->x_aarch64_cmodel_var
)
12139 case AARCH64_CMODEL_TINY
:
12140 /* Both the default and maximum TLS size allowed under tiny is 1M which
12141 needs two instructions to address, so we clamp the size to 24. */
12142 if (aarch64_tls_size
> 24)
12143 aarch64_tls_size
= 24;
12145 case AARCH64_CMODEL_SMALL
:
12146 /* The maximum TLS size allowed under small is 4G. */
12147 if (aarch64_tls_size
> 32)
12148 aarch64_tls_size
= 32;
12150 case AARCH64_CMODEL_LARGE
:
12151 /* The maximum TLS size allowed under large is 16E.
12152 FIXME: 16E should be 64bit, we only support 48bit offset now. */
12153 if (aarch64_tls_size
> 48)
12154 aarch64_tls_size
= 48;
12157 gcc_unreachable ();
12163 /* Parse STRING looking for options in the format:
12164 string :: option:string
12165 option :: name=substring
12167 substring :: defined by option. */
12170 aarch64_parse_override_string (const char* input_string
,
12171 struct tune_params
* tune
)
12173 const char separator
= ':';
12174 size_t string_length
= strlen (input_string
) + 1;
12175 char *string_root
= (char *) xmalloc (sizeof (*string_root
) * string_length
);
12176 char *string
= string_root
;
12177 strncpy (string
, input_string
, string_length
);
12178 string
[string_length
- 1] = '\0';
12180 char* ntoken
= string
;
12182 while ((ntoken
= strchr (string
, separator
)))
12184 size_t token_length
= ntoken
- string
;
12185 /* Make this substring look like a string. */
12187 aarch64_parse_one_override_token (string
, token_length
, tune
);
12191 /* One last option to parse. */
12192 aarch64_parse_one_override_token (string
, strlen (string
), tune
);
12193 free (string_root
);
12198 aarch64_override_options_after_change_1 (struct gcc_options
*opts
)
12200 if (accepted_branch_protection_string
)
12202 opts
->x_aarch64_branch_protection_string
12203 = xstrdup (accepted_branch_protection_string
);
12206 /* PR 70044: We have to be careful about being called multiple times for the
12207 same function. This means all changes should be repeatable. */
12209 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
12210 Disable the frame pointer flag so the mid-end will not use a frame
12211 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
12212 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
12213 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
12214 aarch64_use_frame_pointer
= opts
->x_flag_omit_frame_pointer
!= 1;
12215 if (opts
->x_flag_omit_frame_pointer
== 0)
12216 opts
->x_flag_omit_frame_pointer
= 2;
12218 /* If not optimizing for size, set the default
12219 alignment to what the target wants. */
12220 if (!opts
->x_optimize_size
)
12222 if (opts
->x_flag_align_loops
&& !opts
->x_str_align_loops
)
12223 opts
->x_str_align_loops
= aarch64_tune_params
.loop_align
;
12224 if (opts
->x_flag_align_jumps
&& !opts
->x_str_align_jumps
)
12225 opts
->x_str_align_jumps
= aarch64_tune_params
.jump_align
;
12226 if (opts
->x_flag_align_functions
&& !opts
->x_str_align_functions
)
12227 opts
->x_str_align_functions
= aarch64_tune_params
.function_align
;
12230 /* We default to no pc-relative literal loads. */
12232 aarch64_pcrelative_literal_loads
= false;
12234 /* If -mpc-relative-literal-loads is set on the command line, this
12235 implies that the user asked for PC relative literal loads. */
12236 if (opts
->x_pcrelative_literal_loads
== 1)
12237 aarch64_pcrelative_literal_loads
= true;
12239 /* In the tiny memory model it makes no sense to disallow PC relative
12240 literal pool loads. */
12241 if (aarch64_cmodel
== AARCH64_CMODEL_TINY
12242 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
12243 aarch64_pcrelative_literal_loads
= true;
12245 /* When enabling the lower precision Newton series for the square root, also
12246 enable it for the reciprocal square root, since the latter is an
12247 intermediary step for the former. */
12248 if (flag_mlow_precision_sqrt
)
12249 flag_mrecip_low_precision_sqrt
= true;
12252 /* 'Unpack' up the internal tuning structs and update the options
12253 in OPTS. The caller must have set up selected_tune and selected_arch
12254 as all the other target-specific codegen decisions are
12255 derived from them. */
12258 aarch64_override_options_internal (struct gcc_options
*opts
)
12260 aarch64_tune_flags
= selected_tune
->flags
;
12261 aarch64_tune
= selected_tune
->sched_core
;
12262 /* Make a copy of the tuning parameters attached to the core, which
12263 we may later overwrite. */
12264 aarch64_tune_params
= *(selected_tune
->tune
);
12265 aarch64_architecture_version
= selected_arch
->architecture_version
;
12267 if (opts
->x_aarch64_override_tune_string
)
12268 aarch64_parse_override_string (opts
->x_aarch64_override_tune_string
,
12269 &aarch64_tune_params
);
12271 /* This target defaults to strict volatile bitfields. */
12272 if (opts
->x_flag_strict_volatile_bitfields
< 0 && abi_version_at_least (2))
12273 opts
->x_flag_strict_volatile_bitfields
= 1;
12275 if (aarch64_stack_protector_guard
== SSP_GLOBAL
12276 && opts
->x_aarch64_stack_protector_guard_offset_str
)
12278 error ("incompatible options %<-mstack-protector-guard=global%> and "
12279 "%<-mstack-protector-guard-offset=%s%>",
12280 aarch64_stack_protector_guard_offset_str
);
12283 if (aarch64_stack_protector_guard
== SSP_SYSREG
12284 && !(opts
->x_aarch64_stack_protector_guard_offset_str
12285 && opts
->x_aarch64_stack_protector_guard_reg_str
))
12287 error ("both %<-mstack-protector-guard-offset%> and "
12288 "%<-mstack-protector-guard-reg%> must be used "
12289 "with %<-mstack-protector-guard=sysreg%>");
12292 if (opts
->x_aarch64_stack_protector_guard_reg_str
)
12294 if (strlen (opts
->x_aarch64_stack_protector_guard_reg_str
) > 100)
12295 error ("specify a system register with a small string length.");
12298 if (opts
->x_aarch64_stack_protector_guard_offset_str
)
12301 const char *str
= aarch64_stack_protector_guard_offset_str
;
12303 long offs
= strtol (aarch64_stack_protector_guard_offset_str
, &end
, 0);
12304 if (!*str
|| *end
|| errno
)
12305 error ("%qs is not a valid offset in %qs", str
,
12306 "-mstack-protector-guard-offset=");
12307 aarch64_stack_protector_guard_offset
= offs
;
12310 initialize_aarch64_code_model (opts
);
12311 initialize_aarch64_tls_size (opts
);
12313 int queue_depth
= 0;
12314 switch (aarch64_tune_params
.autoprefetcher_model
)
12316 case tune_params::AUTOPREFETCHER_OFF
:
12319 case tune_params::AUTOPREFETCHER_WEAK
:
12322 case tune_params::AUTOPREFETCHER_STRONG
:
12323 queue_depth
= max_insn_queue_index
+ 1;
12326 gcc_unreachable ();
12329 /* We don't mind passing in global_options_set here as we don't use
12330 the *options_set structs anyway. */
12331 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH
,
12333 opts
->x_param_values
,
12334 global_options_set
.x_param_values
);
12336 /* Set up parameters to be used in prefetching algorithm. Do not
12337 override the defaults unless we are tuning for a core we have
12338 researched values for. */
12339 if (aarch64_tune_params
.prefetch
->num_slots
> 0)
12340 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES
,
12341 aarch64_tune_params
.prefetch
->num_slots
,
12342 opts
->x_param_values
,
12343 global_options_set
.x_param_values
);
12344 if (aarch64_tune_params
.prefetch
->l1_cache_size
>= 0)
12345 maybe_set_param_value (PARAM_L1_CACHE_SIZE
,
12346 aarch64_tune_params
.prefetch
->l1_cache_size
,
12347 opts
->x_param_values
,
12348 global_options_set
.x_param_values
);
12349 if (aarch64_tune_params
.prefetch
->l1_cache_line_size
>= 0)
12350 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE
,
12351 aarch64_tune_params
.prefetch
->l1_cache_line_size
,
12352 opts
->x_param_values
,
12353 global_options_set
.x_param_values
);
12354 if (aarch64_tune_params
.prefetch
->l2_cache_size
>= 0)
12355 maybe_set_param_value (PARAM_L2_CACHE_SIZE
,
12356 aarch64_tune_params
.prefetch
->l2_cache_size
,
12357 opts
->x_param_values
,
12358 global_options_set
.x_param_values
);
12359 if (!aarch64_tune_params
.prefetch
->prefetch_dynamic_strides
)
12360 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES
,
12362 opts
->x_param_values
,
12363 global_options_set
.x_param_values
);
12364 if (aarch64_tune_params
.prefetch
->minimum_stride
>= 0)
12365 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE
,
12366 aarch64_tune_params
.prefetch
->minimum_stride
,
12367 opts
->x_param_values
,
12368 global_options_set
.x_param_values
);
12370 /* Use the alternative scheduling-pressure algorithm by default. */
12371 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM
, SCHED_PRESSURE_MODEL
,
12372 opts
->x_param_values
,
12373 global_options_set
.x_param_values
);
12375 /* If the user hasn't changed it via configure then set the default to 64 KB
12376 for the backend. */
12377 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
,
12378 DEFAULT_STK_CLASH_GUARD_SIZE
== 0
12379 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE
,
12380 opts
->x_param_values
,
12381 global_options_set
.x_param_values
);
12383 /* Validate the guard size. */
12384 int guard_size
= PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE
);
12386 /* Enforce that interval is the same size as size so the mid-end does the
12388 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL
,
12390 opts
->x_param_values
,
12391 global_options_set
.x_param_values
);
12393 /* The maybe_set calls won't update the value if the user has explicitly set
12394 one. Which means we need to validate that probing interval and guard size
12397 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL
);
12398 if (guard_size
!= probe_interval
)
12399 error ("stack clash guard size %<%d%> must be equal to probing interval "
12400 "%<%d%>", guard_size
, probe_interval
);
12402 /* Enable sw prefetching at specified optimization level for
12403 CPUS that have prefetch. Lower optimization level threshold by 1
12404 when profiling is enabled. */
12405 if (opts
->x_flag_prefetch_loop_arrays
< 0
12406 && !opts
->x_optimize_size
12407 && aarch64_tune_params
.prefetch
->default_opt_level
>= 0
12408 && opts
->x_optimize
>= aarch64_tune_params
.prefetch
->default_opt_level
)
12409 opts
->x_flag_prefetch_loop_arrays
= 1;
12411 if (opts
->x_aarch64_arch_string
== NULL
)
12412 opts
->x_aarch64_arch_string
= selected_arch
->name
;
12413 if (opts
->x_aarch64_cpu_string
== NULL
)
12414 opts
->x_aarch64_cpu_string
= selected_cpu
->name
;
12415 if (opts
->x_aarch64_tune_string
== NULL
)
12416 opts
->x_aarch64_tune_string
= selected_tune
->name
;
12418 aarch64_override_options_after_change_1 (opts
);
12421 /* Print a hint with a suggestion for a core or architecture name that
12422 most closely resembles what the user passed in STR. ARCH is true if
12423 the user is asking for an architecture name. ARCH is false if the user
12424 is asking for a core name. */
12427 aarch64_print_hint_for_core_or_arch (const char *str
, bool arch
)
12429 auto_vec
<const char *> candidates
;
12430 const struct processor
*entry
= arch
? all_architectures
: all_cores
;
12431 for (; entry
->name
!= NULL
; entry
++)
12432 candidates
.safe_push (entry
->name
);
12434 #ifdef HAVE_LOCAL_CPU_DETECT
12435 /* Add also "native" as possible value. */
12437 candidates
.safe_push ("native");
12441 const char *hint
= candidates_list_and_hint (str
, s
, candidates
);
12443 inform (input_location
, "valid arguments are: %s;"
12444 " did you mean %qs?", s
, hint
);
12446 inform (input_location
, "valid arguments are: %s", s
);
12451 /* Print a hint with a suggestion for a core name that most closely resembles
12452 what the user passed in STR. */
12455 aarch64_print_hint_for_core (const char *str
)
12457 aarch64_print_hint_for_core_or_arch (str
, false);
12460 /* Print a hint with a suggestion for an architecture name that most closely
12461 resembles what the user passed in STR. */
12464 aarch64_print_hint_for_arch (const char *str
)
12466 aarch64_print_hint_for_core_or_arch (str
, true);
12470 /* Print a hint with a suggestion for an extension name
12471 that most closely resembles what the user passed in STR. */
12474 aarch64_print_hint_for_extensions (const std::string
&str
)
12476 auto_vec
<const char *> candidates
;
12477 aarch64_get_all_extension_candidates (&candidates
);
12479 const char *hint
= candidates_list_and_hint (str
.c_str (), s
, candidates
);
12481 inform (input_location
, "valid arguments are: %s;"
12482 " did you mean %qs?", s
, hint
);
12484 inform (input_location
, "valid arguments are: %s;", s
);
12489 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
12490 specified in STR and throw errors if appropriate. Put the results if
12491 they are valid in RES and ISA_FLAGS. Return whether the option is
12495 aarch64_validate_mcpu (const char *str
, const struct processor
**res
,
12496 uint64_t *isa_flags
)
12498 std::string invalid_extension
;
12499 enum aarch64_parse_opt_result parse_res
12500 = aarch64_parse_cpu (str
, res
, isa_flags
, &invalid_extension
);
12502 if (parse_res
== AARCH64_PARSE_OK
)
12507 case AARCH64_PARSE_MISSING_ARG
:
12508 error ("missing cpu name in %<-mcpu=%s%>", str
);
12510 case AARCH64_PARSE_INVALID_ARG
:
12511 error ("unknown value %qs for %<-mcpu%>", str
);
12512 aarch64_print_hint_for_core (str
);
12514 case AARCH64_PARSE_INVALID_FEATURE
:
12515 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
12516 invalid_extension
.c_str (), str
);
12517 aarch64_print_hint_for_extensions (invalid_extension
);
12520 gcc_unreachable ();
12526 /* Parses CONST_STR for branch protection features specified in
12527 aarch64_branch_protect_types, and set any global variables required. Returns
12528 the parsing result and assigns LAST_STR to the last processed token from
12529 CONST_STR so that it can be used for error reporting. */
12532 aarch64_parse_opt_result
aarch64_parse_branch_protection (const char *const_str
,
12535 char *str_root
= xstrdup (const_str
);
12536 char* token_save
= NULL
;
12537 char *str
= strtok_r (str_root
, "+", &token_save
);
12538 enum aarch64_parse_opt_result res
= AARCH64_PARSE_OK
;
12540 res
= AARCH64_PARSE_MISSING_ARG
;
12543 char *next_str
= strtok_r (NULL
, "+", &token_save
);
12544 /* Reset the branch protection features to their defaults. */
12545 aarch64_handle_no_branch_protection (NULL
, NULL
);
12547 while (str
&& res
== AARCH64_PARSE_OK
)
12549 const aarch64_branch_protect_type
* type
= aarch64_branch_protect_types
;
12550 bool found
= false;
12551 /* Search for this type. */
12552 while (type
&& type
->name
&& !found
&& res
== AARCH64_PARSE_OK
)
12554 if (strcmp (str
, type
->name
) == 0)
12557 res
= type
->handler (str
, next_str
);
12559 next_str
= strtok_r (NULL
, "+", &token_save
);
12564 if (found
&& res
== AARCH64_PARSE_OK
)
12566 bool found_subtype
= true;
12567 /* Loop through each token until we find one that isn't a
12569 while (found_subtype
)
12571 found_subtype
= false;
12572 const aarch64_branch_protect_type
*subtype
= type
->subtypes
;
12573 /* Search for the subtype. */
12574 while (str
&& subtype
&& subtype
->name
&& !found_subtype
12575 && res
== AARCH64_PARSE_OK
)
12577 if (strcmp (str
, subtype
->name
) == 0)
12579 found_subtype
= true;
12580 res
= subtype
->handler (str
, next_str
);
12582 next_str
= strtok_r (NULL
, "+", &token_save
);
12590 res
= AARCH64_PARSE_INVALID_ARG
;
12593 /* Copy the last processed token into the argument to pass it back.
12594 Used by option and attribute validation to print the offending token. */
12597 if (str
) strcpy (*last_str
, str
);
12598 else *last_str
= NULL
;
12600 if (res
== AARCH64_PARSE_OK
)
12602 /* If needed, alloc the accepted string then copy in const_str.
12603 Used by override_option_after_change_1. */
12604 if (!accepted_branch_protection_string
)
12605 accepted_branch_protection_string
= (char *) xmalloc (
12606 BRANCH_PROTECT_STR_MAX
12608 strncpy (accepted_branch_protection_string
, const_str
,
12609 BRANCH_PROTECT_STR_MAX
+ 1);
12610 /* Forcibly null-terminate. */
12611 accepted_branch_protection_string
[BRANCH_PROTECT_STR_MAX
] = '\0';
12617 aarch64_validate_mbranch_protection (const char *const_str
)
12619 char *str
= (char *) xmalloc (strlen (const_str
));
12620 enum aarch64_parse_opt_result res
=
12621 aarch64_parse_branch_protection (const_str
, &str
);
12622 if (res
== AARCH64_PARSE_INVALID_ARG
)
12623 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str
);
12624 else if (res
== AARCH64_PARSE_MISSING_ARG
)
12625 error ("missing argument for %<-mbranch-protection=%>");
12627 return res
== AARCH64_PARSE_OK
;
12630 /* Validate a command-line -march option. Parse the arch and extensions
12631 (if any) specified in STR and throw errors if appropriate. Put the
12632 results, if they are valid, in RES and ISA_FLAGS. Return whether the
12633 option is valid. */
12636 aarch64_validate_march (const char *str
, const struct processor
**res
,
12637 uint64_t *isa_flags
)
12639 std::string invalid_extension
;
12640 enum aarch64_parse_opt_result parse_res
12641 = aarch64_parse_arch (str
, res
, isa_flags
, &invalid_extension
);
12643 if (parse_res
== AARCH64_PARSE_OK
)
12648 case AARCH64_PARSE_MISSING_ARG
:
12649 error ("missing arch name in %<-march=%s%>", str
);
12651 case AARCH64_PARSE_INVALID_ARG
:
12652 error ("unknown value %qs for %<-march%>", str
);
12653 aarch64_print_hint_for_arch (str
);
12655 case AARCH64_PARSE_INVALID_FEATURE
:
12656 error ("invalid feature modifier %qs in %<-march=%s%>",
12657 invalid_extension
.c_str (), str
);
12658 aarch64_print_hint_for_extensions (invalid_extension
);
12661 gcc_unreachable ();
12667 /* Validate a command-line -mtune option. Parse the cpu
12668 specified in STR and throw errors if appropriate. Put the
12669 result, if it is valid, in RES. Return whether the option is
12673 aarch64_validate_mtune (const char *str
, const struct processor
**res
)
12675 enum aarch64_parse_opt_result parse_res
12676 = aarch64_parse_tune (str
, res
);
12678 if (parse_res
== AARCH64_PARSE_OK
)
12683 case AARCH64_PARSE_MISSING_ARG
:
12684 error ("missing cpu name in %<-mtune=%s%>", str
);
12686 case AARCH64_PARSE_INVALID_ARG
:
12687 error ("unknown value %qs for %<-mtune%>", str
);
12688 aarch64_print_hint_for_core (str
);
12691 gcc_unreachable ();
12696 /* Return the CPU corresponding to the enum CPU.
12697 If it doesn't specify a cpu, return the default. */
12699 static const struct processor
*
12700 aarch64_get_tune_cpu (enum aarch64_processor cpu
)
12702 if (cpu
!= aarch64_none
)
12703 return &all_cores
[cpu
];
12705 /* The & 0x3f is to extract the bottom 6 bits that encode the
12706 default cpu as selected by the --with-cpu GCC configure option
12708 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
12709 flags mechanism should be reworked to make it more sane. */
12710 return &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
12713 /* Return the architecture corresponding to the enum ARCH.
12714 If it doesn't specify a valid architecture, return the default. */
12716 static const struct processor
*
12717 aarch64_get_arch (enum aarch64_arch arch
)
12719 if (arch
!= aarch64_no_arch
)
12720 return &all_architectures
[arch
];
12722 const struct processor
*cpu
= &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
12724 return &all_architectures
[cpu
->arch
];
12727 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
12730 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value
)
12732 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
12733 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
12734 deciding which .md file patterns to use and when deciding whether
12735 something is a legitimate address or constant. */
12736 if (value
== SVE_SCALABLE
|| value
== SVE_128
)
12737 return poly_uint16 (2, 2);
12739 return (int) value
/ 64;
12742 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
12743 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
12744 tuning structs. In particular it must set selected_tune and
12745 aarch64_isa_flags that define the available ISA features and tuning
12746 decisions. It must also set selected_arch as this will be used to
12747 output the .arch asm tags for each function. */
12750 aarch64_override_options (void)
12752 uint64_t cpu_isa
= 0;
12753 uint64_t arch_isa
= 0;
12754 aarch64_isa_flags
= 0;
12756 bool valid_cpu
= true;
12757 bool valid_tune
= true;
12758 bool valid_arch
= true;
12760 selected_cpu
= NULL
;
12761 selected_arch
= NULL
;
12762 selected_tune
= NULL
;
12764 if (aarch64_branch_protection_string
)
12765 aarch64_validate_mbranch_protection (aarch64_branch_protection_string
);
12767 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
12768 If either of -march or -mtune is given, they override their
12769 respective component of -mcpu. */
12770 if (aarch64_cpu_string
)
12771 valid_cpu
= aarch64_validate_mcpu (aarch64_cpu_string
, &selected_cpu
,
12774 if (aarch64_arch_string
)
12775 valid_arch
= aarch64_validate_march (aarch64_arch_string
, &selected_arch
,
12778 if (aarch64_tune_string
)
12779 valid_tune
= aarch64_validate_mtune (aarch64_tune_string
, &selected_tune
);
12781 #ifdef SUBTARGET_OVERRIDE_OPTIONS
12782 SUBTARGET_OVERRIDE_OPTIONS
;
12785 /* If the user did not specify a processor, choose the default
12786 one for them. This will be the CPU set during configuration using
12787 --with-cpu, otherwise it is "generic". */
12792 selected_cpu
= &all_cores
[selected_arch
->ident
];
12793 aarch64_isa_flags
= arch_isa
;
12794 explicit_arch
= selected_arch
->arch
;
12798 /* Get default configure-time CPU. */
12799 selected_cpu
= aarch64_get_tune_cpu (aarch64_none
);
12800 aarch64_isa_flags
= TARGET_CPU_DEFAULT
>> 6;
12804 explicit_tune_core
= selected_tune
->ident
;
12806 /* If both -mcpu and -march are specified check that they are architecturally
12807 compatible, warn if they're not and prefer the -march ISA flags. */
12808 else if (selected_arch
)
12810 if (selected_arch
->arch
!= selected_cpu
->arch
)
12812 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
12813 all_architectures
[selected_cpu
->arch
].name
,
12814 selected_arch
->name
);
12816 aarch64_isa_flags
= arch_isa
;
12817 explicit_arch
= selected_arch
->arch
;
12818 explicit_tune_core
= selected_tune
? selected_tune
->ident
12819 : selected_cpu
->ident
;
12823 /* -mcpu but no -march. */
12824 aarch64_isa_flags
= cpu_isa
;
12825 explicit_tune_core
= selected_tune
? selected_tune
->ident
12826 : selected_cpu
->ident
;
12827 gcc_assert (selected_cpu
);
12828 selected_arch
= &all_architectures
[selected_cpu
->arch
];
12829 explicit_arch
= selected_arch
->arch
;
12832 /* Set the arch as well as we will need it when outputing
12833 the .arch directive in assembly. */
12834 if (!selected_arch
)
12836 gcc_assert (selected_cpu
);
12837 selected_arch
= &all_architectures
[selected_cpu
->arch
];
12840 if (!selected_tune
)
12841 selected_tune
= selected_cpu
;
12843 if (aarch64_enable_bti
== 2)
12845 #ifdef TARGET_ENABLE_BTI
12846 aarch64_enable_bti
= 1;
12848 aarch64_enable_bti
= 0;
12852 /* Return address signing is currently not supported for ILP32 targets. For
12853 LP64 targets use the configured option in the absence of a command-line
12854 option for -mbranch-protection. */
12855 if (!TARGET_ILP32
&& accepted_branch_protection_string
== NULL
)
12857 #ifdef TARGET_ENABLE_PAC_RET
12858 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NON_LEAF
;
12860 aarch64_ra_sign_scope
= AARCH64_FUNCTION_NONE
;
12864 #ifndef HAVE_AS_MABI_OPTION
12865 /* The compiler may have been configured with 2.23.* binutils, which does
12866 not have support for ILP32. */
12868 error ("assembler does not support %<-mabi=ilp32%>");
12871 /* Convert -msve-vector-bits to a VG count. */
12872 aarch64_sve_vg
= aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits
);
12874 if (aarch64_ra_sign_scope
!= AARCH64_FUNCTION_NONE
&& TARGET_ILP32
)
12875 sorry ("return address signing is only supported for %<-mabi=lp64%>");
12877 /* Make sure we properly set up the explicit options. */
12878 if ((aarch64_cpu_string
&& valid_cpu
)
12879 || (aarch64_tune_string
&& valid_tune
))
12880 gcc_assert (explicit_tune_core
!= aarch64_none
);
12882 if ((aarch64_cpu_string
&& valid_cpu
)
12883 || (aarch64_arch_string
&& valid_arch
))
12884 gcc_assert (explicit_arch
!= aarch64_no_arch
);
12886 /* The pass to insert speculation tracking runs before
12887 shrink-wrapping and the latter does not know how to update the
12888 tracking status. So disable it in this case. */
12889 if (aarch64_track_speculation
)
12890 flag_shrink_wrap
= 0;
12892 aarch64_override_options_internal (&global_options
);
12894 /* Save these options as the default ones in case we push and pop them later
12895 while processing functions with potential target attributes. */
12896 target_option_default_node
= target_option_current_node
12897 = build_target_option_node (&global_options
);
12900 /* Implement targetm.override_options_after_change. */
12903 aarch64_override_options_after_change (void)
12905 aarch64_override_options_after_change_1 (&global_options
);
12908 static struct machine_function
*
12909 aarch64_init_machine_status (void)
12911 struct machine_function
*machine
;
12912 machine
= ggc_cleared_alloc
<machine_function
> ();
12917 aarch64_init_expanders (void)
12919 init_machine_status
= aarch64_init_machine_status
;
12922 /* A checking mechanism for the implementation of the various code models. */
12924 initialize_aarch64_code_model (struct gcc_options
*opts
)
12926 if (opts
->x_flag_pic
)
12928 switch (opts
->x_aarch64_cmodel_var
)
12930 case AARCH64_CMODEL_TINY
:
12931 aarch64_cmodel
= AARCH64_CMODEL_TINY_PIC
;
12933 case AARCH64_CMODEL_SMALL
:
12934 #ifdef HAVE_AS_SMALL_PIC_RELOCS
12935 aarch64_cmodel
= (flag_pic
== 2
12936 ? AARCH64_CMODEL_SMALL_PIC
12937 : AARCH64_CMODEL_SMALL_SPIC
);
12939 aarch64_cmodel
= AARCH64_CMODEL_SMALL_PIC
;
12942 case AARCH64_CMODEL_LARGE
:
12943 sorry ("code model %qs with %<-f%s%>", "large",
12944 opts
->x_flag_pic
> 1 ? "PIC" : "pic");
12947 gcc_unreachable ();
12951 aarch64_cmodel
= opts
->x_aarch64_cmodel_var
;
12954 /* Implement TARGET_OPTION_SAVE. */
12957 aarch64_option_save (struct cl_target_option
*ptr
, struct gcc_options
*opts
)
12959 ptr
->x_aarch64_override_tune_string
= opts
->x_aarch64_override_tune_string
;
12960 ptr
->x_aarch64_branch_protection_string
12961 = opts
->x_aarch64_branch_protection_string
;
12964 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
12965 using the information saved in PTR. */
12968 aarch64_option_restore (struct gcc_options
*opts
, struct cl_target_option
*ptr
)
12970 opts
->x_explicit_tune_core
= ptr
->x_explicit_tune_core
;
12971 selected_tune
= aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
12972 opts
->x_explicit_arch
= ptr
->x_explicit_arch
;
12973 selected_arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
12974 opts
->x_aarch64_override_tune_string
= ptr
->x_aarch64_override_tune_string
;
12975 opts
->x_aarch64_branch_protection_string
12976 = ptr
->x_aarch64_branch_protection_string
;
12977 if (opts
->x_aarch64_branch_protection_string
)
12979 aarch64_parse_branch_protection (opts
->x_aarch64_branch_protection_string
,
12983 aarch64_override_options_internal (opts
);
12986 /* Implement TARGET_OPTION_PRINT. */
12989 aarch64_option_print (FILE *file
, int indent
, struct cl_target_option
*ptr
)
12991 const struct processor
*cpu
12992 = aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
12993 uint64_t isa_flags
= ptr
->x_aarch64_isa_flags
;
12994 const struct processor
*arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
12995 std::string extension
12996 = aarch64_get_extension_string_for_isa_flags (isa_flags
, arch
->flags
);
12998 fprintf (file
, "%*sselected tune = %s\n", indent
, "", cpu
->name
);
12999 fprintf (file
, "%*sselected arch = %s%s\n", indent
, "",
13000 arch
->name
, extension
.c_str ());
13003 static GTY(()) tree aarch64_previous_fndecl
;
13006 aarch64_reset_previous_fndecl (void)
13008 aarch64_previous_fndecl
= NULL
;
13011 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
13012 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
13013 make sure optab availability predicates are recomputed when necessary. */
13016 aarch64_save_restore_target_globals (tree new_tree
)
13018 if (TREE_TARGET_GLOBALS (new_tree
))
13019 restore_target_globals (TREE_TARGET_GLOBALS (new_tree
));
13020 else if (new_tree
== target_option_default_node
)
13021 restore_target_globals (&default_target_globals
);
13023 TREE_TARGET_GLOBALS (new_tree
) = save_target_globals_default_opts ();
13026 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
13027 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
13028 of the function, if such exists. This function may be called multiple
13029 times on a single function so use aarch64_previous_fndecl to avoid
13030 setting up identical state. */
13033 aarch64_set_current_function (tree fndecl
)
13035 if (!fndecl
|| fndecl
== aarch64_previous_fndecl
)
13038 tree old_tree
= (aarch64_previous_fndecl
13039 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl
)
13042 tree new_tree
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
13044 /* If current function has no attributes but the previous one did,
13045 use the default node. */
13046 if (!new_tree
&& old_tree
)
13047 new_tree
= target_option_default_node
;
13049 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
13050 the default have been handled by aarch64_save_restore_target_globals from
13051 aarch64_pragma_target_parse. */
13052 if (old_tree
== new_tree
)
13055 aarch64_previous_fndecl
= fndecl
;
13057 /* First set the target options. */
13058 cl_target_option_restore (&global_options
, TREE_TARGET_OPTION (new_tree
));
13060 aarch64_save_restore_target_globals (new_tree
);
/* Enum describing the various ways we can handle attributes.
   In many cases we can reuse the generic option handling machinery.  */

enum aarch64_attr_opt_type
{
  aarch64_attr_mask,	/* Attribute should set a bit in target_flags.  */
  aarch64_attr_bool,	/* Attribute sets or unsets a boolean variable.  */
  aarch64_attr_enum,	/* Attribute sets an enum variable.  */
  aarch64_attr_custom	/* Attribute requires a custom handling function.  */
};
13074 /* All the information needed to handle a target attribute.
13075 NAME is the name of the attribute.
13076 ATTR_TYPE specifies the type of behavior of the attribute as described
13077 in the definition of enum aarch64_attr_opt_type.
13078 ALLOW_NEG is true if the attribute supports a "no-" form.
13079 HANDLER is the function that takes the attribute string as an argument
13080 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
13081 OPT_NUM is the enum specifying the option that the attribute modifies.
13082 This is needed for attributes that mirror the behavior of a command-line
13083 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
13084 aarch64_attr_enum. */
13086 struct aarch64_attribute_info
13089 enum aarch64_attr_opt_type attr_type
;
13091 bool (*handler
) (const char *);
13092 enum opt_code opt_num
;
13095 /* Handle the ARCH_STR argument to the arch= target attribute. */
13098 aarch64_handle_attr_arch (const char *str
)
13100 const struct processor
*tmp_arch
= NULL
;
13101 std::string invalid_extension
;
13102 enum aarch64_parse_opt_result parse_res
13103 = aarch64_parse_arch (str
, &tmp_arch
, &aarch64_isa_flags
, &invalid_extension
);
13105 if (parse_res
== AARCH64_PARSE_OK
)
13107 gcc_assert (tmp_arch
);
13108 selected_arch
= tmp_arch
;
13109 explicit_arch
= selected_arch
->arch
;
13115 case AARCH64_PARSE_MISSING_ARG
:
13116 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
13118 case AARCH64_PARSE_INVALID_ARG
:
13119 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str
);
13120 aarch64_print_hint_for_arch (str
);
13122 case AARCH64_PARSE_INVALID_FEATURE
:
13123 error ("invalid feature modifier %s of value (\"%s\") in "
13124 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
13125 aarch64_print_hint_for_extensions (invalid_extension
);
13128 gcc_unreachable ();
13134 /* Handle the argument CPU_STR to the cpu= target attribute. */
13137 aarch64_handle_attr_cpu (const char *str
)
13139 const struct processor
*tmp_cpu
= NULL
;
13140 std::string invalid_extension
;
13141 enum aarch64_parse_opt_result parse_res
13142 = aarch64_parse_cpu (str
, &tmp_cpu
, &aarch64_isa_flags
, &invalid_extension
);
13144 if (parse_res
== AARCH64_PARSE_OK
)
13146 gcc_assert (tmp_cpu
);
13147 selected_tune
= tmp_cpu
;
13148 explicit_tune_core
= selected_tune
->ident
;
13150 selected_arch
= &all_architectures
[tmp_cpu
->arch
];
13151 explicit_arch
= selected_arch
->arch
;
13157 case AARCH64_PARSE_MISSING_ARG
:
13158 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
13160 case AARCH64_PARSE_INVALID_ARG
:
13161 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str
);
13162 aarch64_print_hint_for_core (str
);
13164 case AARCH64_PARSE_INVALID_FEATURE
:
13165 error ("invalid feature modifier %s of value (\"%s\") in "
13166 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
13167 aarch64_print_hint_for_extensions (invalid_extension
);
13170 gcc_unreachable ();
13176 /* Handle the argument STR to the branch-protection= attribute. */
13179 aarch64_handle_attr_branch_protection (const char* str
)
13181 char *err_str
= (char *) xmalloc (strlen (str
));
13182 enum aarch64_parse_opt_result res
= aarch64_parse_branch_protection (str
,
13184 bool success
= false;
13187 case AARCH64_PARSE_MISSING_ARG
:
13188 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
13191 case AARCH64_PARSE_INVALID_ARG
:
13192 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
13193 "=\")%> pragma or attribute", err_str
);
13195 case AARCH64_PARSE_OK
:
13197 /* Fall through. */
13198 case AARCH64_PARSE_INVALID_FEATURE
:
13201 gcc_unreachable ();
13207 /* Handle the argument STR to the tune= target attribute. */
13210 aarch64_handle_attr_tune (const char *str
)
13212 const struct processor
*tmp_tune
= NULL
;
13213 enum aarch64_parse_opt_result parse_res
13214 = aarch64_parse_tune (str
, &tmp_tune
);
13216 if (parse_res
== AARCH64_PARSE_OK
)
13218 gcc_assert (tmp_tune
);
13219 selected_tune
= tmp_tune
;
13220 explicit_tune_core
= selected_tune
->ident
;
13226 case AARCH64_PARSE_INVALID_ARG
:
13227 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str
);
13228 aarch64_print_hint_for_core (str
);
13231 gcc_unreachable ();
13237 /* Parse an architecture extensions target attribute string specified in STR.
13238 For example "+fp+nosimd". Show any errors if needed. Return TRUE
13239 if successful. Update aarch64_isa_flags to reflect the ISA features
13243 aarch64_handle_attr_isa_flags (char *str
)
13245 enum aarch64_parse_opt_result parse_res
;
13246 uint64_t isa_flags
= aarch64_isa_flags
;
13248 /* We allow "+nothing" in the beginning to clear out all architectural
13249 features if the user wants to handpick specific features. */
13250 if (strncmp ("+nothing", str
, 8) == 0)
13256 std::string invalid_extension
;
13257 parse_res
= aarch64_parse_extension (str
, &isa_flags
, &invalid_extension
);
13259 if (parse_res
== AARCH64_PARSE_OK
)
13261 aarch64_isa_flags
= isa_flags
;
13267 case AARCH64_PARSE_MISSING_ARG
:
13268 error ("missing value in %<target()%> pragma or attribute");
13271 case AARCH64_PARSE_INVALID_FEATURE
:
13272 error ("invalid feature modifier %s of value (\"%s\") in "
13273 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
13277 gcc_unreachable ();
13283 /* The target attributes that we support. On top of these we also support just
13284 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
13285 handled explicitly in aarch64_process_one_target_attr. */
13287 static const struct aarch64_attribute_info aarch64_attributes
[] =
13289 { "general-regs-only", aarch64_attr_mask
, false, NULL
,
13290 OPT_mgeneral_regs_only
},
13291 { "fix-cortex-a53-835769", aarch64_attr_bool
, true, NULL
,
13292 OPT_mfix_cortex_a53_835769
},
13293 { "fix-cortex-a53-843419", aarch64_attr_bool
, true, NULL
,
13294 OPT_mfix_cortex_a53_843419
},
13295 { "cmodel", aarch64_attr_enum
, false, NULL
, OPT_mcmodel_
},
13296 { "strict-align", aarch64_attr_mask
, true, NULL
, OPT_mstrict_align
},
13297 { "omit-leaf-frame-pointer", aarch64_attr_bool
, true, NULL
,
13298 OPT_momit_leaf_frame_pointer
},
13299 { "tls-dialect", aarch64_attr_enum
, false, NULL
, OPT_mtls_dialect_
},
13300 { "arch", aarch64_attr_custom
, false, aarch64_handle_attr_arch
,
13302 { "cpu", aarch64_attr_custom
, false, aarch64_handle_attr_cpu
, OPT_mcpu_
},
13303 { "tune", aarch64_attr_custom
, false, aarch64_handle_attr_tune
,
13305 { "branch-protection", aarch64_attr_custom
, false,
13306 aarch64_handle_attr_branch_protection
, OPT_mbranch_protection_
},
13307 { "sign-return-address", aarch64_attr_enum
, false, NULL
,
13308 OPT_msign_return_address_
},
13309 { NULL
, aarch64_attr_custom
, false, NULL
, OPT____
}
13312 /* Parse ARG_STR which contains the definition of one target attribute.
13313 Show appropriate errors if any or return true if the attribute is valid. */
13316 aarch64_process_one_target_attr (char *arg_str
)
13318 bool invert
= false;
13320 size_t len
= strlen (arg_str
);
13324 error ("malformed %<target()%> pragma or attribute");
13328 char *str_to_check
= (char *) alloca (len
+ 1);
13329 strcpy (str_to_check
, arg_str
);
13331 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
13332 It is easier to detect and handle it explicitly here rather than going
13333 through the machinery for the rest of the target attributes in this
13335 if (*str_to_check
== '+')
13336 return aarch64_handle_attr_isa_flags (str_to_check
);
13338 if (len
> 3 && strncmp (str_to_check
, "no-", 3) == 0)
13343 char *arg
= strchr (str_to_check
, '=');
13345 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
13346 and point ARG to "foo". */
13352 const struct aarch64_attribute_info
*p_attr
;
13353 bool found
= false;
13354 for (p_attr
= aarch64_attributes
; p_attr
->name
; p_attr
++)
13356 /* If the names don't match up, or the user has given an argument
13357 to an attribute that doesn't accept one, or didn't give an argument
13358 to an attribute that expects one, fail to match. */
13359 if (strcmp (str_to_check
, p_attr
->name
) != 0)
13363 bool attr_need_arg_p
= p_attr
->attr_type
== aarch64_attr_custom
13364 || p_attr
->attr_type
== aarch64_attr_enum
;
13366 if (attr_need_arg_p
^ (arg
!= NULL
))
13368 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check
);
13372 /* If the name matches but the attribute does not allow "no-" versions
13373 then we can't match. */
13374 if (invert
&& !p_attr
->allow_neg
)
13376 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check
);
13380 switch (p_attr
->attr_type
)
13382 /* Has a custom handler registered.
13383 For example, cpu=, arch=, tune=. */
13384 case aarch64_attr_custom
:
13385 gcc_assert (p_attr
->handler
);
13386 if (!p_attr
->handler (arg
))
13390 /* Either set or unset a boolean option. */
13391 case aarch64_attr_bool
:
13393 struct cl_decoded_option decoded
;
13395 generate_option (p_attr
->opt_num
, NULL
, !invert
,
13396 CL_TARGET
, &decoded
);
13397 aarch64_handle_option (&global_options
, &global_options_set
,
13398 &decoded
, input_location
);
13401 /* Set or unset a bit in the target_flags. aarch64_handle_option
13402 should know what mask to apply given the option number. */
13403 case aarch64_attr_mask
:
13405 struct cl_decoded_option decoded
;
13406 /* We only need to specify the option number.
13407 aarch64_handle_option will know which mask to apply. */
13408 decoded
.opt_index
= p_attr
->opt_num
;
13409 decoded
.value
= !invert
;
13410 aarch64_handle_option (&global_options
, &global_options_set
,
13411 &decoded
, input_location
);
13414 /* Use the option setting machinery to set an option to an enum. */
13415 case aarch64_attr_enum
:
13420 valid
= opt_enum_arg_to_value (p_attr
->opt_num
, arg
,
13421 &value
, CL_TARGET
);
13424 set_option (&global_options
, NULL
, p_attr
->opt_num
, value
,
13425 NULL
, DK_UNSPECIFIED
, input_location
,
13430 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check
, arg
);
13435 gcc_unreachable ();
13439 /* If we reached here we either have found an attribute and validated
13440 it or didn't match any. If we matched an attribute but its arguments
13441 were malformed we will have returned false already. */
/* Count how many times the character C appears in
   NULL-terminated string STR.  */

static unsigned int
num_occurences_in_str (char c, char *str)
{
  unsigned int res = 0;
  while (*str != '\0')
    {
      if (*str == c)
	res++;

      str++;
    }

  return res;
}
13463 /* Parse the tree in ARGS that contains the target attribute information
13464 and update the global target options space. */
13467 aarch64_process_target_attr (tree args
)
13469 if (TREE_CODE (args
) == TREE_LIST
)
13473 tree head
= TREE_VALUE (args
);
13476 if (!aarch64_process_target_attr (head
))
13479 args
= TREE_CHAIN (args
);
13485 if (TREE_CODE (args
) != STRING_CST
)
13487 error ("attribute %<target%> argument not a string");
13491 size_t len
= strlen (TREE_STRING_POINTER (args
));
13492 char *str_to_check
= (char *) alloca (len
+ 1);
13493 strcpy (str_to_check
, TREE_STRING_POINTER (args
));
13497 error ("malformed %<target()%> pragma or attribute");
13501 /* Used to catch empty spaces between commas i.e.
13502 attribute ((target ("attr1,,attr2"))). */
13503 unsigned int num_commas
= num_occurences_in_str (',', str_to_check
);
13505 /* Handle multiple target attributes separated by ','. */
13506 char *token
= strtok_r (str_to_check
, ",", &str_to_check
);
13508 unsigned int num_attrs
= 0;
13512 if (!aarch64_process_one_target_attr (token
))
13514 error ("pragma or attribute %<target(\"%s\")%> is not valid", token
);
13518 token
= strtok_r (NULL
, ",", &str_to_check
);
13521 if (num_attrs
!= num_commas
+ 1)
13523 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args
));
13530 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
13531 process attribute ((target ("..."))). */
13534 aarch64_option_valid_attribute_p (tree fndecl
, tree
, tree args
, int)
13536 struct cl_target_option cur_target
;
13539 tree new_target
, new_optimize
;
13540 tree existing_target
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
13542 /* If what we're processing is the current pragma string then the
13543 target option node is already stored in target_option_current_node
13544 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
13545 having to re-parse the string. This is especially useful to keep
13546 arm_neon.h compile times down since that header contains a lot
13547 of intrinsics enclosed in pragmas. */
13548 if (!existing_target
&& args
== current_target_pragma
)
13550 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = target_option_current_node
;
13553 tree func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
13555 old_optimize
= build_optimization_node (&global_options
);
13556 func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
13558 /* If the function changed the optimization levels as well as setting
13559 target options, start with the optimizations specified. */
13560 if (func_optimize
&& func_optimize
!= old_optimize
)
13561 cl_optimization_restore (&global_options
,
13562 TREE_OPTIMIZATION (func_optimize
));
13564 /* Save the current target options to restore at the end. */
13565 cl_target_option_save (&cur_target
, &global_options
);
13567 /* If fndecl already has some target attributes applied to it, unpack
13568 them so that we add this attribute on top of them, rather than
13569 overwriting them. */
13570 if (existing_target
)
13572 struct cl_target_option
*existing_options
13573 = TREE_TARGET_OPTION (existing_target
);
13575 if (existing_options
)
13576 cl_target_option_restore (&global_options
, existing_options
);
13579 cl_target_option_restore (&global_options
,
13580 TREE_TARGET_OPTION (target_option_current_node
));
13582 ret
= aarch64_process_target_attr (args
);
13584 /* Set up any additional state. */
13587 aarch64_override_options_internal (&global_options
);
13588 /* Initialize SIMD builtins if we haven't already.
13589 Set current_target_pragma to NULL for the duration so that
13590 the builtin initialization code doesn't try to tag the functions
13591 being built with the attributes specified by any current pragma, thus
13592 going into an infinite recursion. */
13595 tree saved_current_target_pragma
= current_target_pragma
;
13596 current_target_pragma
= NULL
;
13597 aarch64_init_simd_builtins ();
13598 current_target_pragma
= saved_current_target_pragma
;
13600 new_target
= build_target_option_node (&global_options
);
13605 new_optimize
= build_optimization_node (&global_options
);
13609 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = new_target
;
13611 if (old_optimize
!= new_optimize
)
13612 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
) = new_optimize
;
13615 cl_target_option_restore (&global_options
, &cur_target
);
13617 if (old_optimize
!= new_optimize
)
13618 cl_optimization_restore (&global_options
,
13619 TREE_OPTIMIZATION (old_optimize
));
/* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
   tri-bool options (yes, no, don't care) and the default value is
   DEF, determine whether to reject inlining.  */

static bool
aarch64_tribools_ok_for_inlining_p (int caller, int callee,
				    int dont_care, int def)
{
  /* If the callee doesn't care, always allow inlining.  */
  if (callee == dont_care)
    return true;

  /* If the caller doesn't care, always allow inlining.  */
  if (caller == dont_care)
    return true;

  /* Otherwise, allow inlining if either the callee and caller values
     agree, or if the callee is using the default value.  */
  return (callee == caller || callee == def);
}
13644 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
13645 to inline CALLEE into CALLER based on target-specific info.
13646 Make sure that the caller and callee have compatible architectural
13647 features. Then go through the other possible target attributes
13648 and see if they can block inlining. Try not to reject always_inline
13649 callees unless they are incompatible architecturally. */
13652 aarch64_can_inline_p (tree caller
, tree callee
)
13654 tree caller_tree
= DECL_FUNCTION_SPECIFIC_TARGET (caller
);
13655 tree callee_tree
= DECL_FUNCTION_SPECIFIC_TARGET (callee
);
13657 struct cl_target_option
*caller_opts
13658 = TREE_TARGET_OPTION (caller_tree
? caller_tree
13659 : target_option_default_node
);
13661 struct cl_target_option
*callee_opts
13662 = TREE_TARGET_OPTION (callee_tree
? callee_tree
13663 : target_option_default_node
);
13665 /* Callee's ISA flags should be a subset of the caller's. */
13666 if ((caller_opts
->x_aarch64_isa_flags
& callee_opts
->x_aarch64_isa_flags
)
13667 != callee_opts
->x_aarch64_isa_flags
)
13670 /* Allow non-strict aligned functions inlining into strict
13672 if ((TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)
13673 != TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
))
13674 && !(!TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
)
13675 && TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)))
13678 bool always_inline
= lookup_attribute ("always_inline",
13679 DECL_ATTRIBUTES (callee
));
13681 /* If the architectural features match up and the callee is always_inline
13682 then the other attributes don't matter. */
13686 if (caller_opts
->x_aarch64_cmodel_var
13687 != callee_opts
->x_aarch64_cmodel_var
)
13690 if (caller_opts
->x_aarch64_tls_dialect
13691 != callee_opts
->x_aarch64_tls_dialect
)
13694 /* Honour explicit requests to workaround errata. */
13695 if (!aarch64_tribools_ok_for_inlining_p (
13696 caller_opts
->x_aarch64_fix_a53_err835769
,
13697 callee_opts
->x_aarch64_fix_a53_err835769
,
13698 2, TARGET_FIX_ERR_A53_835769_DEFAULT
))
13701 if (!aarch64_tribools_ok_for_inlining_p (
13702 caller_opts
->x_aarch64_fix_a53_err843419
,
13703 callee_opts
->x_aarch64_fix_a53_err843419
,
13704 2, TARGET_FIX_ERR_A53_843419
))
13707 /* If the user explicitly specified -momit-leaf-frame-pointer for the
13708 caller and calle and they don't match up, reject inlining. */
13709 if (!aarch64_tribools_ok_for_inlining_p (
13710 caller_opts
->x_flag_omit_leaf_frame_pointer
,
13711 callee_opts
->x_flag_omit_leaf_frame_pointer
,
13715 /* If the callee has specific tuning overrides, respect them. */
13716 if (callee_opts
->x_aarch64_override_tune_string
!= NULL
13717 && caller_opts
->x_aarch64_override_tune_string
== NULL
)
13720 /* If the user specified tuning override strings for the
13721 caller and callee and they don't match up, reject inlining.
13722 We just do a string compare here, we don't analyze the meaning
13723 of the string, as it would be too costly for little gain. */
13724 if (callee_opts
->x_aarch64_override_tune_string
13725 && caller_opts
->x_aarch64_override_tune_string
13726 && (strcmp (callee_opts
->x_aarch64_override_tune_string
,
13727 caller_opts
->x_aarch64_override_tune_string
) != 0))
13733 /* Return true if SYMBOL_REF X binds locally. */
13736 aarch64_symbol_binds_local_p (const_rtx x
)
13738 return (SYMBOL_REF_DECL (x
)
13739 ? targetm
.binds_local_p (SYMBOL_REF_DECL (x
))
13740 : SYMBOL_REF_LOCAL_P (x
));
13743 /* Return true if SYMBOL_REF X is thread local */
13745 aarch64_tls_symbol_p (rtx x
)
13747 if (! TARGET_HAVE_TLS
)
13750 if (GET_CODE (x
) != SYMBOL_REF
)
13753 return SYMBOL_REF_TLS_MODEL (x
) != 0;
13756 /* Classify a TLS symbol into one of the TLS kinds. */
13757 enum aarch64_symbol_type
13758 aarch64_classify_tls_symbol (rtx x
)
13760 enum tls_model tls_kind
= tls_symbolic_operand_type (x
);
13764 case TLS_MODEL_GLOBAL_DYNAMIC
:
13765 case TLS_MODEL_LOCAL_DYNAMIC
:
13766 return TARGET_TLS_DESC
? SYMBOL_SMALL_TLSDESC
: SYMBOL_SMALL_TLSGD
;
13768 case TLS_MODEL_INITIAL_EXEC
:
13769 switch (aarch64_cmodel
)
13771 case AARCH64_CMODEL_TINY
:
13772 case AARCH64_CMODEL_TINY_PIC
:
13773 return SYMBOL_TINY_TLSIE
;
13775 return SYMBOL_SMALL_TLSIE
;
13778 case TLS_MODEL_LOCAL_EXEC
:
13779 if (aarch64_tls_size
== 12)
13780 return SYMBOL_TLSLE12
;
13781 else if (aarch64_tls_size
== 24)
13782 return SYMBOL_TLSLE24
;
13783 else if (aarch64_tls_size
== 32)
13784 return SYMBOL_TLSLE32
;
13785 else if (aarch64_tls_size
== 48)
13786 return SYMBOL_TLSLE48
;
13788 gcc_unreachable ();
13790 case TLS_MODEL_EMULATED
:
13791 case TLS_MODEL_NONE
:
13792 return SYMBOL_FORCE_TO_MEM
;
13795 gcc_unreachable ();
13799 /* Return the correct method for accessing X + OFFSET, where X is either
13800 a SYMBOL_REF or LABEL_REF. */
13802 enum aarch64_symbol_type
13803 aarch64_classify_symbol (rtx x
, HOST_WIDE_INT offset
)
13805 if (GET_CODE (x
) == LABEL_REF
)
13807 switch (aarch64_cmodel
)
13809 case AARCH64_CMODEL_LARGE
:
13810 return SYMBOL_FORCE_TO_MEM
;
13812 case AARCH64_CMODEL_TINY_PIC
:
13813 case AARCH64_CMODEL_TINY
:
13814 return SYMBOL_TINY_ABSOLUTE
;
13816 case AARCH64_CMODEL_SMALL_SPIC
:
13817 case AARCH64_CMODEL_SMALL_PIC
:
13818 case AARCH64_CMODEL_SMALL
:
13819 return SYMBOL_SMALL_ABSOLUTE
;
13822 gcc_unreachable ();
13826 if (GET_CODE (x
) == SYMBOL_REF
)
13828 if (aarch64_tls_symbol_p (x
))
13829 return aarch64_classify_tls_symbol (x
);
13831 switch (aarch64_cmodel
)
13833 case AARCH64_CMODEL_TINY
:
13834 /* When we retrieve symbol + offset address, we have to make sure
13835 the offset does not cause overflow of the final address. But
13836 we have no way of knowing the address of symbol at compile time
13837 so we can't accurately say if the distance between the PC and
13838 symbol + offset is outside the addressible range of +/-1M in the
13839 TINY code model. So we rely on images not being greater than
13840 1M and cap the offset at 1M and anything beyond 1M will have to
13841 be loaded using an alternative mechanism. Furthermore if the
13842 symbol is a weak reference to something that isn't known to
13843 resolve to a symbol in this module, then force to memory. */
13844 if ((SYMBOL_REF_WEAK (x
)
13845 && !aarch64_symbol_binds_local_p (x
))
13846 || !IN_RANGE (offset
, -1048575, 1048575))
13847 return SYMBOL_FORCE_TO_MEM
;
13848 return SYMBOL_TINY_ABSOLUTE
;
13850 case AARCH64_CMODEL_SMALL
:
13851 /* Same reasoning as the tiny code model, but the offset cap here is
13853 if ((SYMBOL_REF_WEAK (x
)
13854 && !aarch64_symbol_binds_local_p (x
))
13855 || !IN_RANGE (offset
, HOST_WIDE_INT_C (-4294967263),
13856 HOST_WIDE_INT_C (4294967264)))
13857 return SYMBOL_FORCE_TO_MEM
;
13858 return SYMBOL_SMALL_ABSOLUTE
;
13860 case AARCH64_CMODEL_TINY_PIC
:
13861 if (!aarch64_symbol_binds_local_p (x
))
13862 return SYMBOL_TINY_GOT
;
13863 return SYMBOL_TINY_ABSOLUTE
;
13865 case AARCH64_CMODEL_SMALL_SPIC
:
13866 case AARCH64_CMODEL_SMALL_PIC
:
13867 if (!aarch64_symbol_binds_local_p (x
))
13868 return (aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
13869 ? SYMBOL_SMALL_GOT_28K
: SYMBOL_SMALL_GOT_4G
);
13870 return SYMBOL_SMALL_ABSOLUTE
;
13872 case AARCH64_CMODEL_LARGE
:
13873 /* This is alright even in PIC code as the constant
13874 pool reference is always PC relative and within
13875 the same translation unit. */
13876 if (!aarch64_pcrelative_literal_loads
&& CONSTANT_POOL_ADDRESS_P (x
))
13877 return SYMBOL_SMALL_ABSOLUTE
;
13879 return SYMBOL_FORCE_TO_MEM
;
13882 gcc_unreachable ();
13886 /* By default push everything into the constant pool. */
13887 return SYMBOL_FORCE_TO_MEM
;
13891 aarch64_constant_address_p (rtx x
)
13893 return (CONSTANT_P (x
) && memory_address_p (DImode
, x
));
13897 aarch64_legitimate_pic_operand_p (rtx x
)
13899 if (GET_CODE (x
) == SYMBOL_REF
13900 || (GET_CODE (x
) == CONST
13901 && GET_CODE (XEXP (x
, 0)) == PLUS
13902 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
))
13908 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
13909 that should be rematerialized rather than spilled. */
13912 aarch64_legitimate_constant_p (machine_mode mode
, rtx x
)
13914 /* Support CSE and rematerialization of common constants. */
13915 if (CONST_INT_P (x
)
13916 || (CONST_DOUBLE_P (x
) && GET_MODE_CLASS (mode
) == MODE_FLOAT
)
13917 || GET_CODE (x
) == CONST_VECTOR
)
13920 /* Do not allow vector struct mode constants for Advanced SIMD.
13921 We could support 0 and -1 easily, but they need support in
13922 aarch64-simd.md. */
13923 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
13924 if (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
13927 /* Only accept variable-length vector constants if they can be
13930 ??? It would be possible to handle rematerialization of other
13931 constants via secondary reloads. */
13932 if (vec_flags
& VEC_ANY_SVE
)
13933 return aarch64_simd_valid_immediate (x
, NULL
);
13935 if (GET_CODE (x
) == HIGH
)
13938 /* Accept polynomial constants that can be calculated by using the
13939 destination of a move as the sole temporary. Constants that
13940 require a second temporary cannot be rematerialized (they can't be
13941 forced to memory and also aren't legitimate constants). */
13943 if (poly_int_rtx_p (x
, &offset
))
13944 return aarch64_offset_temporaries (false, offset
) <= 1;
13946 /* If an offset is being added to something else, we need to allow the
13947 base to be moved into the destination register, meaning that there
13948 are no free temporaries for the offset. */
13949 x
= strip_offset (x
, &offset
);
13950 if (!offset
.is_constant () && aarch64_offset_temporaries (true, offset
) > 0)
13953 /* Do not allow const (plus (anchor_symbol, const_int)). */
13954 if (maybe_ne (offset
, 0) && SYMBOL_REF_P (x
) && SYMBOL_REF_ANCHOR_P (x
))
13957 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
13958 so spilling them is better than rematerialization. */
13959 if (SYMBOL_REF_P (x
) && !SYMBOL_REF_TLS_MODEL (x
))
13962 /* Label references are always constant. */
13963 if (GET_CODE (x
) == LABEL_REF
)
13970 aarch64_load_tp (rtx target
)
13973 || GET_MODE (target
) != Pmode
13974 || !register_operand (target
, Pmode
))
13975 target
= gen_reg_rtx (Pmode
);
13977 /* Can return in any reg. */
13978 emit_insn (gen_aarch64_load_tp_hard (target
));
13982 /* On AAPCS systems, this is the "struct __va_list". */
13983 static GTY(()) tree va_list_type
;
13985 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
13986 Return the type to use as __builtin_va_list.
13988 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
14000 aarch64_build_builtin_va_list (void)
14003 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
14005 /* Create the type. */
14006 va_list_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
14007 /* Give it the required name. */
14008 va_list_name
= build_decl (BUILTINS_LOCATION
,
14010 get_identifier ("__va_list"),
14012 DECL_ARTIFICIAL (va_list_name
) = 1;
14013 TYPE_NAME (va_list_type
) = va_list_name
;
14014 TYPE_STUB_DECL (va_list_type
) = va_list_name
;
14016 /* Create the fields. */
14017 f_stack
= build_decl (BUILTINS_LOCATION
,
14018 FIELD_DECL
, get_identifier ("__stack"),
14020 f_grtop
= build_decl (BUILTINS_LOCATION
,
14021 FIELD_DECL
, get_identifier ("__gr_top"),
14023 f_vrtop
= build_decl (BUILTINS_LOCATION
,
14024 FIELD_DECL
, get_identifier ("__vr_top"),
14026 f_groff
= build_decl (BUILTINS_LOCATION
,
14027 FIELD_DECL
, get_identifier ("__gr_offs"),
14028 integer_type_node
);
14029 f_vroff
= build_decl (BUILTINS_LOCATION
,
14030 FIELD_DECL
, get_identifier ("__vr_offs"),
14031 integer_type_node
);
14033 /* Tell tree-stdarg pass about our internal offset fields.
14034 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
14035 purpose to identify whether the code is updating va_list internal
14036 offset fields through irregular way. */
14037 va_list_gpr_counter_field
= f_groff
;
14038 va_list_fpr_counter_field
= f_vroff
;
14040 DECL_ARTIFICIAL (f_stack
) = 1;
14041 DECL_ARTIFICIAL (f_grtop
) = 1;
14042 DECL_ARTIFICIAL (f_vrtop
) = 1;
14043 DECL_ARTIFICIAL (f_groff
) = 1;
14044 DECL_ARTIFICIAL (f_vroff
) = 1;
14046 DECL_FIELD_CONTEXT (f_stack
) = va_list_type
;
14047 DECL_FIELD_CONTEXT (f_grtop
) = va_list_type
;
14048 DECL_FIELD_CONTEXT (f_vrtop
) = va_list_type
;
14049 DECL_FIELD_CONTEXT (f_groff
) = va_list_type
;
14050 DECL_FIELD_CONTEXT (f_vroff
) = va_list_type
;
14052 TYPE_FIELDS (va_list_type
) = f_stack
;
14053 DECL_CHAIN (f_stack
) = f_grtop
;
14054 DECL_CHAIN (f_grtop
) = f_vrtop
;
14055 DECL_CHAIN (f_vrtop
) = f_groff
;
14056 DECL_CHAIN (f_groff
) = f_vroff
;
14058 /* Compute its layout. */
14059 layout_type (va_list_type
);
14061 return va_list_type
;
14064 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
14066 aarch64_expand_builtin_va_start (tree valist
, rtx nextarg ATTRIBUTE_UNUSED
)
14068 const CUMULATIVE_ARGS
*cum
;
14069 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
14070 tree stack
, grtop
, vrtop
, groff
, vroff
;
14072 int gr_save_area_size
= cfun
->va_list_gpr_size
;
14073 int vr_save_area_size
= cfun
->va_list_fpr_size
;
14076 cum
= &crtl
->args
.info
;
14077 if (cfun
->va_list_gpr_size
)
14078 gr_save_area_size
= MIN ((NUM_ARG_REGS
- cum
->aapcs_ncrn
) * UNITS_PER_WORD
,
14079 cfun
->va_list_gpr_size
);
14080 if (cfun
->va_list_fpr_size
)
14081 vr_save_area_size
= MIN ((NUM_FP_ARG_REGS
- cum
->aapcs_nvrn
)
14082 * UNITS_PER_VREG
, cfun
->va_list_fpr_size
);
14086 gcc_assert (cum
->aapcs_nvrn
== 0);
14087 vr_save_area_size
= 0;
14090 f_stack
= TYPE_FIELDS (va_list_type_node
);
14091 f_grtop
= DECL_CHAIN (f_stack
);
14092 f_vrtop
= DECL_CHAIN (f_grtop
);
14093 f_groff
= DECL_CHAIN (f_vrtop
);
14094 f_vroff
= DECL_CHAIN (f_groff
);
14096 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), valist
, f_stack
,
14098 grtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
), valist
, f_grtop
,
14100 vrtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
), valist
, f_vrtop
,
14102 groff
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
), valist
, f_groff
,
14104 vroff
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
), valist
, f_vroff
,
14107 /* Emit code to initialize STACK, which points to the next varargs stack
14108 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
14109 by named arguments. STACK is 8-byte aligned. */
14110 t
= make_tree (TREE_TYPE (stack
), virtual_incoming_args_rtx
);
14111 if (cum
->aapcs_stack_size
> 0)
14112 t
= fold_build_pointer_plus_hwi (t
, cum
->aapcs_stack_size
* UNITS_PER_WORD
);
14113 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), stack
, t
);
14114 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
14116 /* Emit code to initialize GRTOP, the top of the GR save area.
14117 virtual_incoming_args_rtx should have been 16 byte aligned. */
14118 t
= make_tree (TREE_TYPE (grtop
), virtual_incoming_args_rtx
);
14119 t
= build2 (MODIFY_EXPR
, TREE_TYPE (grtop
), grtop
, t
);
14120 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
14122 /* Emit code to initialize VRTOP, the top of the VR save area.
14123 This address is gr_save_area_bytes below GRTOP, rounded
14124 down to the next 16-byte boundary. */
14125 t
= make_tree (TREE_TYPE (vrtop
), virtual_incoming_args_rtx
);
14126 vr_offset
= ROUND_UP (gr_save_area_size
,
14127 STACK_BOUNDARY
/ BITS_PER_UNIT
);
14130 t
= fold_build_pointer_plus_hwi (t
, -vr_offset
);
14131 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vrtop
), vrtop
, t
);
14132 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
14134 /* Emit code to initialize GROFF, the offset from GRTOP of the
14135 next GPR argument. */
14136 t
= build2 (MODIFY_EXPR
, TREE_TYPE (groff
), groff
,
14137 build_int_cst (TREE_TYPE (groff
), -gr_save_area_size
));
14138 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
14140 /* Likewise emit code to initialize VROFF, the offset from FTOP
14141 of the next VR argument. */
14142 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vroff
), vroff
,
14143 build_int_cst (TREE_TYPE (vroff
), -vr_save_area_size
));
14144 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
14147 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
14150 aarch64_gimplify_va_arg_expr (tree valist
, tree type
, gimple_seq
*pre_p
,
14151 gimple_seq
*post_p ATTRIBUTE_UNUSED
)
14155 bool is_ha
; /* is HFA or HVA. */
14156 bool dw_align
; /* double-word align. */
14157 machine_mode ag_mode
= VOIDmode
;
14161 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
14162 tree stack
, f_top
, f_off
, off
, arg
, roundup
, on_stack
;
14163 HOST_WIDE_INT size
, rsize
, adjust
, align
;
14164 tree t
, u
, cond1
, cond2
;
14166 indirect_p
= pass_by_reference (NULL
, TYPE_MODE (type
), type
, false);
14168 type
= build_pointer_type (type
);
14170 mode
= TYPE_MODE (type
);
14172 f_stack
= TYPE_FIELDS (va_list_type_node
);
14173 f_grtop
= DECL_CHAIN (f_stack
);
14174 f_vrtop
= DECL_CHAIN (f_grtop
);
14175 f_groff
= DECL_CHAIN (f_vrtop
);
14176 f_vroff
= DECL_CHAIN (f_groff
);
14178 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), unshare_expr (valist
),
14179 f_stack
, NULL_TREE
);
14180 size
= int_size_in_bytes (type
);
14184 = aarch64_function_arg_alignment (mode
, type
, &abi_break
) / BITS_PER_UNIT
;
14188 if (aarch64_vfp_is_call_or_return_candidate (mode
,
14194 /* No frontends can create types with variable-sized modes, so we
14195 shouldn't be asked to pass or return them. */
14196 unsigned int ag_size
= GET_MODE_SIZE (ag_mode
).to_constant ();
14198 /* TYPE passed in fp/simd registers. */
14200 aarch64_err_no_fpadvsimd (mode
);
14202 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
),
14203 unshare_expr (valist
), f_vrtop
, NULL_TREE
);
14204 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
),
14205 unshare_expr (valist
), f_vroff
, NULL_TREE
);
14207 rsize
= nregs
* UNITS_PER_VREG
;
14211 if (BYTES_BIG_ENDIAN
&& ag_size
< UNITS_PER_VREG
)
14212 adjust
= UNITS_PER_VREG
- ag_size
;
14214 else if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
14215 && size
< UNITS_PER_VREG
)
14217 adjust
= UNITS_PER_VREG
- size
;
14222 /* TYPE passed in general registers. */
14223 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
),
14224 unshare_expr (valist
), f_grtop
, NULL_TREE
);
14225 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
),
14226 unshare_expr (valist
), f_groff
, NULL_TREE
);
14227 rsize
= ROUND_UP (size
, UNITS_PER_WORD
);
14228 nregs
= rsize
/ UNITS_PER_WORD
;
14232 if (abi_break
&& warn_psabi
)
14233 inform (input_location
, "parameter passing for argument of type "
14234 "%qT changed in GCC 9.1", type
);
14238 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
14239 && size
< UNITS_PER_WORD
)
14241 adjust
= UNITS_PER_WORD
- size
;
14245 /* Get a local temporary for the field value. */
14246 off
= get_initialized_tmp_var (f_off
, pre_p
, NULL
);
14248 /* Emit code to branch if off >= 0. */
14249 t
= build2 (GE_EXPR
, boolean_type_node
, off
,
14250 build_int_cst (TREE_TYPE (off
), 0));
14251 cond1
= build3 (COND_EXPR
, ptr_type_node
, t
, NULL_TREE
, NULL_TREE
);
14255 /* Emit: offs = (offs + 15) & -16. */
14256 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
14257 build_int_cst (TREE_TYPE (off
), 15));
14258 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (off
), t
,
14259 build_int_cst (TREE_TYPE (off
), -16));
14260 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (off
), off
, t
);
14265 /* Update ap.__[g|v]r_offs */
14266 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
14267 build_int_cst (TREE_TYPE (off
), rsize
));
14268 t
= build2 (MODIFY_EXPR
, TREE_TYPE (f_off
), unshare_expr (f_off
), t
);
14272 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
14274 /* [cond2] if (ap.__[g|v]r_offs > 0) */
14275 u
= build2 (GT_EXPR
, boolean_type_node
, unshare_expr (f_off
),
14276 build_int_cst (TREE_TYPE (f_off
), 0));
14277 cond2
= build3 (COND_EXPR
, ptr_type_node
, u
, NULL_TREE
, NULL_TREE
);
14279 /* String up: make sure the assignment happens before the use. */
14280 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (cond2
), t
, cond2
);
14281 COND_EXPR_ELSE (cond1
) = t
;
14283 /* Prepare the trees handling the argument that is passed on the stack;
14284 the top level node will store in ON_STACK. */
14285 arg
= get_initialized_tmp_var (stack
, pre_p
, NULL
);
14288 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
14289 t
= fold_build_pointer_plus_hwi (arg
, 15);
14290 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
14291 build_int_cst (TREE_TYPE (t
), -16));
14292 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (arg
), arg
, t
);
14296 /* Advance ap.__stack */
14297 t
= fold_build_pointer_plus_hwi (arg
, size
+ 7);
14298 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
14299 build_int_cst (TREE_TYPE (t
), -8));
14300 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), unshare_expr (stack
), t
);
14301 /* String up roundup and advance. */
14303 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
14304 /* String up with arg */
14305 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), t
, arg
);
14306 /* Big-endianness related address adjustment. */
14307 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
14308 && size
< UNITS_PER_WORD
)
14310 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (arg
), arg
,
14311 size_int (UNITS_PER_WORD
- size
));
14312 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), on_stack
, t
);
14315 COND_EXPR_THEN (cond1
) = unshare_expr (on_stack
);
14316 COND_EXPR_THEN (cond2
) = unshare_expr (on_stack
);
14318 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
14321 t
= build2 (PREINCREMENT_EXPR
, TREE_TYPE (off
), off
,
14322 build_int_cst (TREE_TYPE (off
), adjust
));
14324 t
= fold_convert (sizetype
, t
);
14325 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (f_top
), f_top
, t
);
14329 /* type ha; // treat as "struct {ftype field[n];}"
14330 ... [computing offs]
14331 for (i = 0; i <nregs; ++i, offs += 16)
14332 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
14335 tree tmp_ha
, field_t
, field_ptr_t
;
14337 /* Declare a local variable. */
14338 tmp_ha
= create_tmp_var_raw (type
, "ha");
14339 gimple_add_tmp_var (tmp_ha
);
14341 /* Establish the base type. */
14345 field_t
= float_type_node
;
14346 field_ptr_t
= float_ptr_type_node
;
14349 field_t
= double_type_node
;
14350 field_ptr_t
= double_ptr_type_node
;
14353 field_t
= long_double_type_node
;
14354 field_ptr_t
= long_double_ptr_type_node
;
14357 field_t
= aarch64_fp16_type_node
;
14358 field_ptr_t
= aarch64_fp16_ptr_type_node
;
14363 tree innertype
= make_signed_type (GET_MODE_PRECISION (SImode
));
14364 field_t
= build_vector_type_for_mode (innertype
, ag_mode
);
14365 field_ptr_t
= build_pointer_type (field_t
);
14372 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
14373 tmp_ha
= build1 (ADDR_EXPR
, field_ptr_t
, tmp_ha
);
14375 t
= fold_convert (field_ptr_t
, addr
);
14376 t
= build2 (MODIFY_EXPR
, field_t
,
14377 build1 (INDIRECT_REF
, field_t
, tmp_ha
),
14378 build1 (INDIRECT_REF
, field_t
, t
));
14380 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
14381 for (i
= 1; i
< nregs
; ++i
)
14383 addr
= fold_build_pointer_plus_hwi (addr
, UNITS_PER_VREG
);
14384 u
= fold_convert (field_ptr_t
, addr
);
14385 u
= build2 (MODIFY_EXPR
, field_t
,
14386 build2 (MEM_REF
, field_t
, tmp_ha
,
14387 build_int_cst (field_ptr_t
,
14389 int_size_in_bytes (field_t
)))),
14390 build1 (INDIRECT_REF
, field_t
, u
));
14391 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), t
, u
);
14394 u
= fold_convert (TREE_TYPE (f_top
), tmp_ha
);
14395 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (f_top
), t
, u
);
14398 COND_EXPR_ELSE (cond2
) = t
;
14399 addr
= fold_convert (build_pointer_type (type
), cond1
);
14400 addr
= build_va_arg_indirect_ref (addr
);
14403 addr
= build_va_arg_indirect_ref (addr
);
14408 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
14411 aarch64_setup_incoming_varargs (cumulative_args_t cum_v
, machine_mode mode
,
14412 tree type
, int *pretend_size ATTRIBUTE_UNUSED
,
14415 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
14416 CUMULATIVE_ARGS local_cum
;
14417 int gr_saved
= cfun
->va_list_gpr_size
;
14418 int vr_saved
= cfun
->va_list_fpr_size
;
14420 /* The caller has advanced CUM up to, but not beyond, the last named
14421 argument. Advance a local copy of CUM past the last "real" named
14422 argument, to find out how many registers are left over. */
14424 aarch64_function_arg_advance (pack_cumulative_args(&local_cum
), mode
, type
, true);
14426 /* Found out how many registers we need to save.
14427 Honor tree-stdvar analysis results. */
14428 if (cfun
->va_list_gpr_size
)
14429 gr_saved
= MIN (NUM_ARG_REGS
- local_cum
.aapcs_ncrn
,
14430 cfun
->va_list_gpr_size
/ UNITS_PER_WORD
);
14431 if (cfun
->va_list_fpr_size
)
14432 vr_saved
= MIN (NUM_FP_ARG_REGS
- local_cum
.aapcs_nvrn
,
14433 cfun
->va_list_fpr_size
/ UNITS_PER_VREG
);
14437 gcc_assert (local_cum
.aapcs_nvrn
== 0);
14447 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
14448 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
,
14449 - gr_saved
* UNITS_PER_WORD
);
14450 mem
= gen_frame_mem (BLKmode
, ptr
);
14451 set_mem_alias_set (mem
, get_varargs_alias_set ());
14453 move_block_from_reg (local_cum
.aapcs_ncrn
+ R0_REGNUM
,
14458 /* We can't use move_block_from_reg, because it will use
14459 the wrong mode, storing D regs only. */
14460 machine_mode mode
= TImode
;
14461 int off
, i
, vr_start
;
14463 /* Set OFF to the offset from virtual_incoming_args_rtx of
14464 the first vector register. The VR save area lies below
14465 the GR one, and is aligned to 16 bytes. */
14466 off
= -ROUND_UP (gr_saved
* UNITS_PER_WORD
,
14467 STACK_BOUNDARY
/ BITS_PER_UNIT
);
14468 off
-= vr_saved
* UNITS_PER_VREG
;
14470 vr_start
= V0_REGNUM
+ local_cum
.aapcs_nvrn
;
14471 for (i
= 0; i
< vr_saved
; ++i
)
14475 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
, off
);
14476 mem
= gen_frame_mem (mode
, ptr
);
14477 set_mem_alias_set (mem
, get_varargs_alias_set ());
14478 aarch64_emit_move (mem
, gen_rtx_REG (mode
, vr_start
+ i
));
14479 off
+= UNITS_PER_VREG
;
14484 /* We don't save the size into *PRETEND_SIZE because we want to avoid
14485 any complication of having crtl->args.pretend_args_size changed. */
14486 cfun
->machine
->frame
.saved_varargs_size
14487 = (ROUND_UP (gr_saved
* UNITS_PER_WORD
,
14488 STACK_BOUNDARY
/ BITS_PER_UNIT
)
14489 + vr_saved
* UNITS_PER_VREG
);
14493 aarch64_conditional_register_usage (void)
14498 for (i
= V0_REGNUM
; i
<= V31_REGNUM
; i
++)
14501 call_used_regs
[i
] = 1;
14505 for (i
= P0_REGNUM
; i
<= P15_REGNUM
; i
++)
14508 call_used_regs
[i
] = 1;
14511 /* When tracking speculation, we need a couple of call-clobbered registers
14512 to track the speculation state. It would be nice to just use
14513 IP0 and IP1, but currently there are numerous places that just
14514 assume these registers are free for other uses (eg pointer
14515 authentication). */
14516 if (aarch64_track_speculation
)
14518 fixed_regs
[SPECULATION_TRACKER_REGNUM
] = 1;
14519 call_used_regs
[SPECULATION_TRACKER_REGNUM
] = 1;
14520 fixed_regs
[SPECULATION_SCRATCH_REGNUM
] = 1;
14521 call_used_regs
[SPECULATION_SCRATCH_REGNUM
] = 1;
14525 /* Walk down the type tree of TYPE counting consecutive base elements.
14526 If *MODEP is VOIDmode, then set it to the first valid floating point
14527 type. If a non-floating point type is found, or if a floating point
14528 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
14529 otherwise return the count in the sub-tree. */
14531 aapcs_vfp_sub_candidate (const_tree type
, machine_mode
*modep
)
14534 HOST_WIDE_INT size
;
14536 switch (TREE_CODE (type
))
14539 mode
= TYPE_MODE (type
);
14540 if (mode
!= DFmode
&& mode
!= SFmode
14541 && mode
!= TFmode
&& mode
!= HFmode
)
14544 if (*modep
== VOIDmode
)
14547 if (*modep
== mode
)
14553 mode
= TYPE_MODE (TREE_TYPE (type
));
14554 if (mode
!= DFmode
&& mode
!= SFmode
14555 && mode
!= TFmode
&& mode
!= HFmode
)
14558 if (*modep
== VOIDmode
)
14561 if (*modep
== mode
)
14567 /* Use V2SImode and V4SImode as representatives of all 64-bit
14568 and 128-bit vector types. */
14569 size
= int_size_in_bytes (type
);
14582 if (*modep
== VOIDmode
)
14585 /* Vector modes are considered to be opaque: two vectors are
14586 equivalent for the purposes of being homogeneous aggregates
14587 if they are the same size. */
14588 if (*modep
== mode
)
14596 tree index
= TYPE_DOMAIN (type
);
14598 /* Can't handle incomplete types nor sizes that are not
14600 if (!COMPLETE_TYPE_P (type
)
14601 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
14604 count
= aapcs_vfp_sub_candidate (TREE_TYPE (type
), modep
);
14607 || !TYPE_MAX_VALUE (index
)
14608 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index
))
14609 || !TYPE_MIN_VALUE (index
)
14610 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index
))
14614 count
*= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index
))
14615 - tree_to_uhwi (TYPE_MIN_VALUE (index
)));
14617 /* There must be no padding. */
14618 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
14619 count
* GET_MODE_BITSIZE (*modep
)))
14631 /* Can't handle incomplete types nor sizes that are not
14633 if (!COMPLETE_TYPE_P (type
)
14634 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
14637 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
14639 if (TREE_CODE (field
) != FIELD_DECL
)
14642 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
14645 count
+= sub_count
;
14648 /* There must be no padding. */
14649 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
14650 count
* GET_MODE_BITSIZE (*modep
)))
14657 case QUAL_UNION_TYPE
:
14659 /* These aren't very interesting except in a degenerate case. */
14664 /* Can't handle incomplete types nor sizes that are not
14666 if (!COMPLETE_TYPE_P (type
)
14667 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
14670 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
14672 if (TREE_CODE (field
) != FIELD_DECL
)
14675 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
14678 count
= count
> sub_count
? count
: sub_count
;
14681 /* There must be no padding. */
14682 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
14683 count
* GET_MODE_BITSIZE (*modep
)))
14696 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
14697 type as described in AAPCS64 \S 4.1.2.
14699 See the comment above aarch64_composite_type_p for the notes on MODE. */
14702 aarch64_short_vector_p (const_tree type
,
14705 poly_int64 size
= -1;
14707 if (type
&& TREE_CODE (type
) == VECTOR_TYPE
)
14708 size
= int_size_in_bytes (type
);
14709 else if (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
14710 || GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
)
14711 size
= GET_MODE_SIZE (mode
);
14713 return known_eq (size
, 8) || known_eq (size
, 16);
14716 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
14717 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
14718 array types. The C99 floating-point complex types are also considered
14719 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
14720 types, which are GCC extensions and out of the scope of AAPCS64, are
14721 treated as composite types here as well.
14723 Note that MODE itself is not sufficient in determining whether a type
14724 is such a composite type or not. This is because
14725 stor-layout.c:compute_record_mode may have already changed the MODE
14726 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
14727 structure with only one field may have its MODE set to the mode of the
14728 field. Also an integer mode whose size matches the size of the
14729 RECORD_TYPE type may be used to substitute the original mode
14730 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
14731 solely relied on. */
14734 aarch64_composite_type_p (const_tree type
,
14737 if (aarch64_short_vector_p (type
, mode
))
14740 if (type
&& (AGGREGATE_TYPE_P (type
) || TREE_CODE (type
) == COMPLEX_TYPE
))
14743 if (mode
== BLKmode
14744 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
14745 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_INT
)
14751 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
14752 shall be passed or returned in simd/fp register(s) (providing these
14753 parameter passing registers are available).
14755 Upon successful return, *COUNT returns the number of needed registers,
14756 *BASE_MODE returns the mode of the individual register and when IS_HAF
14757 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
14758 floating-point aggregate or a homogeneous short-vector aggregate. */
14761 aarch64_vfp_is_call_or_return_candidate (machine_mode mode
,
14763 machine_mode
*base_mode
,
14767 machine_mode new_mode
= VOIDmode
;
14768 bool composite_p
= aarch64_composite_type_p (type
, mode
);
14770 if (is_ha
!= NULL
) *is_ha
= false;
14772 if ((!composite_p
&& GET_MODE_CLASS (mode
) == MODE_FLOAT
)
14773 || aarch64_short_vector_p (type
, mode
))
14778 else if (GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
)
14780 if (is_ha
!= NULL
) *is_ha
= true;
14782 new_mode
= GET_MODE_INNER (mode
);
14784 else if (type
&& composite_p
)
14786 int ag_count
= aapcs_vfp_sub_candidate (type
, &new_mode
);
14788 if (ag_count
> 0 && ag_count
<= HA_MAX_NUM_FLDS
)
14790 if (is_ha
!= NULL
) *is_ha
= true;
14799 *base_mode
= new_mode
;
14803 /* Implement TARGET_STRUCT_VALUE_RTX. */
14806 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED
,
14807 int incoming ATTRIBUTE_UNUSED
)
14809 return gen_rtx_REG (Pmode
, AARCH64_STRUCT_VALUE_REGNUM
);
14812 /* Implements target hook vector_mode_supported_p. */
14814 aarch64_vector_mode_supported_p (machine_mode mode
)
14816 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
14817 return vec_flags
!= 0 && (vec_flags
& VEC_STRUCT
) == 0;
14820 /* Return the full-width SVE vector mode for element mode MODE, if one
14823 aarch64_full_sve_mode (scalar_mode mode
)
14840 return VNx16QImode
;
14842 return opt_machine_mode ();
14846 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
14849 aarch64_vq_mode (scalar_mode mode
)
14868 return opt_machine_mode ();
14872 /* Return appropriate SIMD container
14873 for MODE within a vector of WIDTH bits. */
14874 static machine_mode
14875 aarch64_simd_container_mode (scalar_mode mode
, poly_int64 width
)
14877 if (TARGET_SVE
&& known_eq (width
, BITS_PER_SVE_VECTOR
))
14878 return aarch64_full_sve_mode (mode
).else_mode (word_mode
);
14880 gcc_assert (known_eq (width
, 64) || known_eq (width
, 128));
14883 if (known_eq (width
, 128))
14884 return aarch64_vq_mode (mode
).else_mode (word_mode
);
14905 /* Return 128-bit container as the preferred SIMD mode for MODE. */
14906 static machine_mode
14907 aarch64_preferred_simd_mode (scalar_mode mode
)
14909 poly_int64 bits
= TARGET_SVE
? BITS_PER_SVE_VECTOR
: 128;
14910 return aarch64_simd_container_mode (mode
, bits
);
14913 /* Return a list of possible vector sizes for the vectorizer
14914 to iterate over. */
14916 aarch64_autovectorize_vector_sizes (vector_sizes
*sizes
, bool)
14919 sizes
->safe_push (BYTES_PER_SVE_VECTOR
);
14920 sizes
->safe_push (16);
14921 sizes
->safe_push (8);
14924 /* Implement TARGET_MANGLE_TYPE. */
14926 static const char *
14927 aarch64_mangle_type (const_tree type
)
14929 /* The AArch64 ABI documents say that "__va_list" has to be
14930 mangled as if it is in the "std" namespace. */
14931 if (lang_hooks
.types_compatible_p (CONST_CAST_TREE (type
), va_list_type
))
14932 return "St9__va_list";
14934 /* Half-precision float. */
14935 if (TREE_CODE (type
) == REAL_TYPE
&& TYPE_PRECISION (type
) == 16)
14938 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
14940 if (TYPE_NAME (type
) != NULL
)
14941 return aarch64_mangle_builtin_type (type
);
14943 /* Use the default mangling. */
14947 /* Find the first rtx_insn before insn that will generate an assembly
14951 aarch64_prev_real_insn (rtx_insn
*insn
)
14958 insn
= prev_real_insn (insn
);
14960 while (insn
&& recog_memoized (insn
) < 0);
14966 is_madd_op (enum attr_type t1
)
14969 /* A number of these may be AArch32 only. */
14970 enum attr_type mlatypes
[] = {
14971 TYPE_MLA
, TYPE_MLAS
, TYPE_SMLAD
, TYPE_SMLADX
, TYPE_SMLAL
, TYPE_SMLALD
,
14972 TYPE_SMLALS
, TYPE_SMLALXY
, TYPE_SMLAWX
, TYPE_SMLAWY
, TYPE_SMLAXY
,
14973 TYPE_SMMLA
, TYPE_UMLAL
, TYPE_UMLALS
,TYPE_SMLSD
, TYPE_SMLSDX
, TYPE_SMLSLD
14976 for (i
= 0; i
< sizeof (mlatypes
) / sizeof (enum attr_type
); i
++)
14978 if (t1
== mlatypes
[i
])
14985 /* Check if there is a register dependency between a load and the insn
14986 for which we hold recog_data. */
14989 dep_between_memop_and_curr (rtx memop
)
14994 gcc_assert (GET_CODE (memop
) == SET
);
14996 if (!REG_P (SET_DEST (memop
)))
14999 load_reg
= SET_DEST (memop
);
15000 for (opno
= 1; opno
< recog_data
.n_operands
; opno
++)
15002 rtx operand
= recog_data
.operand
[opno
];
15003 if (REG_P (operand
)
15004 && reg_overlap_mentioned_p (load_reg
, operand
))
15012 /* When working around the Cortex-A53 erratum 835769,
15013 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
15014 instruction and has a preceding memory instruction such that a NOP
15015 should be inserted between them. */
15018 aarch64_madd_needs_nop (rtx_insn
* insn
)
15020 enum attr_type attr_type
;
15024 if (!TARGET_FIX_ERR_A53_835769
)
15027 if (!INSN_P (insn
) || recog_memoized (insn
) < 0)
15030 attr_type
= get_attr_type (insn
);
15031 if (!is_madd_op (attr_type
))
15034 prev
= aarch64_prev_real_insn (insn
);
15035 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
15036 Restore recog state to INSN to avoid state corruption. */
15037 extract_constrain_insn_cached (insn
);
15039 if (!prev
|| !contains_mem_rtx_p (PATTERN (prev
)))
15042 body
= single_set (prev
);
15044 /* If the previous insn is a memory op and there is no dependency between
15045 it and the DImode madd, emit a NOP between them. If body is NULL then we
15046 have a complex memory operation, probably a load/store pair.
15047 Be conservative for now and emit a NOP. */
15048 if (GET_MODE (recog_data
.operand
[0]) == DImode
15049 && (!body
|| !dep_between_memop_and_curr (body
)))
15057 /* Implement FINAL_PRESCAN_INSN. */
15060 aarch64_final_prescan_insn (rtx_insn
*insn
)
15062 if (aarch64_madd_needs_nop (insn
))
15063 fprintf (asm_out_file
, "\tnop // between mem op and mult-accumulate\n");
15067 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
15071 aarch64_sve_index_immediate_p (rtx base_or_step
)
15073 return (CONST_INT_P (base_or_step
)
15074 && IN_RANGE (INTVAL (base_or_step
), -16, 15));
15077 /* Return true if X is a valid immediate for the SVE ADD and SUB
15078 instructions. Negate X first if NEGATE_P is true. */
15081 aarch64_sve_arith_immediate_p (rtx x
, bool negate_p
)
15085 if (!const_vec_duplicate_p (x
, &elt
)
15086 || !CONST_INT_P (elt
))
15089 HOST_WIDE_INT val
= INTVAL (elt
);
15092 val
&= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x
)));
15095 return IN_RANGE (val
, 0, 0xff);
15096 return IN_RANGE (val
, 0, 0xff00);
15099 /* Return true if X is a valid immediate operand for an SVE logical
15100 instruction such as AND. */
15103 aarch64_sve_bitmask_immediate_p (rtx x
)
15107 return (const_vec_duplicate_p (x
, &elt
)
15108 && CONST_INT_P (elt
)
15109 && aarch64_bitmask_imm (INTVAL (elt
),
15110 GET_MODE_INNER (GET_MODE (x
))));
15113 /* Return true if X is a valid immediate for the SVE DUP and CPY
15117 aarch64_sve_dup_immediate_p (rtx x
)
15121 if (!const_vec_duplicate_p (x
, &elt
)
15122 || !CONST_INT_P (elt
))
15125 HOST_WIDE_INT val
= INTVAL (elt
);
15127 return IN_RANGE (val
, -0x80, 0x7f);
15128 return IN_RANGE (val
, -0x8000, 0x7f00);
15131 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
15132 SIGNED_P says whether the operand is signed rather than unsigned. */
15135 aarch64_sve_cmp_immediate_p (rtx x
, bool signed_p
)
15139 return (const_vec_duplicate_p (x
, &elt
)
15140 && CONST_INT_P (elt
)
15142 ? IN_RANGE (INTVAL (elt
), -16, 15)
15143 : IN_RANGE (INTVAL (elt
), 0, 127)));
15146 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
15147 instruction. Negate X first if NEGATE_P is true. */
15150 aarch64_sve_float_arith_immediate_p (rtx x
, bool negate_p
)
15155 if (!const_vec_duplicate_p (x
, &elt
)
15156 || GET_CODE (elt
) != CONST_DOUBLE
)
15159 r
= *CONST_DOUBLE_REAL_VALUE (elt
);
15162 r
= real_value_negate (&r
);
15164 if (real_equal (&r
, &dconst1
))
15166 if (real_equal (&r
, &dconsthalf
))
15171 /* Return true if X is a valid immediate operand for an SVE FMUL
15175 aarch64_sve_float_mul_immediate_p (rtx x
)
15179 /* GCC will never generate a multiply with an immediate of 2, so there is no
15180 point testing for it (even though it is a valid constant). */
15181 return (const_vec_duplicate_p (x
, &elt
)
15182 && GET_CODE (elt
) == CONST_DOUBLE
15183 && real_equal (CONST_DOUBLE_REAL_VALUE (elt
), &dconsthalf
));
15186 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
15187 for the Advanced SIMD operation described by WHICH and INSN. If INFO
15188 is nonnull, use it to describe valid immediates. */
15190 aarch64_advsimd_valid_immediate_hs (unsigned int val32
,
15191 simd_immediate_info
*info
,
15192 enum simd_immediate_check which
,
15193 simd_immediate_info::insn_type insn
)
15195 /* Try a 4-byte immediate with LSL. */
15196 for (unsigned int shift
= 0; shift
< 32; shift
+= 8)
15197 if ((val32
& (0xff << shift
)) == val32
)
15200 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
15201 simd_immediate_info::LSL
, shift
);
15205 /* Try a 2-byte immediate with LSL. */
15206 unsigned int imm16
= val32
& 0xffff;
15207 if (imm16
== (val32
>> 16))
15208 for (unsigned int shift
= 0; shift
< 16; shift
+= 8)
15209 if ((imm16
& (0xff << shift
)) == imm16
)
15212 *info
= simd_immediate_info (HImode
, imm16
>> shift
, insn
,
15213 simd_immediate_info::LSL
, shift
);
15217 /* Try a 4-byte immediate with MSL, except for cases that MVN
15219 if (which
== AARCH64_CHECK_MOV
)
15220 for (unsigned int shift
= 8; shift
< 24; shift
+= 8)
15222 unsigned int low
= (1 << shift
) - 1;
15223 if (((val32
& (0xff << shift
)) | low
) == val32
)
15226 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
15227 simd_immediate_info::MSL
, shift
);
15235 /* Return true if replicating VAL64 is a valid immediate for the
15236 Advanced SIMD operation described by WHICH. If INFO is nonnull,
15237 use it to describe valid immediates. */
15239 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64
,
15240 simd_immediate_info
*info
,
15241 enum simd_immediate_check which
)
15243 unsigned int val32
= val64
& 0xffffffff;
15244 unsigned int val16
= val64
& 0xffff;
15245 unsigned int val8
= val64
& 0xff;
15247 if (val32
== (val64
>> 32))
15249 if ((which
& AARCH64_CHECK_ORR
) != 0
15250 && aarch64_advsimd_valid_immediate_hs (val32
, info
, which
,
15251 simd_immediate_info::MOV
))
15254 if ((which
& AARCH64_CHECK_BIC
) != 0
15255 && aarch64_advsimd_valid_immediate_hs (~val32
, info
, which
,
15256 simd_immediate_info::MVN
))
15259 /* Try using a replicated byte. */
15260 if (which
== AARCH64_CHECK_MOV
15261 && val16
== (val32
>> 16)
15262 && val8
== (val16
>> 8))
15265 *info
= simd_immediate_info (QImode
, val8
);
15270 /* Try using a bit-to-bytemask. */
15271 if (which
== AARCH64_CHECK_MOV
)
15274 for (i
= 0; i
< 64; i
+= 8)
15276 unsigned char byte
= (val64
>> i
) & 0xff;
15277 if (byte
!= 0 && byte
!= 0xff)
15283 *info
= simd_immediate_info (DImode
, val64
);
15290 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
15291 instruction. If INFO is nonnull, use it to describe valid immediates. */
15294 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64
,
15295 simd_immediate_info
*info
)
15297 scalar_int_mode mode
= DImode
;
15298 unsigned int val32
= val64
& 0xffffffff;
15299 if (val32
== (val64
>> 32))
15302 unsigned int val16
= val32
& 0xffff;
15303 if (val16
== (val32
>> 16))
15306 unsigned int val8
= val16
& 0xff;
15307 if (val8
== (val16
>> 8))
15311 HOST_WIDE_INT val
= trunc_int_for_mode (val64
, mode
);
15312 if (IN_RANGE (val
, -0x80, 0x7f))
15314 /* DUP with no shift. */
15316 *info
= simd_immediate_info (mode
, val
);
15319 if ((val
& 0xff) == 0 && IN_RANGE (val
, -0x8000, 0x7f00))
15321 /* DUP with LSL #8. */
15323 *info
= simd_immediate_info (mode
, val
);
15326 if (aarch64_bitmask_imm (val64
, mode
))
15330 *info
= simd_immediate_info (mode
, val
);
15336 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
15337 it to describe valid immediates. */
15340 aarch64_sve_pred_valid_immediate (rtx x
, simd_immediate_info
*info
)
15342 if (x
== CONST0_RTX (GET_MODE (x
)))
15345 *info
= simd_immediate_info (DImode
, 0);
15349 /* Analyze the value as a VNx16BImode. This should be relatively
15350 efficient, since rtx_vector_builder has enough built-in capacity
15351 to store all VLA predicate constants without needing the heap. */
15352 rtx_vector_builder builder
;
15353 if (!aarch64_get_sve_pred_bits (builder
, x
))
15356 unsigned int elt_size
= aarch64_widest_sve_pred_elt_size (builder
);
15357 if (int vl
= aarch64_partial_ptrue_length (builder
, elt_size
))
15359 machine_mode mode
= aarch64_sve_pred_mode (elt_size
).require ();
15360 aarch64_svpattern pattern
= aarch64_svpattern_for_vl (mode
, vl
);
15361 if (pattern
!= AARCH64_NUM_SVPATTERNS
)
15365 scalar_int_mode int_mode
= aarch64_sve_element_int_mode (mode
);
15366 *info
= simd_immediate_info (int_mode
, pattern
);
15374 /* Return true if OP is a valid SIMD immediate for the operation
15375 described by WHICH. If INFO is nonnull, use it to describe valid
15378 aarch64_simd_valid_immediate (rtx op
, simd_immediate_info
*info
,
15379 enum simd_immediate_check which
)
15381 machine_mode mode
= GET_MODE (op
);
15382 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
15383 if (vec_flags
== 0 || vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
15386 if (vec_flags
& VEC_SVE_PRED
)
15387 return aarch64_sve_pred_valid_immediate (op
, info
);
15389 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
15391 unsigned int n_elts
;
15392 if (GET_CODE (op
) == CONST_VECTOR
15393 && CONST_VECTOR_DUPLICATE_P (op
))
15394 n_elts
= CONST_VECTOR_NPATTERNS (op
);
15395 else if ((vec_flags
& VEC_SVE_DATA
)
15396 && const_vec_series_p (op
, &base
, &step
))
15398 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
);
15399 if (!aarch64_sve_index_immediate_p (base
)
15400 || !aarch64_sve_index_immediate_p (step
))
15404 *info
= simd_immediate_info (elt_mode
, base
, step
);
15407 else if (GET_CODE (op
) == CONST_VECTOR
15408 && CONST_VECTOR_NUNITS (op
).is_constant (&n_elts
))
15409 /* N_ELTS set above. */;
15413 scalar_float_mode elt_float_mode
;
15415 && is_a
<scalar_float_mode
> (elt_mode
, &elt_float_mode
))
15417 rtx elt
= CONST_VECTOR_ENCODED_ELT (op
, 0);
15418 if (aarch64_float_const_zero_rtx_p (elt
)
15419 || aarch64_float_const_representable_p (elt
))
15422 *info
= simd_immediate_info (elt_float_mode
, elt
);
15427 unsigned int elt_size
= GET_MODE_SIZE (elt_mode
);
15431 scalar_int_mode elt_int_mode
= int_mode_for_mode (elt_mode
).require ();
15433 /* Expand the vector constant out into a byte vector, with the least
15434 significant byte of the register first. */
15435 auto_vec
<unsigned char, 16> bytes
;
15436 bytes
.reserve (n_elts
* elt_size
);
15437 for (unsigned int i
= 0; i
< n_elts
; i
++)
15439 /* The vector is provided in gcc endian-neutral fashion.
15440 For aarch64_be Advanced SIMD, it must be laid out in the vector
15441 register in reverse order. */
15442 bool swap_p
= ((vec_flags
& VEC_ADVSIMD
) != 0 && BYTES_BIG_ENDIAN
);
15443 rtx elt
= CONST_VECTOR_ELT (op
, swap_p
? (n_elts
- 1 - i
) : i
);
15445 if (elt_mode
!= elt_int_mode
)
15446 elt
= gen_lowpart (elt_int_mode
, elt
);
15448 if (!CONST_INT_P (elt
))
15451 unsigned HOST_WIDE_INT elt_val
= INTVAL (elt
);
15452 for (unsigned int byte
= 0; byte
< elt_size
; byte
++)
15454 bytes
.quick_push (elt_val
& 0xff);
15455 elt_val
>>= BITS_PER_UNIT
;
15459 /* The immediate must repeat every eight bytes. */
15460 unsigned int nbytes
= bytes
.length ();
15461 for (unsigned i
= 8; i
< nbytes
; ++i
)
15462 if (bytes
[i
] != bytes
[i
- 8])
15465 /* Get the repeating 8-byte value as an integer. No endian correction
15466 is needed here because bytes is already in lsb-first order. */
15467 unsigned HOST_WIDE_INT val64
= 0;
15468 for (unsigned int i
= 0; i
< 8; i
++)
15469 val64
|= ((unsigned HOST_WIDE_INT
) bytes
[i
% nbytes
]
15470 << (i
* BITS_PER_UNIT
));
15472 if (vec_flags
& VEC_SVE_DATA
)
15473 return aarch64_sve_valid_immediate (val64
, info
);
15475 return aarch64_advsimd_valid_immediate (val64
, info
, which
);
15478 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
15479 has a step in the range of INDEX. Return the index expression if so,
15480 otherwise return null. */
15482 aarch64_check_zero_based_sve_index_immediate (rtx x
)
15485 if (const_vec_series_p (x
, &base
, &step
)
15486 && base
== const0_rtx
15487 && aarch64_sve_index_immediate_p (step
))
15492 /* Check of immediate shift constants are within range. */
15494 aarch64_simd_shift_imm_p (rtx x
, machine_mode mode
, bool left
)
15496 int bit_width
= GET_MODE_UNIT_SIZE (mode
) * BITS_PER_UNIT
;
15498 return aarch64_const_vec_all_same_in_range_p (x
, 0, bit_width
- 1);
15500 return aarch64_const_vec_all_same_in_range_p (x
, 1, bit_width
);
15503 /* Return the bitmask CONST_INT to select the bits required by a zero extract
15504 operation of width WIDTH at bit position POS. */
15507 aarch64_mask_from_zextract_ops (rtx width
, rtx pos
)
15509 gcc_assert (CONST_INT_P (width
));
15510 gcc_assert (CONST_INT_P (pos
));
15512 unsigned HOST_WIDE_INT mask
15513 = ((unsigned HOST_WIDE_INT
) 1 << UINTVAL (width
)) - 1;
15514 return GEN_INT (mask
<< UINTVAL (pos
));
15518 aarch64_mov_operand_p (rtx x
, machine_mode mode
)
15520 if (GET_CODE (x
) == HIGH
15521 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
15524 if (CONST_INT_P (x
))
15527 if (VECTOR_MODE_P (GET_MODE (x
)))
15529 /* Require predicate constants to be VNx16BI before RA, so that we
15530 force everything to have a canonical form. */
15531 if (!lra_in_progress
15532 && !reload_completed
15533 && GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_BOOL
15534 && GET_MODE (x
) != VNx16BImode
)
15537 return aarch64_simd_valid_immediate (x
, NULL
);
15540 if (GET_CODE (x
) == SYMBOL_REF
&& mode
== DImode
&& CONSTANT_ADDRESS_P (x
))
15543 if (aarch64_sve_cnt_immediate_p (x
))
15546 return aarch64_classify_symbolic_expression (x
)
15547 == SYMBOL_TINY_ABSOLUTE
;
15550 /* Return a const_int vector of VAL. */
15552 aarch64_simd_gen_const_vector_dup (machine_mode mode
, HOST_WIDE_INT val
)
15554 rtx c
= gen_int_mode (val
, GET_MODE_INNER (mode
));
15555 return gen_const_vec_duplicate (mode
, c
);
15558 /* Check OP is a legal scalar immediate for the MOVI instruction. */
15561 aarch64_simd_scalar_immediate_valid_for_move (rtx op
, scalar_int_mode mode
)
15563 machine_mode vmode
;
15565 vmode
= aarch64_simd_container_mode (mode
, 64);
15566 rtx op_v
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (op
));
15567 return aarch64_simd_valid_immediate (op_v
, NULL
);
15570 /* Construct and return a PARALLEL RTX vector with elements numbering the
15571 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
15572 the vector - from the perspective of the architecture. This does not
15573 line up with GCC's perspective on lane numbers, so we end up with
15574 different masks depending on our target endian-ness. The diagram
15575 below may help. We must draw the distinction when building masks
15576 which select one half of the vector. An instruction selecting
15577 architectural low-lanes for a big-endian target, must be described using
15578 a mask selecting GCC high-lanes.
15580 Big-Endian Little-Endian
15582 GCC 0 1 2 3 3 2 1 0
15583 | x | x | x | x | | x | x | x | x |
15584 Architecture 3 2 1 0 3 2 1 0
15586 Low Mask: { 2, 3 } { 0, 1 }
15587 High Mask: { 0, 1 } { 2, 3 }
15589 MODE Is the mode of the vector and NUNITS is the number of units in it. */
15592 aarch64_simd_vect_par_cnst_half (machine_mode mode
, int nunits
, bool high
)
15594 rtvec v
= rtvec_alloc (nunits
/ 2);
15595 int high_base
= nunits
/ 2;
15601 if (BYTES_BIG_ENDIAN
)
15602 base
= high
? low_base
: high_base
;
15604 base
= high
? high_base
: low_base
;
15606 for (i
= 0; i
< nunits
/ 2; i
++)
15607 RTVEC_ELT (v
, i
) = GEN_INT (base
+ i
);
15609 t1
= gen_rtx_PARALLEL (mode
, v
);
15613 /* Check OP for validity as a PARALLEL RTX vector with elements
15614 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
15615 from the perspective of the architecture. See the diagram above
15616 aarch64_simd_vect_par_cnst_half for more details. */
15619 aarch64_simd_check_vect_par_cnst_half (rtx op
, machine_mode mode
,
15623 if (!VECTOR_MODE_P (mode
) || !GET_MODE_NUNITS (mode
).is_constant (&nelts
))
15626 rtx ideal
= aarch64_simd_vect_par_cnst_half (mode
, nelts
, high
);
15627 HOST_WIDE_INT count_op
= XVECLEN (op
, 0);
15628 HOST_WIDE_INT count_ideal
= XVECLEN (ideal
, 0);
15631 if (count_op
!= count_ideal
)
15634 for (i
= 0; i
< count_ideal
; i
++)
15636 rtx elt_op
= XVECEXP (op
, 0, i
);
15637 rtx elt_ideal
= XVECEXP (ideal
, 0, i
);
15639 if (!CONST_INT_P (elt_op
)
15640 || INTVAL (elt_ideal
) != INTVAL (elt_op
))
15646 /* Return a PARALLEL containing NELTS elements, with element I equal
15647 to BASE + I * STEP. */
15650 aarch64_gen_stepped_int_parallel (unsigned int nelts
, int base
, int step
)
15652 rtvec vec
= rtvec_alloc (nelts
);
15653 for (unsigned int i
= 0; i
< nelts
; ++i
)
15654 RTVEC_ELT (vec
, i
) = gen_int_mode (base
+ i
* step
, DImode
);
15655 return gen_rtx_PARALLEL (VOIDmode
, vec
);
15658 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
15659 series with step STEP. */
15662 aarch64_stepped_int_parallel_p (rtx op
, int step
)
15664 if (GET_CODE (op
) != PARALLEL
|| !CONST_INT_P (XVECEXP (op
, 0, 0)))
15667 unsigned HOST_WIDE_INT base
= UINTVAL (XVECEXP (op
, 0, 0));
15668 for (int i
= 1; i
< XVECLEN (op
, 0); ++i
)
15669 if (!CONST_INT_P (XVECEXP (op
, 0, i
))
15670 || UINTVAL (XVECEXP (op
, 0, i
)) != base
+ i
* step
)
15676 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
15677 HIGH (exclusive). */
15679 aarch64_simd_lane_bounds (rtx operand
, HOST_WIDE_INT low
, HOST_WIDE_INT high
,
15682 HOST_WIDE_INT lane
;
15683 gcc_assert (CONST_INT_P (operand
));
15684 lane
= INTVAL (operand
);
15686 if (lane
< low
|| lane
>= high
)
15689 error ("%Klane %wd out of range %wd - %wd", exp
, lane
, low
, high
- 1);
15691 error ("lane %wd out of range %wd - %wd", lane
, low
, high
- 1);
15695 /* Peform endian correction on lane number N, which indexes a vector
15696 of mode MODE, and return the result as an SImode rtx. */
15699 aarch64_endian_lane_rtx (machine_mode mode
, unsigned int n
)
15701 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode
), n
), SImode
);
15704 /* Return TRUE if OP is a valid vector addressing mode. */
15707 aarch64_simd_mem_operand_p (rtx op
)
15709 return MEM_P (op
) && (GET_CODE (XEXP (op
, 0)) == POST_INC
15710 || REG_P (XEXP (op
, 0)));
15713 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
15716 aarch64_sve_ld1r_operand_p (rtx op
)
15718 struct aarch64_address_info addr
;
15722 && is_a
<scalar_mode
> (GET_MODE (op
), &mode
)
15723 && aarch64_classify_address (&addr
, XEXP (op
, 0), mode
, false)
15724 && addr
.type
== ADDRESS_REG_IMM
15725 && offset_6bit_unsigned_scaled_p (mode
, addr
.const_offset
));
15728 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
15730 aarch64_sve_ld1rq_operand_p (rtx op
)
15732 struct aarch64_address_info addr
;
15733 scalar_mode elem_mode
= GET_MODE_INNER (GET_MODE (op
));
15735 || !aarch64_classify_address (&addr
, XEXP (op
, 0), elem_mode
, false))
15738 if (addr
.type
== ADDRESS_REG_IMM
)
15739 return offset_4bit_signed_scaled_p (TImode
, addr
.const_offset
);
15741 if (addr
.type
== ADDRESS_REG_REG
)
15742 return (1U << addr
.shift
) == GET_MODE_SIZE (elem_mode
);
15747 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
15748 The conditions for STR are the same. */
15750 aarch64_sve_ldr_operand_p (rtx op
)
15752 struct aarch64_address_info addr
;
15755 && aarch64_classify_address (&addr
, XEXP (op
, 0), GET_MODE (op
),
15756 false, ADDR_QUERY_ANY
)
15757 && addr
.type
== ADDRESS_REG_IMM
);
15760 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
15761 We need to be able to access the individual pieces, so the range
15762 is different from LD[234] and ST[234]. */
15764 aarch64_sve_struct_memory_operand_p (rtx op
)
15769 machine_mode mode
= GET_MODE (op
);
15770 struct aarch64_address_info addr
;
15771 if (!aarch64_classify_address (&addr
, XEXP (op
, 0), SVE_BYTE_MODE
, false,
15773 || addr
.type
!= ADDRESS_REG_IMM
)
15776 poly_int64 first
= addr
.const_offset
;
15777 poly_int64 last
= first
+ GET_MODE_SIZE (mode
) - BYTES_PER_SVE_VECTOR
;
15778 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, first
)
15779 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, last
));
15782 /* Emit a register copy from operand to operand, taking care not to
15783 early-clobber source registers in the process.
15785 COUNT is the number of components into which the copy needs to be
15788 aarch64_simd_emit_reg_reg_move (rtx
*operands
, machine_mode mode
,
15789 unsigned int count
)
15792 int rdest
= REGNO (operands
[0]);
15793 int rsrc
= REGNO (operands
[1]);
15795 if (!reg_overlap_mentioned_p (operands
[0], operands
[1])
15797 for (i
= 0; i
< count
; i
++)
15798 emit_move_insn (gen_rtx_REG (mode
, rdest
+ i
),
15799 gen_rtx_REG (mode
, rsrc
+ i
));
15801 for (i
= 0; i
< count
; i
++)
15802 emit_move_insn (gen_rtx_REG (mode
, rdest
+ count
- i
- 1),
15803 gen_rtx_REG (mode
, rsrc
+ count
- i
- 1));
15806 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
15807 one of VSTRUCT modes: OI, CI, or XI. */
15809 aarch64_simd_attr_length_rglist (machine_mode mode
)
15811 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
15812 return (GET_MODE_SIZE (mode
).to_constant () / UNITS_PER_VREG
) * 4;
15815 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
15816 alignment of a vector to 128 bits. SVE predicates have an alignment of
15818 static HOST_WIDE_INT
15819 aarch64_simd_vector_alignment (const_tree type
)
15821 if (TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
15822 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
15823 be set for non-predicate vectors of booleans. Modes are the most
15824 direct way we have of identifying real SVE predicate types. */
15825 return GET_MODE_CLASS (TYPE_MODE (type
)) == MODE_VECTOR_BOOL
? 16 : 128;
15826 return wi::umin (wi::to_wide (TYPE_SIZE (type
)), 128).to_uhwi ();
15829 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
15831 aarch64_vectorize_preferred_vector_alignment (const_tree type
)
15833 if (aarch64_sve_data_mode_p (TYPE_MODE (type
)))
15835 /* If the length of the vector is fixed, try to align to that length,
15836 otherwise don't try to align at all. */
15837 HOST_WIDE_INT result
;
15838 if (!BITS_PER_SVE_VECTOR
.is_constant (&result
))
15839 result
= TYPE_ALIGN (TREE_TYPE (type
));
15842 return TYPE_ALIGN (type
);
15845 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
15847 aarch64_simd_vector_alignment_reachable (const_tree type
, bool is_packed
)
15852 /* For fixed-length vectors, check that the vectorizer will aim for
15853 full-vector alignment. This isn't true for generic GCC vectors
15854 that are wider than the ABI maximum of 128 bits. */
15855 poly_uint64 preferred_alignment
=
15856 aarch64_vectorize_preferred_vector_alignment (type
);
15857 if (TREE_CODE (TYPE_SIZE (type
)) == INTEGER_CST
15858 && maybe_ne (wi::to_widest (TYPE_SIZE (type
)),
15859 preferred_alignment
))
15862 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
15866 /* Return true if the vector misalignment factor is supported by the
15869 aarch64_builtin_support_vector_misalignment (machine_mode mode
,
15870 const_tree type
, int misalignment
,
15873 if (TARGET_SIMD
&& STRICT_ALIGNMENT
)
15875 /* Return if movmisalign pattern is not supported for this mode. */
15876 if (optab_handler (movmisalign_optab
, mode
) == CODE_FOR_nothing
)
15879 /* Misalignment factor is unknown at compile time. */
15880 if (misalignment
== -1)
15883 return default_builtin_support_vector_misalignment (mode
, type
, misalignment
,
15887 /* If VALS is a vector constant that can be loaded into a register
15888 using DUP, generate instructions to do so and return an RTX to
15889 assign to the register. Otherwise return NULL_RTX. */
15891 aarch64_simd_dup_constant (rtx vals
)
15893 machine_mode mode
= GET_MODE (vals
);
15894 machine_mode inner_mode
= GET_MODE_INNER (mode
);
15897 if (!const_vec_duplicate_p (vals
, &x
))
15900 /* We can load this constant by using DUP and a constant in a
15901 single ARM register. This will be cheaper than a vector
15903 x
= copy_to_mode_reg (inner_mode
, x
);
15904 return gen_vec_duplicate (mode
, x
);
15908 /* Generate code to load VALS, which is a PARALLEL containing only
15909 constants (for vec_init) or CONST_VECTOR, efficiently into a
15910 register. Returns an RTX to copy into the register, or NULL_RTX
15911 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
15913 aarch64_simd_make_constant (rtx vals
)
15915 machine_mode mode
= GET_MODE (vals
);
15917 rtx const_vec
= NULL_RTX
;
15921 if (GET_CODE (vals
) == CONST_VECTOR
)
15923 else if (GET_CODE (vals
) == PARALLEL
)
15925 /* A CONST_VECTOR must contain only CONST_INTs and
15926 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
15927 Only store valid constants in a CONST_VECTOR. */
15928 int n_elts
= XVECLEN (vals
, 0);
15929 for (i
= 0; i
< n_elts
; ++i
)
15931 rtx x
= XVECEXP (vals
, 0, i
);
15932 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
15935 if (n_const
== n_elts
)
15936 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0));
15939 gcc_unreachable ();
15941 if (const_vec
!= NULL_RTX
15942 && aarch64_simd_valid_immediate (const_vec
, NULL
))
15943 /* Load using MOVI/MVNI. */
15945 else if ((const_dup
= aarch64_simd_dup_constant (vals
)) != NULL_RTX
)
15946 /* Loaded using DUP. */
15948 else if (const_vec
!= NULL_RTX
)
15949 /* Load from constant pool. We cannot take advantage of single-cycle
15950 LD1 because we need a PC-relative addressing mode. */
15953 /* A PARALLEL containing something not valid inside CONST_VECTOR.
15954 We cannot construct an initializer. */
15958 /* Expand a vector initialisation sequence, such that TARGET is
15959 initialised to contain VALS. */
15962 aarch64_expand_vector_init (rtx target
, rtx vals
)
15964 machine_mode mode
= GET_MODE (target
);
15965 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
15966 /* The number of vector elements. */
15967 int n_elts
= XVECLEN (vals
, 0);
15968 /* The number of vector elements which are not constant. */
15970 rtx any_const
= NULL_RTX
;
15971 /* The first element of vals. */
15972 rtx v0
= XVECEXP (vals
, 0, 0);
15973 bool all_same
= true;
15975 /* This is a special vec_init<M><N> where N is not an element mode but a
15976 vector mode with half the elements of M. We expect to find two entries
15977 of mode N in VALS and we must put their concatentation into TARGET. */
15978 if (XVECLEN (vals
, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals
, 0, 0))))
15980 gcc_assert (known_eq (GET_MODE_SIZE (mode
),
15981 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals
, 0, 0)))));
15982 rtx lo
= XVECEXP (vals
, 0, 0);
15983 rtx hi
= XVECEXP (vals
, 0, 1);
15984 machine_mode narrow_mode
= GET_MODE (lo
);
15985 gcc_assert (GET_MODE_INNER (narrow_mode
) == inner_mode
);
15986 gcc_assert (narrow_mode
== GET_MODE (hi
));
15988 /* When we want to concatenate a half-width vector with zeroes we can
15989 use the aarch64_combinez[_be] patterns. Just make sure that the
15990 zeroes are in the right half. */
15991 if (BYTES_BIG_ENDIAN
15992 && aarch64_simd_imm_zero (lo
, narrow_mode
)
15993 && general_operand (hi
, narrow_mode
))
15994 emit_insn (gen_aarch64_combinez_be (narrow_mode
, target
, hi
, lo
));
15995 else if (!BYTES_BIG_ENDIAN
15996 && aarch64_simd_imm_zero (hi
, narrow_mode
)
15997 && general_operand (lo
, narrow_mode
))
15998 emit_insn (gen_aarch64_combinez (narrow_mode
, target
, lo
, hi
));
16001 /* Else create the two half-width registers and combine them. */
16003 lo
= force_reg (GET_MODE (lo
), lo
);
16005 hi
= force_reg (GET_MODE (hi
), hi
);
16007 if (BYTES_BIG_ENDIAN
)
16008 std::swap (lo
, hi
);
16009 emit_insn (gen_aarch64_simd_combine (narrow_mode
, target
, lo
, hi
));
16014 /* Count the number of variable elements to initialise. */
16015 for (int i
= 0; i
< n_elts
; ++i
)
16017 rtx x
= XVECEXP (vals
, 0, i
);
16018 if (!(CONST_INT_P (x
) || CONST_DOUBLE_P (x
)))
16023 all_same
&= rtx_equal_p (x
, v0
);
16026 /* No variable elements, hand off to aarch64_simd_make_constant which knows
16027 how best to handle this. */
16030 rtx constant
= aarch64_simd_make_constant (vals
);
16031 if (constant
!= NULL_RTX
)
16033 emit_move_insn (target
, constant
);
16038 /* Splat a single non-constant element if we can. */
16041 rtx x
= copy_to_mode_reg (inner_mode
, v0
);
16042 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
16046 enum insn_code icode
= optab_handler (vec_set_optab
, mode
);
16047 gcc_assert (icode
!= CODE_FOR_nothing
);
16049 /* If there are only variable elements, try to optimize
16050 the insertion using dup for the most common element
16051 followed by insertions. */
16053 /* The algorithm will fill matches[*][0] with the earliest matching element,
16054 and matches[X][1] with the count of duplicate elements (if X is the
16055 earliest element which has duplicates). */
16057 if (n_var
== n_elts
&& n_elts
<= 16)
16059 int matches
[16][2] = {0};
16060 for (int i
= 0; i
< n_elts
; i
++)
16062 for (int j
= 0; j
<= i
; j
++)
16064 if (rtx_equal_p (XVECEXP (vals
, 0, i
), XVECEXP (vals
, 0, j
)))
16072 int maxelement
= 0;
16074 for (int i
= 0; i
< n_elts
; i
++)
16075 if (matches
[i
][1] > maxv
)
16078 maxv
= matches
[i
][1];
16081 /* Create a duplicate of the most common element, unless all elements
16082 are equally useless to us, in which case just immediately set the
16083 vector register using the first element. */
16087 /* For vectors of two 64-bit elements, we can do even better. */
16089 && (inner_mode
== E_DImode
16090 || inner_mode
== E_DFmode
))
16093 rtx x0
= XVECEXP (vals
, 0, 0);
16094 rtx x1
= XVECEXP (vals
, 0, 1);
16095 /* Combine can pick up this case, but handling it directly
16096 here leaves clearer RTL.
16098 This is load_pair_lanes<mode>, and also gives us a clean-up
16099 for store_pair_lanes<mode>. */
16100 if (memory_operand (x0
, inner_mode
)
16101 && memory_operand (x1
, inner_mode
)
16102 && !STRICT_ALIGNMENT
16103 && rtx_equal_p (XEXP (x1
, 0),
16104 plus_constant (Pmode
,
16106 GET_MODE_SIZE (inner_mode
))))
16109 if (inner_mode
== DFmode
)
16110 t
= gen_load_pair_lanesdf (target
, x0
, x1
);
16112 t
= gen_load_pair_lanesdi (target
, x0
, x1
);
16117 /* The subreg-move sequence below will move into lane zero of the
16118 vector register. For big-endian we want that position to hold
16119 the last element of VALS. */
16120 maxelement
= BYTES_BIG_ENDIAN
? n_elts
- 1 : 0;
16121 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
16122 aarch64_emit_move (target
, lowpart_subreg (mode
, x
, inner_mode
));
16126 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
16127 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
16130 /* Insert the rest. */
16131 for (int i
= 0; i
< n_elts
; i
++)
16133 rtx x
= XVECEXP (vals
, 0, i
);
16134 if (matches
[i
][0] == maxelement
)
16136 x
= copy_to_mode_reg (inner_mode
, x
);
16137 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
16142 /* Initialise a vector which is part-variable. We want to first try
16143 to build those lanes which are constant in the most efficient way we
16145 if (n_var
!= n_elts
)
16147 rtx copy
= copy_rtx (vals
);
16149 /* Load constant part of vector. We really don't care what goes into the
16150 parts we will overwrite, but we're more likely to be able to load the
16151 constant efficiently if it has fewer, larger, repeating parts
16152 (see aarch64_simd_valid_immediate). */
16153 for (int i
= 0; i
< n_elts
; i
++)
16155 rtx x
= XVECEXP (vals
, 0, i
);
16156 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
16158 rtx subst
= any_const
;
16159 for (int bit
= n_elts
/ 2; bit
> 0; bit
/= 2)
16161 /* Look in the copied vector, as more elements are const. */
16162 rtx test
= XVECEXP (copy
, 0, i
^ bit
);
16163 if (CONST_INT_P (test
) || CONST_DOUBLE_P (test
))
16169 XVECEXP (copy
, 0, i
) = subst
;
16171 aarch64_expand_vector_init (target
, copy
);
16174 /* Insert the variable lanes directly. */
16175 for (int i
= 0; i
< n_elts
; i
++)
16177 rtx x
= XVECEXP (vals
, 0, i
);
16178 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
16180 x
= copy_to_mode_reg (inner_mode
, x
);
16181 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
16185 /* Emit RTL corresponding to:
16186 insr TARGET, ELEM. */
16189 emit_insr (rtx target
, rtx elem
)
16191 machine_mode mode
= GET_MODE (target
);
16192 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
16193 elem
= force_reg (elem_mode
, elem
);
16195 insn_code icode
= optab_handler (vec_shl_insert_optab
, mode
);
16196 gcc_assert (icode
!= CODE_FOR_nothing
);
16197 emit_insn (GEN_FCN (icode
) (target
, target
, elem
));
16200 /* Subroutine of aarch64_sve_expand_vector_init for handling
16201 trailing constants.
16202 This function works as follows:
16203 (a) Create a new vector consisting of trailing constants.
16204 (b) Initialize TARGET with the constant vector using emit_move_insn.
16205 (c) Insert remaining elements in TARGET using insr.
16206 NELTS is the total number of elements in original vector while
16207 while NELTS_REQD is the number of elements that are actually
16210 ??? The heuristic used is to do above only if number of constants
16211 is at least half the total number of elements. May need fine tuning. */
16214 aarch64_sve_expand_vector_init_handle_trailing_constants
16215 (rtx target
, const rtx_vector_builder
&builder
, int nelts
, int nelts_reqd
)
16217 machine_mode mode
= GET_MODE (target
);
16218 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
16219 int n_trailing_constants
= 0;
16221 for (int i
= nelts_reqd
- 1;
16222 i
>= 0 && aarch64_legitimate_constant_p (elem_mode
, builder
.elt (i
));
16224 n_trailing_constants
++;
16226 if (n_trailing_constants
>= nelts_reqd
/ 2)
16228 rtx_vector_builder
v (mode
, 1, nelts
);
16229 for (int i
= 0; i
< nelts
; i
++)
16230 v
.quick_push (builder
.elt (i
+ nelts_reqd
- n_trailing_constants
));
16231 rtx const_vec
= v
.build ();
16232 emit_move_insn (target
, const_vec
);
16234 for (int i
= nelts_reqd
- n_trailing_constants
- 1; i
>= 0; i
--)
16235 emit_insr (target
, builder
.elt (i
));
16243 /* Subroutine of aarch64_sve_expand_vector_init.
16245 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
16246 (b) Skip trailing elements from BUILDER, which are the same as
16247 element NELTS_REQD - 1.
16248 (c) Insert earlier elements in reverse order in TARGET using insr. */
16251 aarch64_sve_expand_vector_init_insert_elems (rtx target
,
16252 const rtx_vector_builder
&builder
,
16255 machine_mode mode
= GET_MODE (target
);
16256 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
16258 struct expand_operand ops
[2];
16259 enum insn_code icode
= optab_handler (vec_duplicate_optab
, mode
);
16260 gcc_assert (icode
!= CODE_FOR_nothing
);
16262 create_output_operand (&ops
[0], target
, mode
);
16263 create_input_operand (&ops
[1], builder
.elt (nelts_reqd
- 1), elem_mode
);
16264 expand_insn (icode
, 2, ops
);
16266 int ndups
= builder
.count_dups (nelts_reqd
- 1, -1, -1);
16267 for (int i
= nelts_reqd
- ndups
- 1; i
>= 0; i
--)
16268 emit_insr (target
, builder
.elt (i
));
16271 /* Subroutine of aarch64_sve_expand_vector_init to handle case
16272 when all trailing elements of builder are same.
16273 This works as follows:
16274 (a) Use expand_insn interface to broadcast last vector element in TARGET.
16275 (b) Insert remaining elements in TARGET using insr.
16277 ??? The heuristic used is to do above if number of same trailing elements
16278 is at least 3/4 of total number of elements, loosely based on
16279 heuristic from mostly_zeros_p. May need fine-tuning. */
16282 aarch64_sve_expand_vector_init_handle_trailing_same_elem
16283 (rtx target
, const rtx_vector_builder
&builder
, int nelts_reqd
)
16285 int ndups
= builder
.count_dups (nelts_reqd
- 1, -1, -1);
16286 if (ndups
>= (3 * nelts_reqd
) / 4)
16288 aarch64_sve_expand_vector_init_insert_elems (target
, builder
,
16289 nelts_reqd
- ndups
+ 1);
16296 /* Initialize register TARGET from BUILDER. NELTS is the constant number
16297 of elements in BUILDER.
16299 The function tries to initialize TARGET from BUILDER if it fits one
16300 of the special cases outlined below.
16302 Failing that, the function divides BUILDER into two sub-vectors:
16303 v_even = even elements of BUILDER;
16304 v_odd = odd elements of BUILDER;
16306 and recursively calls itself with v_even and v_odd.
16308 if (recursive call succeeded for v_even or v_odd)
16309 TARGET = zip (v_even, v_odd)
16311 The function returns true if it managed to build TARGET from BUILDER
16312 with one of the special cases, false otherwise.
16314 Example: {a, 1, b, 2, c, 3, d, 4}
16316 The vector gets divided into:
16317 v_even = {a, b, c, d}
16318 v_odd = {1, 2, 3, 4}
16320 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
16321 initialize tmp2 from constant vector v_odd using emit_move_insn.
16323 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
16324 4 elements, so we construct tmp1 from v_even using insr:
16331 TARGET = zip (tmp1, tmp2)
16332 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
16335 aarch64_sve_expand_vector_init (rtx target
, const rtx_vector_builder
&builder
,
16336 int nelts
, int nelts_reqd
)
16338 machine_mode mode
= GET_MODE (target
);
16340 /* Case 1: Vector contains trailing constants. */
16342 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16343 (target
, builder
, nelts
, nelts_reqd
))
16346 /* Case 2: Vector contains leading constants. */
16348 rtx_vector_builder
rev_builder (mode
, 1, nelts_reqd
);
16349 for (int i
= 0; i
< nelts_reqd
; i
++)
16350 rev_builder
.quick_push (builder
.elt (nelts_reqd
- i
- 1));
16351 rev_builder
.finalize ();
16353 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16354 (target
, rev_builder
, nelts
, nelts_reqd
))
16356 emit_insn (gen_aarch64_sve_rev (mode
, target
, target
));
16360 /* Case 3: Vector contains trailing same element. */
16362 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16363 (target
, builder
, nelts_reqd
))
16366 /* Case 4: Vector contains leading same element. */
16368 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16369 (target
, rev_builder
, nelts_reqd
) && nelts_reqd
== nelts
)
16371 emit_insn (gen_aarch64_sve_rev (mode
, target
, target
));
16375 /* Avoid recursing below 4-elements.
16376 ??? The threshold 4 may need fine-tuning. */
16378 if (nelts_reqd
<= 4)
16381 rtx_vector_builder
v_even (mode
, 1, nelts
);
16382 rtx_vector_builder
v_odd (mode
, 1, nelts
);
16384 for (int i
= 0; i
< nelts
* 2; i
+= 2)
16386 v_even
.quick_push (builder
.elt (i
));
16387 v_odd
.quick_push (builder
.elt (i
+ 1));
16390 v_even
.finalize ();
16393 rtx tmp1
= gen_reg_rtx (mode
);
16394 bool did_even_p
= aarch64_sve_expand_vector_init (tmp1
, v_even
,
16395 nelts
, nelts_reqd
/ 2);
16397 rtx tmp2
= gen_reg_rtx (mode
);
16398 bool did_odd_p
= aarch64_sve_expand_vector_init (tmp2
, v_odd
,
16399 nelts
, nelts_reqd
/ 2);
16401 if (!did_even_p
&& !did_odd_p
)
16404 /* Initialize v_even and v_odd using INSR if it didn't match any of the
16405 special cases and zip v_even, v_odd. */
16408 aarch64_sve_expand_vector_init_insert_elems (tmp1
, v_even
, nelts_reqd
/ 2);
16411 aarch64_sve_expand_vector_init_insert_elems (tmp2
, v_odd
, nelts_reqd
/ 2);
16413 rtvec v
= gen_rtvec (2, tmp1
, tmp2
);
16414 emit_set_insn (target
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
16418 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
16421 aarch64_sve_expand_vector_init (rtx target
, rtx vals
)
16423 machine_mode mode
= GET_MODE (target
);
16424 int nelts
= XVECLEN (vals
, 0);
16426 rtx_vector_builder
v (mode
, 1, nelts
);
16427 for (int i
= 0; i
< nelts
; i
++)
16428 v
.quick_push (XVECEXP (vals
, 0, i
));
16431 /* If neither sub-vectors of v could be initialized specially,
16432 then use INSR to insert all elements from v into TARGET.
16433 ??? This might not be optimal for vectors with large
16434 initializers like 16-element or above.
16435 For nelts < 4, it probably isn't useful to handle specially. */
16438 || !aarch64_sve_expand_vector_init (target
, v
, nelts
, nelts
))
16439 aarch64_sve_expand_vector_init_insert_elems (target
, v
, nelts
);
16442 static unsigned HOST_WIDE_INT
16443 aarch64_shift_truncation_mask (machine_mode mode
)
16445 if (!SHIFT_COUNT_TRUNCATED
|| aarch64_vector_data_mode_p (mode
))
16447 return GET_MODE_UNIT_BITSIZE (mode
) - 1;
16450 /* Select a format to encode pointers in exception handling data. */
16452 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED
, int global
)
16455 switch (aarch64_cmodel
)
16457 case AARCH64_CMODEL_TINY
:
16458 case AARCH64_CMODEL_TINY_PIC
:
16459 case AARCH64_CMODEL_SMALL
:
16460 case AARCH64_CMODEL_SMALL_PIC
:
16461 case AARCH64_CMODEL_SMALL_SPIC
:
16462 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
16464 type
= DW_EH_PE_sdata4
;
16467 /* No assumptions here. 8-byte relocs required. */
16468 type
= DW_EH_PE_sdata8
;
16471 return (global
? DW_EH_PE_indirect
: 0) | DW_EH_PE_pcrel
| type
;
16474 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
16477 aarch64_asm_output_variant_pcs (FILE *stream
, const tree decl
, const char* name
)
16479 if (aarch64_simd_decl_p (decl
))
16481 fprintf (stream
, "\t.variant_pcs\t");
16482 assemble_name (stream
, name
);
16483 fprintf (stream
, "\n");
16487 /* The last .arch and .tune assembly strings that we printed. */
16488 static std::string aarch64_last_printed_arch_string
;
16489 static std::string aarch64_last_printed_tune_string
;
16491 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
16492 by the function fndecl. */
16495 aarch64_declare_function_name (FILE *stream
, const char* name
,
16498 tree target_parts
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
16500 struct cl_target_option
*targ_options
;
16502 targ_options
= TREE_TARGET_OPTION (target_parts
);
16504 targ_options
= TREE_TARGET_OPTION (target_option_current_node
);
16505 gcc_assert (targ_options
);
16507 const struct processor
*this_arch
16508 = aarch64_get_arch (targ_options
->x_explicit_arch
);
16510 uint64_t isa_flags
= targ_options
->x_aarch64_isa_flags
;
16511 std::string extension
16512 = aarch64_get_extension_string_for_isa_flags (isa_flags
,
16514 /* Only update the assembler .arch string if it is distinct from the last
16515 such string we printed. */
16516 std::string to_print
= this_arch
->name
+ extension
;
16517 if (to_print
!= aarch64_last_printed_arch_string
)
16519 asm_fprintf (asm_out_file
, "\t.arch %s\n", to_print
.c_str ());
16520 aarch64_last_printed_arch_string
= to_print
;
16523 /* Print the cpu name we're tuning for in the comments, might be
16524 useful to readers of the generated asm. Do it only when it changes
16525 from function to function and verbose assembly is requested. */
16526 const struct processor
*this_tune
16527 = aarch64_get_tune_cpu (targ_options
->x_explicit_tune_core
);
16529 if (flag_debug_asm
&& aarch64_last_printed_tune_string
!= this_tune
->name
)
16531 asm_fprintf (asm_out_file
, "\t" ASM_COMMENT_START
".tune %s\n",
16533 aarch64_last_printed_tune_string
= this_tune
->name
;
16536 aarch64_asm_output_variant_pcs (stream
, fndecl
, name
);
16538 /* Don't forget the type directive for ELF. */
16539 ASM_OUTPUT_TYPE_DIRECTIVE (stream
, name
, "function");
16540 ASM_OUTPUT_LABEL (stream
, name
);
16543 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
16546 aarch64_asm_output_alias (FILE *stream
, const tree decl
, const tree target
)
16548 const char *name
= XSTR (XEXP (DECL_RTL (decl
), 0), 0);
16549 const char *value
= IDENTIFIER_POINTER (target
);
16550 aarch64_asm_output_variant_pcs (stream
, decl
, name
);
16551 ASM_OUTPUT_DEF (stream
, name
, value
);
16554 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
16555 function symbol references. */
16558 aarch64_asm_output_external (FILE *stream
, tree decl
, const char* name
)
16560 default_elf_asm_output_external (stream
, decl
, name
);
16561 aarch64_asm_output_variant_pcs (stream
, decl
, name
);
16564 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
16565 Used to output the .cfi_b_key_frame directive when signing the current
16566 function with the B key. */
16569 aarch64_post_cfi_startproc (FILE *f
, tree ignored ATTRIBUTE_UNUSED
)
16571 if (cfun
->machine
->frame
.laid_out
&& aarch64_return_address_signing_enabled ()
16572 && aarch64_ra_sign_key
== AARCH64_KEY_B
)
16573 asm_fprintf (f
, "\t.cfi_b_key_frame\n");
16576 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
16579 aarch64_start_file (void)
16581 struct cl_target_option
*default_options
16582 = TREE_TARGET_OPTION (target_option_default_node
);
16584 const struct processor
*default_arch
16585 = aarch64_get_arch (default_options
->x_explicit_arch
);
16586 uint64_t default_isa_flags
= default_options
->x_aarch64_isa_flags
;
16587 std::string extension
16588 = aarch64_get_extension_string_for_isa_flags (default_isa_flags
,
16589 default_arch
->flags
);
16591 aarch64_last_printed_arch_string
= default_arch
->name
+ extension
;
16592 aarch64_last_printed_tune_string
= "";
16593 asm_fprintf (asm_out_file
, "\t.arch %s\n",
16594 aarch64_last_printed_arch_string
.c_str ());
16596 default_file_start ();
16599 /* Emit load exclusive. */
16602 aarch64_emit_load_exclusive (machine_mode mode
, rtx rval
,
16603 rtx mem
, rtx model_rtx
)
16605 emit_insn (gen_aarch64_load_exclusive (mode
, rval
, mem
, model_rtx
));
16608 /* Emit store exclusive. */
16611 aarch64_emit_store_exclusive (machine_mode mode
, rtx bval
,
16612 rtx rval
, rtx mem
, rtx model_rtx
)
16614 emit_insn (gen_aarch64_store_exclusive (mode
, bval
, rval
, mem
, model_rtx
));
16617 /* Mark the previous jump instruction as unlikely. */
16620 aarch64_emit_unlikely_jump (rtx insn
)
16622 rtx_insn
*jump
= emit_jump_insn (insn
);
16623 add_reg_br_prob_note (jump
, profile_probability::very_unlikely ());
16626 /* Expand a compare and swap pattern. */
16629 aarch64_expand_compare_and_swap (rtx operands
[])
16631 rtx bval
, rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
, x
, cc_reg
;
16632 machine_mode mode
, r_mode
;
16634 bval
= operands
[0];
16635 rval
= operands
[1];
16637 oldval
= operands
[3];
16638 newval
= operands
[4];
16639 is_weak
= operands
[5];
16640 mod_s
= operands
[6];
16641 mod_f
= operands
[7];
16642 mode
= GET_MODE (mem
);
16644 /* Normally the succ memory model must be stronger than fail, but in the
16645 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
16646 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
16647 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f
)))
16648 && is_mm_release (memmodel_from_int (INTVAL (mod_s
))))
16649 mod_s
= GEN_INT (MEMMODEL_ACQ_REL
);
16652 if (mode
== QImode
|| mode
== HImode
)
16655 rval
= gen_reg_rtx (r_mode
);
16660 /* The CAS insn requires oldval and rval overlap, but we need to
16661 have a copy of oldval saved across the operation to tell if
16662 the operation is successful. */
16663 if (reg_overlap_mentioned_p (rval
, oldval
))
16664 rval
= copy_to_mode_reg (r_mode
, oldval
);
16666 emit_move_insn (rval
, gen_lowpart (r_mode
, oldval
));
16668 emit_insn (gen_aarch64_compare_and_swap_lse (mode
, rval
, mem
,
16670 cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
16674 /* The oldval predicate varies by mode. Test it and force to reg. */
16675 insn_code code
= code_for_aarch64_compare_and_swap (mode
);
16676 if (!insn_data
[code
].operand
[2].predicate (oldval
, mode
))
16677 oldval
= force_reg (mode
, oldval
);
16679 emit_insn (GEN_FCN (code
) (rval
, mem
, oldval
, newval
,
16680 is_weak
, mod_s
, mod_f
));
16681 cc_reg
= gen_rtx_REG (CCmode
, CC_REGNUM
);
16684 if (r_mode
!= mode
)
16685 rval
= gen_lowpart (mode
, rval
);
16686 emit_move_insn (operands
[1], rval
);
16688 x
= gen_rtx_EQ (SImode
, cc_reg
, const0_rtx
);
16689 emit_insn (gen_rtx_SET (bval
, x
));
16692 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
16693 sequence implementing an atomic operation. */
16696 aarch64_emit_post_barrier (enum memmodel model
)
16698 const enum memmodel base_model
= memmodel_base (model
);
16700 if (is_mm_sync (model
)
16701 && (base_model
== MEMMODEL_ACQUIRE
16702 || base_model
== MEMMODEL_ACQ_REL
16703 || base_model
== MEMMODEL_SEQ_CST
))
16705 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST
)));
16709 /* Split a compare and swap pattern. */
16712 aarch64_split_compare_and_swap (rtx operands
[])
16714 rtx rval
, mem
, oldval
, newval
, scratch
;
16717 rtx_code_label
*label1
, *label2
;
16719 enum memmodel model
;
16722 rval
= operands
[0];
16724 oldval
= operands
[2];
16725 newval
= operands
[3];
16726 is_weak
= (operands
[4] != const0_rtx
);
16727 model_rtx
= operands
[5];
16728 scratch
= operands
[7];
16729 mode
= GET_MODE (mem
);
16730 model
= memmodel_from_int (INTVAL (model_rtx
));
16732 /* When OLDVAL is zero and we want the strong version we can emit a tighter
16735 LD[A]XR rval, [mem]
16737 ST[L]XR scratch, newval, [mem]
16738 CBNZ scratch, .label1
16741 bool strong_zero_p
= !is_weak
&& oldval
== const0_rtx
;
16746 label1
= gen_label_rtx ();
16747 emit_label (label1
);
16749 label2
= gen_label_rtx ();
16751 /* The initial load can be relaxed for a __sync operation since a final
16752 barrier will be emitted to stop code hoisting. */
16753 if (is_mm_sync (model
))
16754 aarch64_emit_load_exclusive (mode
, rval
, mem
,
16755 GEN_INT (MEMMODEL_RELAXED
));
16757 aarch64_emit_load_exclusive (mode
, rval
, mem
, model_rtx
);
16761 if (aarch64_track_speculation
)
16763 /* Emit an explicit compare instruction, so that we can correctly
16764 track the condition codes. */
16765 rtx cc_reg
= aarch64_gen_compare_reg (NE
, rval
, const0_rtx
);
16766 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
16769 x
= gen_rtx_NE (VOIDmode
, rval
, const0_rtx
);
16771 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
16772 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
16773 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
16777 cond
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
16778 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
16779 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
16780 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
16781 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
16784 aarch64_emit_store_exclusive (mode
, scratch
, mem
, newval
, model_rtx
);
16788 if (aarch64_track_speculation
)
16790 /* Emit an explicit compare instruction, so that we can correctly
16791 track the condition codes. */
16792 rtx cc_reg
= aarch64_gen_compare_reg (NE
, scratch
, const0_rtx
);
16793 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
16796 x
= gen_rtx_NE (VOIDmode
, scratch
, const0_rtx
);
16798 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
16799 gen_rtx_LABEL_REF (Pmode
, label1
), pc_rtx
);
16800 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
16804 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
16805 x
= gen_rtx_COMPARE (CCmode
, scratch
, const0_rtx
);
16806 emit_insn (gen_rtx_SET (cond
, x
));
16809 emit_label (label2
);
16810 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
16811 to set the condition flags. If this is not used it will be removed by
16815 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
16816 x
= gen_rtx_COMPARE (CCmode
, rval
, const0_rtx
);
16817 emit_insn (gen_rtx_SET (cond
, x
));
16819 /* Emit any final barrier needed for a __sync operation. */
16820 if (is_mm_sync (model
))
16821 aarch64_emit_post_barrier (model
);
16824 /* Split an atomic operation. */
16827 aarch64_split_atomic_op (enum rtx_code code
, rtx old_out
, rtx new_out
, rtx mem
,
16828 rtx value
, rtx model_rtx
, rtx cond
)
16830 machine_mode mode
= GET_MODE (mem
);
16831 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
16832 const enum memmodel model
= memmodel_from_int (INTVAL (model_rtx
));
16833 const bool is_sync
= is_mm_sync (model
);
16834 rtx_code_label
*label
;
16837 /* Split the atomic operation into a sequence. */
16838 label
= gen_label_rtx ();
16839 emit_label (label
);
16842 new_out
= gen_lowpart (wmode
, new_out
);
16844 old_out
= gen_lowpart (wmode
, old_out
);
16847 value
= simplify_gen_subreg (wmode
, value
, mode
, 0);
16849 /* The initial load can be relaxed for a __sync operation since a final
16850 barrier will be emitted to stop code hoisting. */
16852 aarch64_emit_load_exclusive (mode
, old_out
, mem
,
16853 GEN_INT (MEMMODEL_RELAXED
));
16855 aarch64_emit_load_exclusive (mode
, old_out
, mem
, model_rtx
);
16864 x
= gen_rtx_AND (wmode
, old_out
, value
);
16865 emit_insn (gen_rtx_SET (new_out
, x
));
16866 x
= gen_rtx_NOT (wmode
, new_out
);
16867 emit_insn (gen_rtx_SET (new_out
, x
));
16871 if (CONST_INT_P (value
))
16873 value
= GEN_INT (-INTVAL (value
));
16876 /* Fall through. */
16879 x
= gen_rtx_fmt_ee (code
, wmode
, old_out
, value
);
16880 emit_insn (gen_rtx_SET (new_out
, x
));
16884 aarch64_emit_store_exclusive (mode
, cond
, mem
,
16885 gen_lowpart (mode
, new_out
), model_rtx
);
16887 if (aarch64_track_speculation
)
16889 /* Emit an explicit compare instruction, so that we can correctly
16890 track the condition codes. */
16891 rtx cc_reg
= aarch64_gen_compare_reg (NE
, cond
, const0_rtx
);
16892 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
16895 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
16897 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
16898 gen_rtx_LABEL_REF (Pmode
, label
), pc_rtx
);
16899 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
16901 /* Emit any final barrier needed for a __sync operation. */
16903 aarch64_emit_post_barrier (model
);
16907 aarch64_init_libfuncs (void)
16909 /* Half-precision float operations. The compiler handles all operations
16910 with NULL libfuncs by converting to SFmode. */
16913 set_conv_libfunc (trunc_optab
, HFmode
, SFmode
, "__gnu_f2h_ieee");
16914 set_conv_libfunc (sext_optab
, SFmode
, HFmode
, "__gnu_h2f_ieee");
16917 set_optab_libfunc (add_optab
, HFmode
, NULL
);
16918 set_optab_libfunc (sdiv_optab
, HFmode
, NULL
);
16919 set_optab_libfunc (smul_optab
, HFmode
, NULL
);
16920 set_optab_libfunc (neg_optab
, HFmode
, NULL
);
16921 set_optab_libfunc (sub_optab
, HFmode
, NULL
);
16924 set_optab_libfunc (eq_optab
, HFmode
, NULL
);
16925 set_optab_libfunc (ne_optab
, HFmode
, NULL
);
16926 set_optab_libfunc (lt_optab
, HFmode
, NULL
);
16927 set_optab_libfunc (le_optab
, HFmode
, NULL
);
16928 set_optab_libfunc (ge_optab
, HFmode
, NULL
);
16929 set_optab_libfunc (gt_optab
, HFmode
, NULL
);
16930 set_optab_libfunc (unord_optab
, HFmode
, NULL
);
16933 /* Target hook for c_mode_for_suffix. */
16934 static machine_mode
16935 aarch64_c_mode_for_suffix (char suffix
)
16943 /* We can only represent floating point constants which will fit in
16944 "quarter-precision" values. These values are characterised by
16945 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
16948 (-1)^s * (n/16) * 2^r
16951 's' is the sign bit.
16952 'n' is an integer in the range 16 <= n <= 31.
16953 'r' is an integer in the range -3 <= r <= 4. */
16955 /* Return true iff X can be represented by a quarter-precision
16956 floating point immediate operand X. Note, we cannot represent 0.0. */
16958 aarch64_float_const_representable_p (rtx x
)
16960 /* This represents our current view of how many bits
16961 make up the mantissa. */
16962 int point_pos
= 2 * HOST_BITS_PER_WIDE_INT
- 1;
16964 unsigned HOST_WIDE_INT mantissa
, mask
;
16965 REAL_VALUE_TYPE r
, m
;
16968 if (!CONST_DOUBLE_P (x
))
16971 if (GET_MODE (x
) == VOIDmode
16972 || (GET_MODE (x
) == HFmode
&& !TARGET_FP_F16INST
))
16975 r
= *CONST_DOUBLE_REAL_VALUE (x
);
16977 /* We cannot represent infinities, NaNs or +/-zero. We won't
16978 know if we have +zero until we analyse the mantissa, but we
16979 can reject the other invalid values. */
16980 if (REAL_VALUE_ISINF (r
) || REAL_VALUE_ISNAN (r
)
16981 || REAL_VALUE_MINUS_ZERO (r
))
16984 /* Extract exponent. */
16985 r
= real_value_abs (&r
);
16986 exponent
= REAL_EXP (&r
);
16988 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
16989 highest (sign) bit, with a fixed binary point at bit point_pos.
16990 m1 holds the low part of the mantissa, m2 the high part.
16991 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
16992 bits for the mantissa, this can fail (low bits will be lost). */
16993 real_ldexp (&m
, &r
, point_pos
- exponent
);
16994 wide_int w
= real_to_integer (&m
, &fail
, HOST_BITS_PER_WIDE_INT
* 2);
16996 /* If the low part of the mantissa has bits set we cannot represent
16998 if (w
.ulow () != 0)
17000 /* We have rejected the lower HOST_WIDE_INT, so update our
17001 understanding of how many bits lie in the mantissa and
17002 look only at the high HOST_WIDE_INT. */
17003 mantissa
= w
.elt (1);
17004 point_pos
-= HOST_BITS_PER_WIDE_INT
;
17006 /* We can only represent values with a mantissa of the form 1.xxxx. */
17007 mask
= ((unsigned HOST_WIDE_INT
)1 << (point_pos
- 5)) - 1;
17008 if ((mantissa
& mask
) != 0)
17011 /* Having filtered unrepresentable values, we may now remove all
17012 but the highest 5 bits. */
17013 mantissa
>>= point_pos
- 5;
17015 /* We cannot represent the value 0.0, so reject it. This is handled
17020 /* Then, as bit 4 is always set, we can mask it off, leaving
17021 the mantissa in the range [0, 15]. */
17022 mantissa
&= ~(1 << 4);
17023 gcc_assert (mantissa
<= 15);
17025 /* GCC internally does not use IEEE754-like encoding (where normalized
17026 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
17027 Our mantissa values are shifted 4 places to the left relative to
17028 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
17029 by 5 places to correct for GCC's representation. */
17030 exponent
= 5 - exponent
;
17032 return (exponent
>= 0 && exponent
<= 7);
17035 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
17036 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
17037 output MOVI/MVNI, ORR or BIC immediate. */
17039 aarch64_output_simd_mov_immediate (rtx const_vector
, unsigned width
,
17040 enum simd_immediate_check which
)
17043 static char templ
[40];
17044 const char *mnemonic
;
17045 const char *shift_op
;
17046 unsigned int lane_count
= 0;
17049 struct simd_immediate_info info
;
17051 /* This will return true to show const_vector is legal for use as either
17052 a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
17053 It will also update INFO to show how the immediate should be generated.
17054 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
17055 is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
, which
);
17056 gcc_assert (is_valid
);
17058 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
17059 lane_count
= width
/ GET_MODE_BITSIZE (info
.elt_mode
);
17061 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
17063 gcc_assert (info
.insn
== simd_immediate_info::MOV
17064 && info
.u
.mov
.shift
== 0);
17065 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
17066 move immediate path. */
17067 if (aarch64_float_const_zero_rtx_p (info
.u
.mov
.value
))
17068 info
.u
.mov
.value
= GEN_INT (0);
17071 const unsigned int buf_size
= 20;
17072 char float_buf
[buf_size
] = {'\0'};
17073 real_to_decimal_for_mode (float_buf
,
17074 CONST_DOUBLE_REAL_VALUE (info
.u
.mov
.value
),
17075 buf_size
, buf_size
, 1, info
.elt_mode
);
17077 if (lane_count
== 1)
17078 snprintf (templ
, sizeof (templ
), "fmov\t%%d0, %s", float_buf
);
17080 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%d%c, %s",
17081 lane_count
, element_char
, float_buf
);
17086 gcc_assert (CONST_INT_P (info
.u
.mov
.value
));
17088 if (which
== AARCH64_CHECK_MOV
)
17090 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "mvni" : "movi";
17091 shift_op
= (info
.u
.mov
.modifier
== simd_immediate_info::MSL
17093 if (lane_count
== 1)
17094 snprintf (templ
, sizeof (templ
), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX
,
17095 mnemonic
, UINTVAL (info
.u
.mov
.value
));
17096 else if (info
.u
.mov
.shift
)
17097 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
17098 HOST_WIDE_INT_PRINT_HEX
", %s %d", mnemonic
, lane_count
,
17099 element_char
, UINTVAL (info
.u
.mov
.value
), shift_op
,
17102 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
17103 HOST_WIDE_INT_PRINT_HEX
, mnemonic
, lane_count
,
17104 element_char
, UINTVAL (info
.u
.mov
.value
));
17108 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
17109 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "bic" : "orr";
17110 if (info
.u
.mov
.shift
)
17111 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
17112 HOST_WIDE_INT_PRINT_DEC
", %s #%d", mnemonic
, lane_count
,
17113 element_char
, UINTVAL (info
.u
.mov
.value
), "lsl",
17116 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
17117 HOST_WIDE_INT_PRINT_DEC
, mnemonic
, lane_count
,
17118 element_char
, UINTVAL (info
.u
.mov
.value
));
17124 aarch64_output_scalar_simd_mov_immediate (rtx immediate
, scalar_int_mode mode
)
17127 /* If a floating point number was passed and we desire to use it in an
17128 integer mode do the conversion to integer. */
17129 if (CONST_DOUBLE_P (immediate
) && GET_MODE_CLASS (mode
) == MODE_INT
)
17131 unsigned HOST_WIDE_INT ival
;
17132 if (!aarch64_reinterpret_float_as_int (immediate
, &ival
))
17133 gcc_unreachable ();
17134 immediate
= gen_int_mode (ival
, mode
);
17137 machine_mode vmode
;
17138 /* use a 64 bit mode for everything except for DI/DF mode, where we use
17139 a 128 bit vector mode. */
17140 int width
= GET_MODE_BITSIZE (mode
) == 64 ? 128 : 64;
17142 vmode
= aarch64_simd_container_mode (mode
, width
);
17143 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (immediate
));
17144 return aarch64_output_simd_mov_immediate (v_op
, width
);
17147 /* Return the output string to use for moving immediate CONST_VECTOR
17148 into an SVE register. */
17151 aarch64_output_sve_mov_immediate (rtx const_vector
)
17153 static char templ
[40];
17154 struct simd_immediate_info info
;
17157 bool is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
);
17158 gcc_assert (is_valid
);
17160 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
17162 machine_mode vec_mode
= GET_MODE (const_vector
);
17163 if (aarch64_sve_pred_mode_p (vec_mode
))
17165 static char buf
[sizeof ("ptrue\t%0.N, vlNNNNN")];
17166 if (info
.insn
== simd_immediate_info::MOV
)
17168 gcc_assert (info
.u
.mov
.value
== const0_rtx
);
17169 snprintf (buf
, sizeof (buf
), "pfalse\t%%0.b");
17173 gcc_assert (info
.insn
== simd_immediate_info::PTRUE
);
17174 unsigned int total_bytes
;
17175 if (info
.u
.pattern
== AARCH64_SV_ALL
17176 && BYTES_PER_SVE_VECTOR
.is_constant (&total_bytes
))
17177 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, vl%d", element_char
,
17178 total_bytes
/ GET_MODE_SIZE (info
.elt_mode
));
17180 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, %s", element_char
,
17181 svpattern_token (info
.u
.pattern
));
17186 if (info
.insn
== simd_immediate_info::INDEX
)
17188 snprintf (templ
, sizeof (templ
), "index\t%%0.%c, #"
17189 HOST_WIDE_INT_PRINT_DEC
", #" HOST_WIDE_INT_PRINT_DEC
,
17190 element_char
, INTVAL (info
.u
.index
.base
),
17191 INTVAL (info
.u
.index
.step
));
17195 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
17197 if (aarch64_float_const_zero_rtx_p (info
.u
.mov
.value
))
17198 info
.u
.mov
.value
= GEN_INT (0);
17201 const int buf_size
= 20;
17202 char float_buf
[buf_size
] = {};
17203 real_to_decimal_for_mode (float_buf
,
17204 CONST_DOUBLE_REAL_VALUE (info
.u
.mov
.value
),
17205 buf_size
, buf_size
, 1, info
.elt_mode
);
17207 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%c, #%s",
17208 element_char
, float_buf
);
17213 snprintf (templ
, sizeof (templ
), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC
,
17214 element_char
, INTVAL (info
.u
.mov
.value
));
17218 /* Split operands into moves from op[1] + op[2] into op[0]. */
17221 aarch64_split_combinev16qi (rtx operands
[3])
17223 unsigned int dest
= REGNO (operands
[0]);
17224 unsigned int src1
= REGNO (operands
[1]);
17225 unsigned int src2
= REGNO (operands
[2]);
17226 machine_mode halfmode
= GET_MODE (operands
[1]);
17227 unsigned int halfregs
= REG_NREGS (operands
[1]);
17228 rtx destlo
, desthi
;
17230 gcc_assert (halfmode
== V16QImode
);
17232 if (src1
== dest
&& src2
== dest
+ halfregs
)
17234 /* No-op move. Can't split to nothing; emit something. */
17235 emit_note (NOTE_INSN_DELETED
);
17239 /* Preserve register attributes for variable tracking. */
17240 destlo
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
, 0);
17241 desthi
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
+ halfregs
,
17242 GET_MODE_SIZE (halfmode
));
17244 /* Special case of reversed high/low parts. */
17245 if (reg_overlap_mentioned_p (operands
[2], destlo
)
17246 && reg_overlap_mentioned_p (operands
[1], desthi
))
17248 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
17249 emit_insn (gen_xorv16qi3 (operands
[2], operands
[1], operands
[2]));
17250 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
17252 else if (!reg_overlap_mentioned_p (operands
[2], destlo
))
17254 /* Try to avoid unnecessary moves if part of the result
17255 is in the right place already. */
17257 emit_move_insn (destlo
, operands
[1]);
17258 if (src2
!= dest
+ halfregs
)
17259 emit_move_insn (desthi
, operands
[2]);
17263 if (src2
!= dest
+ halfregs
)
17264 emit_move_insn (desthi
, operands
[2]);
17266 emit_move_insn (destlo
, operands
[1]);
17270 /* vec_perm support. */
17272 struct expand_vec_perm_d
17274 rtx target
, op0
, op1
;
17275 vec_perm_indices perm
;
17276 machine_mode vmode
;
17277 unsigned int vec_flags
;
17282 /* Generate a variable permutation. */
17285 aarch64_expand_vec_perm_1 (rtx target
, rtx op0
, rtx op1
, rtx sel
)
17287 machine_mode vmode
= GET_MODE (target
);
17288 bool one_vector_p
= rtx_equal_p (op0
, op1
);
17290 gcc_checking_assert (vmode
== V8QImode
|| vmode
== V16QImode
);
17291 gcc_checking_assert (GET_MODE (op0
) == vmode
);
17292 gcc_checking_assert (GET_MODE (op1
) == vmode
);
17293 gcc_checking_assert (GET_MODE (sel
) == vmode
);
17294 gcc_checking_assert (TARGET_SIMD
);
17298 if (vmode
== V8QImode
)
17300 /* Expand the argument to a V16QI mode by duplicating it. */
17301 rtx pair
= gen_reg_rtx (V16QImode
);
17302 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op0
));
17303 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
17307 emit_insn (gen_aarch64_tbl1v16qi (target
, op0
, sel
));
17314 if (vmode
== V8QImode
)
17316 pair
= gen_reg_rtx (V16QImode
);
17317 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op1
));
17318 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
17322 pair
= gen_reg_rtx (OImode
);
17323 emit_insn (gen_aarch64_combinev16qi (pair
, op0
, op1
));
17324 emit_insn (gen_aarch64_tbl2v16qi (target
, pair
, sel
));
17329 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
17330 NELT is the number of elements in the vector. */
17333 aarch64_expand_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
,
17336 machine_mode vmode
= GET_MODE (target
);
17337 bool one_vector_p
= rtx_equal_p (op0
, op1
);
17340 /* The TBL instruction does not use a modulo index, so we must take care
17341 of that ourselves. */
17342 mask
= aarch64_simd_gen_const_vector_dup (vmode
,
17343 one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
17344 sel
= expand_simple_binop (vmode
, AND
, sel
, mask
, NULL
, 0, OPTAB_LIB_WIDEN
);
17346 /* For big-endian, we also need to reverse the index within the vector
17347 (but not which vector). */
17348 if (BYTES_BIG_ENDIAN
)
17350 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
17352 mask
= aarch64_simd_gen_const_vector_dup (vmode
, nelt
- 1);
17353 sel
= expand_simple_binop (vmode
, XOR
, sel
, mask
,
17354 NULL
, 0, OPTAB_LIB_WIDEN
);
17356 aarch64_expand_vec_perm_1 (target
, op0
, op1
, sel
);
17359 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
17362 emit_unspec2 (rtx target
, int code
, rtx op0
, rtx op1
)
17364 emit_insn (gen_rtx_SET (target
,
17365 gen_rtx_UNSPEC (GET_MODE (target
),
17366 gen_rtvec (2, op0
, op1
), code
)));
17369 /* Expand an SVE vec_perm with the given operands. */
17372 aarch64_expand_sve_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
)
17374 machine_mode data_mode
= GET_MODE (target
);
17375 machine_mode sel_mode
= GET_MODE (sel
);
17376 /* Enforced by the pattern condition. */
17377 int nunits
= GET_MODE_NUNITS (sel_mode
).to_constant ();
17379 /* Note: vec_perm indices are supposed to wrap when they go beyond the
17380 size of the two value vectors, i.e. the upper bits of the indices
17381 are effectively ignored. SVE TBL instead produces 0 for any
17382 out-of-range indices, so we need to modulo all the vec_perm indices
17383 to ensure they are all in range. */
17384 rtx sel_reg
= force_reg (sel_mode
, sel
);
17386 /* Check if the sel only references the first values vector. */
17387 if (GET_CODE (sel
) == CONST_VECTOR
17388 && aarch64_const_vec_all_in_range_p (sel
, 0, nunits
- 1))
17390 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_reg
);
17394 /* Check if the two values vectors are the same. */
17395 if (rtx_equal_p (op0
, op1
))
17397 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
, nunits
- 1);
17398 rtx sel_mod
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
17399 NULL
, 0, OPTAB_DIRECT
);
17400 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_mod
);
17404 /* Run TBL on for each value vector and combine the results. */
17406 rtx res0
= gen_reg_rtx (data_mode
);
17407 rtx res1
= gen_reg_rtx (data_mode
);
17408 rtx neg_num_elems
= aarch64_simd_gen_const_vector_dup (sel_mode
, -nunits
);
17409 if (GET_CODE (sel
) != CONST_VECTOR
17410 || !aarch64_const_vec_all_in_range_p (sel
, 0, 2 * nunits
- 1))
17412 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
,
17414 sel_reg
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
17415 NULL
, 0, OPTAB_DIRECT
);
17417 emit_unspec2 (res0
, UNSPEC_TBL
, op0
, sel_reg
);
17418 rtx sel_sub
= expand_simple_binop (sel_mode
, PLUS
, sel_reg
, neg_num_elems
,
17419 NULL
, 0, OPTAB_DIRECT
);
17420 emit_unspec2 (res1
, UNSPEC_TBL
, op1
, sel_sub
);
17421 if (GET_MODE_CLASS (data_mode
) == MODE_VECTOR_INT
)
17422 emit_insn (gen_rtx_SET (target
, gen_rtx_IOR (data_mode
, res0
, res1
)));
17424 emit_unspec2 (target
, UNSPEC_IORF
, res0
, res1
);
17427 /* Recognize patterns suitable for the TRN instructions. */
17429 aarch64_evpc_trn (struct expand_vec_perm_d
*d
)
17432 poly_uint64 nelt
= d
->perm
.length ();
17433 rtx out
, in0
, in1
, x
;
17434 machine_mode vmode
= d
->vmode
;
17436 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
17439 /* Note that these are little-endian tests.
17440 We correct for big-endian later. */
17441 if (!d
->perm
[0].is_constant (&odd
)
17442 || (odd
!= 0 && odd
!= 1)
17443 || !d
->perm
.series_p (0, 2, odd
, 2)
17444 || !d
->perm
.series_p (1, 2, nelt
+ odd
, 2))
17453 /* We don't need a big-endian lane correction for SVE; see the comment
17454 at the head of aarch64-sve.md for details. */
17455 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
17457 x
= in0
, in0
= in1
, in1
= x
;
17462 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
17463 odd
? UNSPEC_TRN2
: UNSPEC_TRN1
));
17467 /* Recognize patterns suitable for the UZP instructions. */
17469 aarch64_evpc_uzp (struct expand_vec_perm_d
*d
)
17472 rtx out
, in0
, in1
, x
;
17473 machine_mode vmode
= d
->vmode
;
17475 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
17478 /* Note that these are little-endian tests.
17479 We correct for big-endian later. */
17480 if (!d
->perm
[0].is_constant (&odd
)
17481 || (odd
!= 0 && odd
!= 1)
17482 || !d
->perm
.series_p (0, 1, odd
, 2))
17491 /* We don't need a big-endian lane correction for SVE; see the comment
17492 at the head of aarch64-sve.md for details. */
17493 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
17495 x
= in0
, in0
= in1
, in1
= x
;
17500 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
17501 odd
? UNSPEC_UZP2
: UNSPEC_UZP1
));
17505 /* Recognize patterns suitable for the ZIP instructions. */
17507 aarch64_evpc_zip (struct expand_vec_perm_d
*d
)
17510 poly_uint64 nelt
= d
->perm
.length ();
17511 rtx out
, in0
, in1
, x
;
17512 machine_mode vmode
= d
->vmode
;
17514 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
17517 /* Note that these are little-endian tests.
17518 We correct for big-endian later. */
17519 poly_uint64 first
= d
->perm
[0];
17520 if ((maybe_ne (first
, 0U) && maybe_ne (first
* 2, nelt
))
17521 || !d
->perm
.series_p (0, 2, first
, 1)
17522 || !d
->perm
.series_p (1, 2, first
+ nelt
, 1))
17524 high
= maybe_ne (first
, 0U);
17532 /* We don't need a big-endian lane correction for SVE; see the comment
17533 at the head of aarch64-sve.md for details. */
17534 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
17536 x
= in0
, in0
= in1
, in1
= x
;
17541 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
17542 high
? UNSPEC_ZIP2
: UNSPEC_ZIP1
));
17546 /* Recognize patterns for the EXT insn. */
17549 aarch64_evpc_ext (struct expand_vec_perm_d
*d
)
17551 HOST_WIDE_INT location
;
17554 /* The first element always refers to the first vector.
17555 Check if the extracted indices are increasing by one. */
17556 if (d
->vec_flags
== VEC_SVE_PRED
17557 || !d
->perm
[0].is_constant (&location
)
17558 || !d
->perm
.series_p (0, 1, location
, 1))
17565 /* The case where (location == 0) is a no-op for both big- and little-endian,
17566 and is removed by the mid-end at optimization levels -O1 and higher.
17568 We don't need a big-endian lane correction for SVE; see the comment
17569 at the head of aarch64-sve.md for details. */
17570 if (BYTES_BIG_ENDIAN
&& location
!= 0 && d
->vec_flags
== VEC_ADVSIMD
)
17572 /* After setup, we want the high elements of the first vector (stored
17573 at the LSB end of the register), and the low elements of the second
17574 vector (stored at the MSB end of the register). So swap. */
17575 std::swap (d
->op0
, d
->op1
);
17576 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
17577 to_constant () is safe since this is restricted to Advanced SIMD
17579 location
= d
->perm
.length ().to_constant () - location
;
17582 offset
= GEN_INT (location
);
17583 emit_set_insn (d
->target
,
17584 gen_rtx_UNSPEC (d
->vmode
,
17585 gen_rtvec (3, d
->op0
, d
->op1
, offset
),
17590 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
17591 within each 64-bit, 32-bit or 16-bit granule. */
17594 aarch64_evpc_rev_local (struct expand_vec_perm_d
*d
)
17596 HOST_WIDE_INT diff
;
17597 unsigned int i
, size
, unspec
;
17598 machine_mode pred_mode
;
17600 if (d
->vec_flags
== VEC_SVE_PRED
17601 || !d
->one_vector_p
17602 || !d
->perm
[0].is_constant (&diff
))
17605 size
= (diff
+ 1) * GET_MODE_UNIT_SIZE (d
->vmode
);
17608 unspec
= UNSPEC_REV64
;
17609 pred_mode
= VNx2BImode
;
17611 else if (size
== 4)
17613 unspec
= UNSPEC_REV32
;
17614 pred_mode
= VNx4BImode
;
17616 else if (size
== 2)
17618 unspec
= UNSPEC_REV16
;
17619 pred_mode
= VNx8BImode
;
17624 unsigned int step
= diff
+ 1;
17625 for (i
= 0; i
< step
; ++i
)
17626 if (!d
->perm
.series_p (i
, step
, diff
- i
, step
))
17633 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), unspec
);
17634 if (d
->vec_flags
== VEC_SVE_DATA
)
17636 rtx pred
= aarch64_ptrue_reg (pred_mode
);
17637 src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (2, pred
, src
),
17640 emit_set_insn (d
->target
, src
);
17644 /* Recognize patterns for the REV insn, which reverses elements within
17648 aarch64_evpc_rev_global (struct expand_vec_perm_d
*d
)
17650 poly_uint64 nelt
= d
->perm
.length ();
17652 if (!d
->one_vector_p
|| d
->vec_flags
!= VEC_SVE_DATA
)
17655 if (!d
->perm
.series_p (0, 1, nelt
- 1, -1))
17662 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), UNSPEC_REV
);
17663 emit_set_insn (d
->target
, src
);
17668 aarch64_evpc_dup (struct expand_vec_perm_d
*d
)
17670 rtx out
= d
->target
;
17673 machine_mode vmode
= d
->vmode
;
17676 if (d
->vec_flags
== VEC_SVE_PRED
17677 || d
->perm
.encoding ().encoded_nelts () != 1
17678 || !d
->perm
[0].is_constant (&elt
))
17681 if (d
->vec_flags
== VEC_SVE_DATA
&& elt
>= 64 * GET_MODE_UNIT_SIZE (vmode
))
17688 /* The generic preparation in aarch64_expand_vec_perm_const_1
17689 swaps the operand order and the permute indices if it finds
17690 d->perm[0] to be in the second operand. Thus, we can always
17691 use d->op0 and need not do any extra arithmetic to get the
17692 correct lane number. */
17694 lane
= GEN_INT (elt
); /* The pattern corrects for big-endian. */
17696 rtx parallel
= gen_rtx_PARALLEL (vmode
, gen_rtvec (1, lane
));
17697 rtx select
= gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode
), in0
, parallel
);
17698 emit_set_insn (out
, gen_rtx_VEC_DUPLICATE (vmode
, select
));
17703 aarch64_evpc_tbl (struct expand_vec_perm_d
*d
)
17705 rtx rperm
[MAX_COMPILE_TIME_VEC_BYTES
], sel
;
17706 machine_mode vmode
= d
->vmode
;
17708 /* Make sure that the indices are constant. */
17709 unsigned int encoded_nelts
= d
->perm
.encoding ().encoded_nelts ();
17710 for (unsigned int i
= 0; i
< encoded_nelts
; ++i
)
17711 if (!d
->perm
[i
].is_constant ())
17717 /* Generic code will try constant permutation twice. Once with the
17718 original mode and again with the elements lowered to QImode.
17719 So wait and don't do the selector expansion ourselves. */
17720 if (vmode
!= V8QImode
&& vmode
!= V16QImode
)
17723 /* to_constant is safe since this routine is specific to Advanced SIMD
17725 unsigned int nelt
= d
->perm
.length ().to_constant ();
17726 for (unsigned int i
= 0; i
< nelt
; ++i
)
17727 /* If big-endian and two vectors we end up with a weird mixed-endian
17728 mode on NEON. Reverse the index within each word but not the word
17729 itself. to_constant is safe because we checked is_constant above. */
17730 rperm
[i
] = GEN_INT (BYTES_BIG_ENDIAN
17731 ? d
->perm
[i
].to_constant () ^ (nelt
- 1)
17732 : d
->perm
[i
].to_constant ());
17734 sel
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
17735 sel
= force_reg (vmode
, sel
);
17737 aarch64_expand_vec_perm_1 (d
->target
, d
->op0
, d
->op1
, sel
);
17741 /* Try to implement D using an SVE TBL instruction. */
17744 aarch64_evpc_sve_tbl (struct expand_vec_perm_d
*d
)
17746 unsigned HOST_WIDE_INT nelt
;
17748 /* Permuting two variable-length vectors could overflow the
17750 if (!d
->one_vector_p
&& !d
->perm
.length ().is_constant (&nelt
))
17756 machine_mode sel_mode
= mode_for_int_vector (d
->vmode
).require ();
17757 rtx sel
= vec_perm_indices_to_rtx (sel_mode
, d
->perm
);
17758 if (d
->one_vector_p
)
17759 emit_unspec2 (d
->target
, UNSPEC_TBL
, d
->op0
, force_reg (sel_mode
, sel
));
17761 aarch64_expand_sve_vec_perm (d
->target
, d
->op0
, d
->op1
, sel
);
17766 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d
*d
)
17768 /* The pattern matching functions above are written to look for a small
17769 number to begin the sequence (0, 1, N/2). If we begin with an index
17770 from the second operand, we can swap the operands. */
17771 poly_int64 nelt
= d
->perm
.length ();
17772 if (known_ge (d
->perm
[0], nelt
))
17774 d
->perm
.rotate_inputs (1);
17775 std::swap (d
->op0
, d
->op1
);
17778 if ((d
->vec_flags
== VEC_ADVSIMD
17779 || d
->vec_flags
== VEC_SVE_DATA
17780 || d
->vec_flags
== VEC_SVE_PRED
)
17781 && known_gt (nelt
, 1))
17783 if (aarch64_evpc_rev_local (d
))
17785 else if (aarch64_evpc_rev_global (d
))
17787 else if (aarch64_evpc_ext (d
))
17789 else if (aarch64_evpc_dup (d
))
17791 else if (aarch64_evpc_zip (d
))
17793 else if (aarch64_evpc_uzp (d
))
17795 else if (aarch64_evpc_trn (d
))
17797 if (d
->vec_flags
== VEC_SVE_DATA
)
17798 return aarch64_evpc_sve_tbl (d
);
17799 else if (d
->vec_flags
== VEC_ADVSIMD
)
17800 return aarch64_evpc_tbl (d
);
17805 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
17808 aarch64_vectorize_vec_perm_const (machine_mode vmode
, rtx target
, rtx op0
,
17809 rtx op1
, const vec_perm_indices
&sel
)
17811 struct expand_vec_perm_d d
;
17813 /* Check whether the mask can be applied to a single vector. */
17814 if (sel
.ninputs () == 1
17815 || (op0
&& rtx_equal_p (op0
, op1
)))
17816 d
.one_vector_p
= true;
17817 else if (sel
.all_from_input_p (0))
17819 d
.one_vector_p
= true;
17822 else if (sel
.all_from_input_p (1))
17824 d
.one_vector_p
= true;
17828 d
.one_vector_p
= false;
17830 d
.perm
.new_vector (sel
.encoding (), d
.one_vector_p
? 1 : 2,
17831 sel
.nelts_per_input ());
17833 d
.vec_flags
= aarch64_classify_vector_mode (d
.vmode
);
17837 d
.testing_p
= !target
;
17840 return aarch64_expand_vec_perm_const_1 (&d
);
17842 rtx_insn
*last
= get_last_insn ();
17843 bool ret
= aarch64_expand_vec_perm_const_1 (&d
);
17844 gcc_assert (last
== get_last_insn ());
17849 /* Generate a byte permute mask for a register of mode MODE,
17850 which has NUNITS units. */
17853 aarch64_reverse_mask (machine_mode mode
, unsigned int nunits
)
17855 /* We have to reverse each vector because we dont have
17856 a permuted load that can reverse-load according to ABI rules. */
17858 rtvec v
= rtvec_alloc (16);
17860 unsigned int usize
= GET_MODE_UNIT_SIZE (mode
);
17862 gcc_assert (BYTES_BIG_ENDIAN
);
17863 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode
));
17865 for (i
= 0; i
< nunits
; i
++)
17866 for (j
= 0; j
< usize
; j
++)
17867 RTVEC_ELT (v
, i
* usize
+ j
) = GEN_INT ((i
+ 1) * usize
- 1 - j
);
17868 mask
= gen_rtx_CONST_VECTOR (V16QImode
, v
);
17869 return force_reg (V16QImode
, mask
);
17872 /* Expand an SVE integer comparison using the SVE equivalent of:
17874 (set TARGET (CODE OP0 OP1)). */
17877 aarch64_expand_sve_vec_cmp_int (rtx target
, rtx_code code
, rtx op0
, rtx op1
)
17879 machine_mode pred_mode
= GET_MODE (target
);
17880 machine_mode data_mode
= GET_MODE (op0
);
17881 rtx res
= aarch64_sve_emit_int_cmp (target
, pred_mode
, code
, data_mode
,
17883 if (!rtx_equal_p (target
, res
))
17884 emit_move_insn (target
, res
);
17887 /* Return the UNSPEC_COND_* code for comparison CODE. */
17889 static unsigned int
17890 aarch64_unspec_cond_code (rtx_code code
)
17895 return UNSPEC_COND_FCMNE
;
17897 return UNSPEC_COND_FCMEQ
;
17899 return UNSPEC_COND_FCMLT
;
17901 return UNSPEC_COND_FCMGT
;
17903 return UNSPEC_COND_FCMLE
;
17905 return UNSPEC_COND_FCMGE
;
17907 return UNSPEC_COND_FCMUO
;
17909 gcc_unreachable ();
17915 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
17917 where <X> is the operation associated with comparison CODE.
17918 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
17921 aarch64_emit_sve_fp_cond (rtx target
, rtx_code code
, rtx pred
,
17922 bool known_ptrue_p
, rtx op0
, rtx op1
)
17924 rtx flag
= gen_int_mode (known_ptrue_p
, SImode
);
17925 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (pred
),
17926 gen_rtvec (4, pred
, flag
, op0
, op1
),
17927 aarch64_unspec_cond_code (code
));
17928 emit_set_insn (target
, unspec
);
17931 /* Emit the SVE equivalent of:
17933 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
17934 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
17935 (set TARGET (ior:PRED_MODE TMP1 TMP2))
17937 where <Xi> is the operation associated with comparison CODEi.
17938 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
17941 aarch64_emit_sve_or_fp_conds (rtx target
, rtx_code code1
, rtx_code code2
,
17942 rtx pred
, bool known_ptrue_p
, rtx op0
, rtx op1
)
17944 machine_mode pred_mode
= GET_MODE (pred
);
17945 rtx tmp1
= gen_reg_rtx (pred_mode
);
17946 aarch64_emit_sve_fp_cond (tmp1
, code1
, pred
, known_ptrue_p
, op0
, op1
);
17947 rtx tmp2
= gen_reg_rtx (pred_mode
);
17948 aarch64_emit_sve_fp_cond (tmp2
, code2
, pred
, known_ptrue_p
, op0
, op1
);
17949 aarch64_emit_binop (target
, ior_optab
, tmp1
, tmp2
);
17952 /* Emit the SVE equivalent of:
17954 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
17955 (set TARGET (not TMP))
17957 where <X> is the operation associated with comparison CODE.
17958 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
17961 aarch64_emit_sve_invert_fp_cond (rtx target
, rtx_code code
, rtx pred
,
17962 bool known_ptrue_p
, rtx op0
, rtx op1
)
17964 machine_mode pred_mode
= GET_MODE (pred
);
17965 rtx tmp
= gen_reg_rtx (pred_mode
);
17966 aarch64_emit_sve_fp_cond (tmp
, code
, pred
, known_ptrue_p
, op0
, op1
);
17967 aarch64_emit_unop (target
, one_cmpl_optab
, tmp
);
17970 /* Expand an SVE floating-point comparison using the SVE equivalent of:
17972 (set TARGET (CODE OP0 OP1))
17974 If CAN_INVERT_P is true, the caller can also handle inverted results;
17975 return true if the result is in fact inverted. */
17978 aarch64_expand_sve_vec_cmp_float (rtx target
, rtx_code code
,
17979 rtx op0
, rtx op1
, bool can_invert_p
)
17981 machine_mode pred_mode
= GET_MODE (target
);
17982 machine_mode data_mode
= GET_MODE (op0
);
17984 rtx ptrue
= aarch64_ptrue_reg (pred_mode
);
17988 /* UNORDERED has no immediate form. */
17989 op1
= force_reg (data_mode
, op1
);
17998 /* There is native support for the comparison. */
17999 aarch64_emit_sve_fp_cond (target
, code
, ptrue
, true, op0
, op1
);
18004 /* This is a trapping operation (LT or GT). */
18005 aarch64_emit_sve_or_fp_conds (target
, LT
, GT
, ptrue
, true, op0
, op1
);
18009 if (!flag_trapping_math
)
18011 /* This would trap for signaling NaNs. */
18012 op1
= force_reg (data_mode
, op1
);
18013 aarch64_emit_sve_or_fp_conds (target
, UNORDERED
, EQ
,
18014 ptrue
, true, op0
, op1
);
18022 if (flag_trapping_math
)
18024 /* Work out which elements are ordered. */
18025 rtx ordered
= gen_reg_rtx (pred_mode
);
18026 op1
= force_reg (data_mode
, op1
);
18027 aarch64_emit_sve_invert_fp_cond (ordered
, UNORDERED
,
18028 ptrue
, true, op0
, op1
);
18030 /* Test the opposite condition for the ordered elements,
18031 then invert the result. */
18035 code
= reverse_condition_maybe_unordered (code
);
18038 aarch64_emit_sve_fp_cond (target
, code
,
18039 ordered
, false, op0
, op1
);
18042 aarch64_emit_sve_invert_fp_cond (target
, code
,
18043 ordered
, false, op0
, op1
);
18049 /* ORDERED has no immediate form. */
18050 op1
= force_reg (data_mode
, op1
);
18054 gcc_unreachable ();
18057 /* There is native support for the inverse comparison. */
18058 code
= reverse_condition_maybe_unordered (code
);
18061 aarch64_emit_sve_fp_cond (target
, code
, ptrue
, true, op0
, op1
);
18064 aarch64_emit_sve_invert_fp_cond (target
, code
, ptrue
, true, op0
, op1
);
18068 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
18069 of the data being selected and CMP_MODE is the mode of the values being
18073 aarch64_expand_sve_vcond (machine_mode data_mode
, machine_mode cmp_mode
,
18076 machine_mode pred_mode
18077 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode
),
18078 GET_MODE_SIZE (cmp_mode
)).require ();
18079 rtx pred
= gen_reg_rtx (pred_mode
);
18080 if (FLOAT_MODE_P (cmp_mode
))
18082 if (aarch64_expand_sve_vec_cmp_float (pred
, GET_CODE (ops
[3]),
18083 ops
[4], ops
[5], true))
18084 std::swap (ops
[1], ops
[2]);
18087 aarch64_expand_sve_vec_cmp_int (pred
, GET_CODE (ops
[3]), ops
[4], ops
[5]);
18089 rtvec vec
= gen_rtvec (3, pred
, ops
[1], ops
[2]);
18090 emit_set_insn (ops
[0], gen_rtx_UNSPEC (data_mode
, vec
, UNSPEC_SEL
));
18093 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
18094 true. However due to issues with register allocation it is preferable
18095 to avoid tieing integer scalar and FP scalar modes. Executing integer
18096 operations in general registers is better than treating them as scalar
18097 vector operations. This reduces latency and avoids redundant int<->FP
18098 moves. So tie modes if they are either the same class, or vector modes
18099 with other vector modes, vector structs or any scalar mode. */
18102 aarch64_modes_tieable_p (machine_mode mode1
, machine_mode mode2
)
18104 if (GET_MODE_CLASS (mode1
) == GET_MODE_CLASS (mode2
))
18107 /* We specifically want to allow elements of "structure" modes to
18108 be tieable to the structure. This more general condition allows
18109 other rarer situations too. The reason we don't extend this to
18110 predicate modes is that there are no predicate structure modes
18111 nor any specific instructions for extracting part of a predicate
18113 if (aarch64_vector_data_mode_p (mode1
)
18114 && aarch64_vector_data_mode_p (mode2
))
18117 /* Also allow any scalar modes with vectors. */
18118 if (aarch64_vector_mode_supported_p (mode1
)
18119 || aarch64_vector_mode_supported_p (mode2
))
18125 /* Return a new RTX holding the result of moving POINTER forward by
18129 aarch64_move_pointer (rtx pointer
, poly_int64 amount
)
18131 rtx next
= plus_constant (Pmode
, XEXP (pointer
, 0), amount
);
18133 return adjust_automodify_address (pointer
, GET_MODE (pointer
),
18137 /* Return a new RTX holding the result of moving POINTER forward by the
18138 size of the mode it points to. */
18141 aarch64_progress_pointer (rtx pointer
)
18143 return aarch64_move_pointer (pointer
, GET_MODE_SIZE (GET_MODE (pointer
)));
18146 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
18150 aarch64_copy_one_block_and_progress_pointers (rtx
*src
, rtx
*dst
,
18153 rtx reg
= gen_reg_rtx (mode
);
18155 /* "Cast" the pointers to the correct mode. */
18156 *src
= adjust_address (*src
, mode
, 0);
18157 *dst
= adjust_address (*dst
, mode
, 0);
18158 /* Emit the memcpy. */
18159 emit_move_insn (reg
, *src
);
18160 emit_move_insn (*dst
, reg
);
18161 /* Move the pointers forward. */
18162 *src
= aarch64_progress_pointer (*src
);
18163 *dst
= aarch64_progress_pointer (*dst
);
18166 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
18167 we succeed, otherwise return false. */
18170 aarch64_expand_cpymem (rtx
*operands
)
18173 rtx dst
= operands
[0];
18174 rtx src
= operands
[1];
18176 machine_mode cur_mode
= BLKmode
, next_mode
;
18177 bool speed_p
= !optimize_function_for_size_p (cfun
);
18179 /* When optimizing for size, give a better estimate of the length of a
18180 memcpy call, but use the default otherwise. Moves larger than 8 bytes
18181 will always require an even number of instructions to do now. And each
18182 operation requires both a load+store, so devide the max number by 2. */
18183 int max_num_moves
= (speed_p
? 16 : AARCH64_CALL_RATIO
) / 2;
18185 /* We can't do anything smart if the amount to copy is not constant. */
18186 if (!CONST_INT_P (operands
[2]))
18189 n
= INTVAL (operands
[2]);
18191 /* Try to keep the number of instructions low. For all cases we will do at
18192 most two moves for the residual amount, since we'll always overlap the
18194 if (((n
/ 16) + (n
% 16 ? 2 : 0)) > max_num_moves
)
18197 base
= copy_to_mode_reg (Pmode
, XEXP (dst
, 0));
18198 dst
= adjust_automodify_address (dst
, VOIDmode
, base
, 0);
18200 base
= copy_to_mode_reg (Pmode
, XEXP (src
, 0));
18201 src
= adjust_automodify_address (src
, VOIDmode
, base
, 0);
18203 /* Convert n to bits to make the rest of the code simpler. */
18204 n
= n
* BITS_PER_UNIT
;
18206 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
18207 larger than TImode, but we should not use them for loads/stores here. */
18208 const int copy_limit
= GET_MODE_BITSIZE (TImode
);
18212 /* Find the largest mode in which to do the copy in without over reading
18214 opt_scalar_int_mode mode_iter
;
18215 FOR_EACH_MODE_IN_CLASS (mode_iter
, MODE_INT
)
18216 if (GET_MODE_BITSIZE (mode_iter
.require ()) <= MIN (n
, copy_limit
))
18217 cur_mode
= mode_iter
.require ();
18219 gcc_assert (cur_mode
!= BLKmode
);
18221 mode_bits
= GET_MODE_BITSIZE (cur_mode
).to_constant ();
18222 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, cur_mode
);
18226 /* Do certain trailing copies as overlapping if it's going to be
18227 cheaper. i.e. less instructions to do so. For instance doing a 15
18228 byte copy it's more efficient to do two overlapping 8 byte copies than
18230 if (n
> 0 && n
<= 8 * BITS_PER_UNIT
)
18232 next_mode
= smallest_mode_for_size (n
, MODE_INT
);
18233 int n_bits
= GET_MODE_BITSIZE (next_mode
).to_constant ();
18234 src
= aarch64_move_pointer (src
, (n
- n_bits
) / BITS_PER_UNIT
);
18235 dst
= aarch64_move_pointer (dst
, (n
- n_bits
) / BITS_PER_UNIT
);
18243 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
18244 SImode stores. Handle the case when the constant has identical
18245 bottom and top halves. This is beneficial when the two stores can be
18246 merged into an STP and we avoid synthesising potentially expensive
18247 immediates twice. Return true if such a split is possible. */
18250 aarch64_split_dimode_const_store (rtx dst
, rtx src
)
18252 rtx lo
= gen_lowpart (SImode
, src
);
18253 rtx hi
= gen_highpart_mode (SImode
, DImode
, src
);
18255 bool size_p
= optimize_function_for_size_p (cfun
);
18257 if (!rtx_equal_p (lo
, hi
))
18260 unsigned int orig_cost
18261 = aarch64_internal_mov_immediate (NULL_RTX
, src
, false, DImode
);
18262 unsigned int lo_cost
18263 = aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, SImode
);
18265 /* We want to transform:
18267 MOVK x1, 0x140, lsl 16
18268 MOVK x1, 0xc0da, lsl 32
18269 MOVK x1, 0x140, lsl 48
18273 MOVK w1, 0x140, lsl 16
18275 So we want to perform this only when we save two instructions
18276 or more. When optimizing for size, however, accept any code size
18278 if (size_p
&& orig_cost
<= lo_cost
)
18282 && (orig_cost
<= lo_cost
+ 1))
18285 rtx mem_lo
= adjust_address (dst
, SImode
, 0);
18286 if (!aarch64_mem_pair_operand (mem_lo
, SImode
))
18289 rtx tmp_reg
= gen_reg_rtx (SImode
);
18290 aarch64_expand_mov_immediate (tmp_reg
, lo
);
18291 rtx mem_hi
= aarch64_move_pointer (mem_lo
, GET_MODE_SIZE (SImode
));
18292 /* Don't emit an explicit store pair as this may not be always profitable.
18293 Let the sched-fusion logic decide whether to merge them. */
18294 emit_move_insn (mem_lo
, tmp_reg
);
18295 emit_move_insn (mem_hi
, tmp_reg
);
18300 /* Generate RTL for a conditional branch with rtx comparison CODE in
18301 mode CC_MODE. The destination of the unlikely conditional branch
18305 aarch64_gen_unlikely_cbranch (enum rtx_code code
, machine_mode cc_mode
,
18309 x
= gen_rtx_fmt_ee (code
, VOIDmode
,
18310 gen_rtx_REG (cc_mode
, CC_REGNUM
),
18313 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
18314 gen_rtx_LABEL_REF (VOIDmode
, label_ref
),
18316 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
18319 /* Generate DImode scratch registers for 128-bit (TImode) addition.
18321 OP1 represents the TImode destination operand 1
18322 OP2 represents the TImode destination operand 2
18323 LOW_DEST represents the low half (DImode) of TImode operand 0
18324 LOW_IN1 represents the low half (DImode) of TImode operand 1
18325 LOW_IN2 represents the low half (DImode) of TImode operand 2
18326 HIGH_DEST represents the high half (DImode) of TImode operand 0
18327 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18328 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18331 aarch64_addti_scratch_regs (rtx op1
, rtx op2
, rtx
*low_dest
,
18332 rtx
*low_in1
, rtx
*low_in2
,
18333 rtx
*high_dest
, rtx
*high_in1
,
18336 *low_dest
= gen_reg_rtx (DImode
);
18337 *low_in1
= gen_lowpart (DImode
, op1
);
18338 *low_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
18339 subreg_lowpart_offset (DImode
, TImode
));
18340 *high_dest
= gen_reg_rtx (DImode
);
18341 *high_in1
= gen_highpart (DImode
, op1
);
18342 *high_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
18343 subreg_highpart_offset (DImode
, TImode
));
18346 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
18348 This function differs from 'arch64_addti_scratch_regs' in that
18349 OP1 can be an immediate constant (zero). We must call
18350 subreg_highpart_offset with DImode and TImode arguments, otherwise
18351 VOIDmode will be used for the const_int which generates an internal
18352 error from subreg_size_highpart_offset which does not expect a size of zero.
18354 OP1 represents the TImode destination operand 1
18355 OP2 represents the TImode destination operand 2
18356 LOW_DEST represents the low half (DImode) of TImode operand 0
18357 LOW_IN1 represents the low half (DImode) of TImode operand 1
18358 LOW_IN2 represents the low half (DImode) of TImode operand 2
18359 HIGH_DEST represents the high half (DImode) of TImode operand 0
18360 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18361 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18365 aarch64_subvti_scratch_regs (rtx op1
, rtx op2
, rtx
*low_dest
,
18366 rtx
*low_in1
, rtx
*low_in2
,
18367 rtx
*high_dest
, rtx
*high_in1
,
18370 *low_dest
= gen_reg_rtx (DImode
);
18371 *low_in1
= simplify_gen_subreg (DImode
, op1
, TImode
,
18372 subreg_lowpart_offset (DImode
, TImode
));
18374 *low_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
18375 subreg_lowpart_offset (DImode
, TImode
));
18376 *high_dest
= gen_reg_rtx (DImode
);
18378 *high_in1
= simplify_gen_subreg (DImode
, op1
, TImode
,
18379 subreg_highpart_offset (DImode
, TImode
));
18380 *high_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
18381 subreg_highpart_offset (DImode
, TImode
));
18384 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
18386 OP0 represents the TImode destination operand 0
18387 LOW_DEST represents the low half (DImode) of TImode operand 0
18388 LOW_IN1 represents the low half (DImode) of TImode operand 1
18389 LOW_IN2 represents the low half (DImode) of TImode operand 2
18390 HIGH_DEST represents the high half (DImode) of TImode operand 0
18391 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18392 HIGH_IN2 represents the high half (DImode) of TImode operand 2
18393 UNSIGNED_P is true if the operation is being performed on unsigned
18396 aarch64_expand_subvti (rtx op0
, rtx low_dest
, rtx low_in1
,
18397 rtx low_in2
, rtx high_dest
, rtx high_in1
,
18398 rtx high_in2
, bool unsigned_p
)
18400 if (low_in2
== const0_rtx
)
18402 low_dest
= low_in1
;
18403 high_in2
= force_reg (DImode
, high_in2
);
18405 emit_insn (gen_subdi3_compare1 (high_dest
, high_in1
, high_in2
));
18407 emit_insn (gen_subvdi_insn (high_dest
, high_in1
, high_in2
));
18411 if (CONST_INT_P (low_in2
))
18413 high_in2
= force_reg (DImode
, high_in2
);
18414 emit_insn (gen_subdi3_compare1_imm (low_dest
, low_in1
, low_in2
,
18415 GEN_INT (-INTVAL (low_in2
))));
18418 emit_insn (gen_subdi3_compare1 (low_dest
, low_in1
, low_in2
));
18421 emit_insn (gen_usubdi3_carryinC (high_dest
, high_in1
, high_in2
));
18423 emit_insn (gen_subdi3_carryinV (high_dest
, high_in1
, high_in2
));
18426 emit_move_insn (gen_lowpart (DImode
, op0
), low_dest
);
18427 emit_move_insn (gen_highpart (DImode
, op0
), high_dest
);
18431 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
18433 static unsigned HOST_WIDE_INT
18434 aarch64_asan_shadow_offset (void)
18437 return (HOST_WIDE_INT_1
<< 29);
18439 return (HOST_WIDE_INT_1
<< 36);
18443 aarch64_gen_ccmp_first (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
,
18444 int code
, tree treeop0
, tree treeop1
)
18446 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
18448 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
18450 struct expand_operand ops
[4];
18453 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
18455 op_mode
= GET_MODE (op0
);
18456 if (op_mode
== VOIDmode
)
18457 op_mode
= GET_MODE (op1
);
18465 icode
= CODE_FOR_cmpsi
;
18470 icode
= CODE_FOR_cmpdi
;
18475 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
18476 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpesf
: CODE_FOR_fcmpsf
;
18481 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
18482 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpedf
: CODE_FOR_fcmpdf
;
18490 op0
= prepare_operand (icode
, op0
, 0, op_mode
, cmp_mode
, unsignedp
);
18491 op1
= prepare_operand (icode
, op1
, 1, op_mode
, cmp_mode
, unsignedp
);
18497 *prep_seq
= get_insns ();
18500 create_fixed_operand (&ops
[0], op0
);
18501 create_fixed_operand (&ops
[1], op1
);
18504 if (!maybe_expand_insn (icode
, 2, ops
))
18509 *gen_seq
= get_insns ();
18512 return gen_rtx_fmt_ee ((rtx_code
) code
, cc_mode
,
18513 gen_rtx_REG (cc_mode
, CC_REGNUM
), const0_rtx
);
18517 aarch64_gen_ccmp_next (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
, rtx prev
,
18518 int cmp_code
, tree treeop0
, tree treeop1
, int bit_code
)
18520 rtx op0
, op1
, target
;
18521 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
18522 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
18524 struct expand_operand ops
[6];
18527 push_to_sequence (*prep_seq
);
18528 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
18530 op_mode
= GET_MODE (op0
);
18531 if (op_mode
== VOIDmode
)
18532 op_mode
= GET_MODE (op1
);
18540 icode
= CODE_FOR_ccmpsi
;
18545 icode
= CODE_FOR_ccmpdi
;
18550 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
18551 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpesf
: CODE_FOR_fccmpsf
;
18556 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
18557 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpedf
: CODE_FOR_fccmpdf
;
18565 op0
= prepare_operand (icode
, op0
, 2, op_mode
, cmp_mode
, unsignedp
);
18566 op1
= prepare_operand (icode
, op1
, 3, op_mode
, cmp_mode
, unsignedp
);
18572 *prep_seq
= get_insns ();
18575 target
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
18576 aarch64_cond
= aarch64_get_condition_code_1 (cc_mode
, (rtx_code
) cmp_code
);
18578 if (bit_code
!= AND
)
18580 prev
= gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev
),
18581 GET_MODE (XEXP (prev
, 0))),
18582 VOIDmode
, XEXP (prev
, 0), const0_rtx
);
18583 aarch64_cond
= AARCH64_INVERSE_CONDITION_CODE (aarch64_cond
);
18586 create_fixed_operand (&ops
[0], XEXP (prev
, 0));
18587 create_fixed_operand (&ops
[1], target
);
18588 create_fixed_operand (&ops
[2], op0
);
18589 create_fixed_operand (&ops
[3], op1
);
18590 create_fixed_operand (&ops
[4], prev
);
18591 create_fixed_operand (&ops
[5], GEN_INT (aarch64_cond
));
18593 push_to_sequence (*gen_seq
);
18594 if (!maybe_expand_insn (icode
, 6, ops
))
18600 *gen_seq
= get_insns ();
18603 return gen_rtx_fmt_ee ((rtx_code
) cmp_code
, VOIDmode
, target
, const0_rtx
);
18606 #undef TARGET_GEN_CCMP_FIRST
18607 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
18609 #undef TARGET_GEN_CCMP_NEXT
18610 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
18612 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
18613 instruction fusion of some sort. */
18616 aarch64_macro_fusion_p (void)
18618 return aarch64_tune_params
.fusible_ops
!= AARCH64_FUSE_NOTHING
;
18622 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
18623 should be kept together during scheduling. */
18626 aarch_macro_fusion_pair_p (rtx_insn
*prev
, rtx_insn
*curr
)
18629 rtx prev_set
= single_set (prev
);
18630 rtx curr_set
= single_set (curr
);
18631 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
18632 bool simple_sets_p
= prev_set
&& curr_set
&& !any_condjump_p (curr
);
18634 if (!aarch64_macro_fusion_p ())
18637 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK
))
18639 /* We are trying to match:
18640 prev (mov) == (set (reg r0) (const_int imm16))
18641 curr (movk) == (set (zero_extract (reg r0)
18644 (const_int imm16_1)) */
18646 set_dest
= SET_DEST (curr_set
);
18648 if (GET_CODE (set_dest
) == ZERO_EXTRACT
18649 && CONST_INT_P (SET_SRC (curr_set
))
18650 && CONST_INT_P (SET_SRC (prev_set
))
18651 && CONST_INT_P (XEXP (set_dest
, 2))
18652 && INTVAL (XEXP (set_dest
, 2)) == 16
18653 && REG_P (XEXP (set_dest
, 0))
18654 && REG_P (SET_DEST (prev_set
))
18655 && REGNO (XEXP (set_dest
, 0)) == REGNO (SET_DEST (prev_set
)))
18661 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD
))
18664 /* We're trying to match:
18665 prev (adrp) == (set (reg r1)
18666 (high (symbol_ref ("SYM"))))
18667 curr (add) == (set (reg r0)
18669 (symbol_ref ("SYM"))))
18670 Note that r0 need not necessarily be the same as r1, especially
18671 during pre-regalloc scheduling. */
18673 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
18674 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
18676 if (GET_CODE (SET_SRC (curr_set
)) == LO_SUM
18677 && REG_P (XEXP (SET_SRC (curr_set
), 0))
18678 && REGNO (XEXP (SET_SRC (curr_set
), 0))
18679 == REGNO (SET_DEST (prev_set
))
18680 && rtx_equal_p (XEXP (SET_SRC (prev_set
), 0),
18681 XEXP (SET_SRC (curr_set
), 1)))
18686 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK
))
18689 /* We're trying to match:
18690 prev (movk) == (set (zero_extract (reg r0)
18693 (const_int imm16_1))
18694 curr (movk) == (set (zero_extract (reg r0)
18697 (const_int imm16_2)) */
18699 if (GET_CODE (SET_DEST (prev_set
)) == ZERO_EXTRACT
18700 && GET_CODE (SET_DEST (curr_set
)) == ZERO_EXTRACT
18701 && REG_P (XEXP (SET_DEST (prev_set
), 0))
18702 && REG_P (XEXP (SET_DEST (curr_set
), 0))
18703 && REGNO (XEXP (SET_DEST (prev_set
), 0))
18704 == REGNO (XEXP (SET_DEST (curr_set
), 0))
18705 && CONST_INT_P (XEXP (SET_DEST (prev_set
), 2))
18706 && CONST_INT_P (XEXP (SET_DEST (curr_set
), 2))
18707 && INTVAL (XEXP (SET_DEST (prev_set
), 2)) == 32
18708 && INTVAL (XEXP (SET_DEST (curr_set
), 2)) == 48
18709 && CONST_INT_P (SET_SRC (prev_set
))
18710 && CONST_INT_P (SET_SRC (curr_set
)))
18714 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR
))
18716 /* We're trying to match:
18717 prev (adrp) == (set (reg r0)
18718 (high (symbol_ref ("SYM"))))
18719 curr (ldr) == (set (reg r1)
18720 (mem (lo_sum (reg r0)
18721 (symbol_ref ("SYM")))))
18723 curr (ldr) == (set (reg r1)
18726 (symbol_ref ("SYM")))))) */
18727 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
18728 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
18730 rtx curr_src
= SET_SRC (curr_set
);
18732 if (GET_CODE (curr_src
) == ZERO_EXTEND
)
18733 curr_src
= XEXP (curr_src
, 0);
18735 if (MEM_P (curr_src
) && GET_CODE (XEXP (curr_src
, 0)) == LO_SUM
18736 && REG_P (XEXP (XEXP (curr_src
, 0), 0))
18737 && REGNO (XEXP (XEXP (curr_src
, 0), 0))
18738 == REGNO (SET_DEST (prev_set
))
18739 && rtx_equal_p (XEXP (XEXP (curr_src
, 0), 1),
18740 XEXP (SET_SRC (prev_set
), 0)))
18745 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH
)
18746 && any_condjump_p (curr
))
18748 unsigned int condreg1
, condreg2
;
18750 aarch64_fixed_condition_code_regs (&condreg1
, &condreg2
);
18751 cc_reg_1
= gen_rtx_REG (CCmode
, condreg1
);
18753 if (reg_referenced_p (cc_reg_1
, PATTERN (curr
))
18755 && modified_in_p (cc_reg_1
, prev
))
18757 enum attr_type prev_type
= get_attr_type (prev
);
18759 /* FIXME: this misses some which is considered simple arthematic
18760 instructions for ThunderX. Simple shifts are missed here. */
18761 if (prev_type
== TYPE_ALUS_SREG
18762 || prev_type
== TYPE_ALUS_IMM
18763 || prev_type
== TYPE_LOGICS_REG
18764 || prev_type
== TYPE_LOGICS_IMM
)
18771 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH
)
18772 && any_condjump_p (curr
))
18774 /* We're trying to match:
18775 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
18776 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
18778 (label_ref ("SYM"))
18780 if (SET_DEST (curr_set
) == (pc_rtx
)
18781 && GET_CODE (SET_SRC (curr_set
)) == IF_THEN_ELSE
18782 && REG_P (XEXP (XEXP (SET_SRC (curr_set
), 0), 0))
18783 && REG_P (SET_DEST (prev_set
))
18784 && REGNO (SET_DEST (prev_set
))
18785 == REGNO (XEXP (XEXP (SET_SRC (curr_set
), 0), 0)))
18787 /* Fuse ALU operations followed by conditional branch instruction. */
18788 switch (get_attr_type (prev
))
18791 case TYPE_ALU_SREG
:
18794 case TYPE_ADCS_REG
:
18795 case TYPE_ADCS_IMM
:
18796 case TYPE_LOGIC_REG
:
18797 case TYPE_LOGIC_IMM
:
18801 case TYPE_SHIFT_REG
:
18802 case TYPE_SHIFT_IMM
:
18817 /* Return true iff the instruction fusion described by OP is enabled. */
18820 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op
)
18822 return (aarch64_tune_params
.fusible_ops
& op
) != 0;
18825 /* If MEM is in the form of [base+offset], extract the two parts
18826 of address and set to BASE and OFFSET, otherwise return false
18827 after clearing BASE and OFFSET. */
18830 extract_base_offset_in_addr (rtx mem
, rtx
*base
, rtx
*offset
)
18834 gcc_assert (MEM_P (mem
));
18836 addr
= XEXP (mem
, 0);
18841 *offset
= const0_rtx
;
18845 if (GET_CODE (addr
) == PLUS
18846 && REG_P (XEXP (addr
, 0)) && CONST_INT_P (XEXP (addr
, 1)))
18848 *base
= XEXP (addr
, 0);
18849 *offset
= XEXP (addr
, 1);
18854 *offset
= NULL_RTX
;
18859 /* Types for scheduling fusion. */
18860 enum sched_fusion_type
18862 SCHED_FUSION_NONE
= 0,
18863 SCHED_FUSION_LD_SIGN_EXTEND
,
18864 SCHED_FUSION_LD_ZERO_EXTEND
,
18870 /* If INSN is a load or store of address in the form of [base+offset],
18871 extract the two parts and set to BASE and OFFSET. Return scheduling
18872 fusion type this INSN is. */
18874 static enum sched_fusion_type
18875 fusion_load_store (rtx_insn
*insn
, rtx
*base
, rtx
*offset
)
18878 enum sched_fusion_type fusion
= SCHED_FUSION_LD
;
18880 gcc_assert (INSN_P (insn
));
18881 x
= PATTERN (insn
);
18882 if (GET_CODE (x
) != SET
)
18883 return SCHED_FUSION_NONE
;
18886 dest
= SET_DEST (x
);
18888 machine_mode dest_mode
= GET_MODE (dest
);
18890 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode
))
18891 return SCHED_FUSION_NONE
;
18893 if (GET_CODE (src
) == SIGN_EXTEND
)
18895 fusion
= SCHED_FUSION_LD_SIGN_EXTEND
;
18896 src
= XEXP (src
, 0);
18897 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
18898 return SCHED_FUSION_NONE
;
18900 else if (GET_CODE (src
) == ZERO_EXTEND
)
18902 fusion
= SCHED_FUSION_LD_ZERO_EXTEND
;
18903 src
= XEXP (src
, 0);
18904 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
18905 return SCHED_FUSION_NONE
;
18908 if (GET_CODE (src
) == MEM
&& REG_P (dest
))
18909 extract_base_offset_in_addr (src
, base
, offset
);
18910 else if (GET_CODE (dest
) == MEM
&& (REG_P (src
) || src
== const0_rtx
))
18912 fusion
= SCHED_FUSION_ST
;
18913 extract_base_offset_in_addr (dest
, base
, offset
);
18916 return SCHED_FUSION_NONE
;
18918 if (*base
== NULL_RTX
|| *offset
== NULL_RTX
)
18919 fusion
= SCHED_FUSION_NONE
;
18924 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
18926 Currently we only support to fuse ldr or str instructions, so FUSION_PRI
18927 and PRI are only calculated for these instructions. For other instruction,
18928 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
18929 type instruction fusion can be added by returning different priorities.
18931 It's important that irrelevant instructions get the largest FUSION_PRI. */
18934 aarch64_sched_fusion_priority (rtx_insn
*insn
, int max_pri
,
18935 int *fusion_pri
, int *pri
)
18939 enum sched_fusion_type fusion
;
18941 gcc_assert (INSN_P (insn
));
18944 fusion
= fusion_load_store (insn
, &base
, &offset
);
18945 if (fusion
== SCHED_FUSION_NONE
)
18952 /* Set FUSION_PRI according to fusion type and base register. */
18953 *fusion_pri
= tmp
- fusion
* FIRST_PSEUDO_REGISTER
- REGNO (base
);
18955 /* Calculate PRI. */
18958 /* INSN with smaller offset goes first. */
18959 off_val
= (int)(INTVAL (offset
));
18961 tmp
-= (off_val
& 0xfffff);
18963 tmp
+= ((- off_val
) & 0xfffff);
18969 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
18970 Adjust priority of sha1h instructions so they are scheduled before
18971 other SHA1 instructions. */
18974 aarch64_sched_adjust_priority (rtx_insn
*insn
, int priority
)
18976 rtx x
= PATTERN (insn
);
18978 if (GET_CODE (x
) == SET
)
18982 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_SHA1H
)
18983 return priority
+ 10;
18989 /* Given OPERANDS of consecutive load/store, check if we can merge
18990 them into ldp/stp. LOAD is true if they are load instructions.
18991 MODE is the mode of memory operands. */
18994 aarch64_operands_ok_for_ldpstp (rtx
*operands
, bool load
,
18997 HOST_WIDE_INT offval_1
, offval_2
, msize
;
18998 enum reg_class rclass_1
, rclass_2
;
18999 rtx mem_1
, mem_2
, reg_1
, reg_2
, base_1
, base_2
, offset_1
, offset_2
;
19003 mem_1
= operands
[1];
19004 mem_2
= operands
[3];
19005 reg_1
= operands
[0];
19006 reg_2
= operands
[2];
19007 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
));
19008 if (REGNO (reg_1
) == REGNO (reg_2
))
19013 mem_1
= operands
[0];
19014 mem_2
= operands
[2];
19015 reg_1
= operands
[1];
19016 reg_2
= operands
[3];
19019 /* The mems cannot be volatile. */
19020 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
))
19023 /* If we have SImode and slow unaligned ldp,
19024 check the alignment to be at least 8 byte. */
19026 && (aarch64_tune_params
.extra_tuning_flags
19027 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
19029 && MEM_ALIGN (mem_1
) < 8 * BITS_PER_UNIT
)
19032 /* Check if the addresses are in the form of [base+offset]. */
19033 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
19034 if (base_1
== NULL_RTX
|| offset_1
== NULL_RTX
)
19036 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
19037 if (base_2
== NULL_RTX
|| offset_2
== NULL_RTX
)
19040 /* Check if the bases are same. */
19041 if (!rtx_equal_p (base_1
, base_2
))
19044 /* The operands must be of the same size. */
19045 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1
)),
19046 GET_MODE_SIZE (GET_MODE (mem_2
))));
19048 offval_1
= INTVAL (offset_1
);
19049 offval_2
= INTVAL (offset_2
);
19050 /* We should only be trying this for fixed-sized modes. There is no
19051 SVE LDP/STP instruction. */
19052 msize
= GET_MODE_SIZE (mode
).to_constant ();
19053 /* Check if the offsets are consecutive. */
19054 if (offval_1
!= (offval_2
+ msize
) && offval_2
!= (offval_1
+ msize
))
19057 /* Check if the addresses are clobbered by load. */
19060 if (reg_mentioned_p (reg_1
, mem_1
))
19063 /* In increasing order, the last load can clobber the address. */
19064 if (offval_1
> offval_2
&& reg_mentioned_p (reg_2
, mem_2
))
19068 /* One of the memory accesses must be a mempair operand.
19069 If it is not the first one, they need to be swapped by the
19071 if (!aarch64_mem_pair_operand (mem_1
, GET_MODE (mem_1
))
19072 && !aarch64_mem_pair_operand (mem_2
, GET_MODE (mem_2
)))
19075 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
19076 rclass_1
= FP_REGS
;
19078 rclass_1
= GENERAL_REGS
;
19080 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
19081 rclass_2
= FP_REGS
;
19083 rclass_2
= GENERAL_REGS
;
19085 /* Check if the registers are of same class. */
19086 if (rclass_1
!= rclass_2
)
19092 /* Given OPERANDS of consecutive load/store that can be merged,
19093 swap them if they are not in ascending order. */
19095 aarch64_swap_ldrstr_operands (rtx
* operands
, bool load
)
19097 rtx mem_1
, mem_2
, base_1
, base_2
, offset_1
, offset_2
;
19098 HOST_WIDE_INT offval_1
, offval_2
;
19102 mem_1
= operands
[1];
19103 mem_2
= operands
[3];
19107 mem_1
= operands
[0];
19108 mem_2
= operands
[2];
19111 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
19112 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
19114 offval_1
= INTVAL (offset_1
);
19115 offval_2
= INTVAL (offset_2
);
19117 if (offval_1
> offval_2
)
19119 /* Irrespective of whether this is a load or a store,
19120 we do the same swap. */
19121 std::swap (operands
[0], operands
[2]);
19122 std::swap (operands
[1], operands
[3]);
19126 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
19127 comparison between the two. */
19129 aarch64_host_wide_int_compare (const void *x
, const void *y
)
19131 return wi::cmps (* ((const HOST_WIDE_INT
*) x
),
19132 * ((const HOST_WIDE_INT
*) y
));
19135 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
19136 other pointing to a REG rtx containing an offset, compare the offsets
19141 1 iff offset (X) > offset (Y)
19142 0 iff offset (X) == offset (Y)
19143 -1 iff offset (X) < offset (Y) */
19145 aarch64_ldrstr_offset_compare (const void *x
, const void *y
)
19147 const rtx
* operands_1
= (const rtx
*) x
;
19148 const rtx
* operands_2
= (const rtx
*) y
;
19149 rtx mem_1
, mem_2
, base
, offset_1
, offset_2
;
19151 if (MEM_P (operands_1
[0]))
19152 mem_1
= operands_1
[0];
19154 mem_1
= operands_1
[1];
19156 if (MEM_P (operands_2
[0]))
19157 mem_2
= operands_2
[0];
19159 mem_2
= operands_2
[1];
19161 /* Extract the offsets. */
19162 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
19163 extract_base_offset_in_addr (mem_2
, &base
, &offset_2
);
19165 gcc_assert (offset_1
!= NULL_RTX
&& offset_2
!= NULL_RTX
);
19167 return wi::cmps (INTVAL (offset_1
), INTVAL (offset_2
));
19170 /* Given OPERANDS of consecutive load/store, check if we can merge
19171 them into ldp/stp by adjusting the offset. LOAD is true if they
19172 are load instructions. MODE is the mode of memory operands.
19174 Given below consecutive stores:
19176 str w1, [xb, 0x100]
19177 str w1, [xb, 0x104]
19178 str w1, [xb, 0x108]
19179 str w1, [xb, 0x10c]
19181 Though the offsets are out of the range supported by stp, we can
19182 still pair them after adjusting the offset, like:
19184 add scratch, xb, 0x100
19185 stp w1, w1, [scratch]
19186 stp w1, w1, [scratch, 0x8]
19188 The peephole patterns detecting this opportunity should guarantee
19189 the scratch register is avaliable. */
19192 aarch64_operands_adjust_ok_for_ldpstp (rtx
*operands
, bool load
,
19195 const int num_insns
= 4;
19196 enum reg_class rclass
;
19197 HOST_WIDE_INT offvals
[num_insns
], msize
;
19198 rtx mem
[num_insns
], reg
[num_insns
], base
[num_insns
], offset
[num_insns
];
19202 for (int i
= 0; i
< num_insns
; i
++)
19204 reg
[i
] = operands
[2 * i
];
19205 mem
[i
] = operands
[2 * i
+ 1];
19207 gcc_assert (REG_P (reg
[i
]));
19210 /* Do not attempt to merge the loads if the loads clobber each other. */
19211 for (int i
= 0; i
< 8; i
+= 2)
19212 for (int j
= i
+ 2; j
< 8; j
+= 2)
19213 if (reg_overlap_mentioned_p (operands
[i
], operands
[j
]))
19217 for (int i
= 0; i
< num_insns
; i
++)
19219 mem
[i
] = operands
[2 * i
];
19220 reg
[i
] = operands
[2 * i
+ 1];
19223 /* Skip if memory operand is by itself valid for ldp/stp. */
19224 if (!MEM_P (mem
[0]) || aarch64_mem_pair_operand (mem
[0], mode
))
19227 for (int i
= 0; i
< num_insns
; i
++)
19229 /* The mems cannot be volatile. */
19230 if (MEM_VOLATILE_P (mem
[i
]))
19233 /* Check if the addresses are in the form of [base+offset]. */
19234 extract_base_offset_in_addr (mem
[i
], base
+ i
, offset
+ i
);
19235 if (base
[i
] == NULL_RTX
|| offset
[i
] == NULL_RTX
)
19239 /* Check if the registers are of same class. */
19240 rclass
= REG_P (reg
[0]) && FP_REGNUM_P (REGNO (reg
[0]))
19241 ? FP_REGS
: GENERAL_REGS
;
19243 for (int i
= 1; i
< num_insns
; i
++)
19244 if (REG_P (reg
[i
]) && FP_REGNUM_P (REGNO (reg
[i
])))
19246 if (rclass
!= FP_REGS
)
19251 if (rclass
!= GENERAL_REGS
)
19255 /* Only the last register in the order in which they occur
19256 may be clobbered by the load. */
19257 if (rclass
== GENERAL_REGS
&& load
)
19258 for (int i
= 0; i
< num_insns
- 1; i
++)
19259 if (reg_mentioned_p (reg
[i
], mem
[i
]))
19262 /* Check if the bases are same. */
19263 for (int i
= 0; i
< num_insns
- 1; i
++)
19264 if (!rtx_equal_p (base
[i
], base
[i
+ 1]))
19267 for (int i
= 0; i
< num_insns
; i
++)
19268 offvals
[i
] = INTVAL (offset
[i
]);
19270 msize
= GET_MODE_SIZE (mode
);
19272 /* Check if the offsets can be put in the right order to do a ldp/stp. */
19273 qsort (offvals
, num_insns
, sizeof (HOST_WIDE_INT
),
19274 aarch64_host_wide_int_compare
);
19276 if (!(offvals
[1] == offvals
[0] + msize
19277 && offvals
[3] == offvals
[2] + msize
))
19280 /* Check that offsets are within range of each other. The ldp/stp
19281 instructions have 7 bit immediate offsets, so use 0x80. */
19282 if (offvals
[2] - offvals
[0] >= msize
* 0x80)
19285 /* The offsets must be aligned with respect to each other. */
19286 if (offvals
[0] % msize
!= offvals
[2] % msize
)
19289 /* If we have SImode and slow unaligned ldp,
19290 check the alignment to be at least 8 byte. */
19292 && (aarch64_tune_params
.extra_tuning_flags
19293 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
19295 && MEM_ALIGN (mem
[0]) < 8 * BITS_PER_UNIT
)
19301 /* Given OPERANDS of consecutive load/store, this function pairs them
19302 into LDP/STP after adjusting the offset. It depends on the fact
19303 that the operands can be sorted so the offsets are correct for STP.
19304 MODE is the mode of memory operands. CODE is the rtl operator
19305 which should be applied to all memory operands, it's SIGN_EXTEND,
19306 ZERO_EXTEND or UNKNOWN. */
19309 aarch64_gen_adjusted_ldpstp (rtx
*operands
, bool load
,
19310 scalar_mode mode
, RTX_CODE code
)
19312 rtx base
, offset_1
, offset_3
, t1
, t2
;
19313 rtx mem_1
, mem_2
, mem_3
, mem_4
;
19314 rtx temp_operands
[8];
19315 HOST_WIDE_INT off_val_1
, off_val_3
, base_off
, new_off_1
, new_off_3
,
19316 stp_off_upper_limit
, stp_off_lower_limit
, msize
;
19318 /* We make changes on a copy as we may still bail out. */
19319 for (int i
= 0; i
< 8; i
++)
19320 temp_operands
[i
] = operands
[i
];
19322 /* Sort the operands. */
19323 qsort (temp_operands
, 4, 2 * sizeof (rtx
*), aarch64_ldrstr_offset_compare
);
19325 /* Copy the memory operands so that if we have to bail for some
19326 reason the original addresses are unchanged. */
19329 mem_1
= copy_rtx (temp_operands
[1]);
19330 mem_2
= copy_rtx (temp_operands
[3]);
19331 mem_3
= copy_rtx (temp_operands
[5]);
19332 mem_4
= copy_rtx (temp_operands
[7]);
19336 mem_1
= copy_rtx (temp_operands
[0]);
19337 mem_2
= copy_rtx (temp_operands
[2]);
19338 mem_3
= copy_rtx (temp_operands
[4]);
19339 mem_4
= copy_rtx (temp_operands
[6]);
19340 gcc_assert (code
== UNKNOWN
);
19343 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
19344 extract_base_offset_in_addr (mem_3
, &base
, &offset_3
);
19345 gcc_assert (base
!= NULL_RTX
&& offset_1
!= NULL_RTX
19346 && offset_3
!= NULL_RTX
);
19348 /* Adjust offset so it can fit in LDP/STP instruction. */
19349 msize
= GET_MODE_SIZE (mode
);
19350 stp_off_upper_limit
= msize
* (0x40 - 1);
19351 stp_off_lower_limit
= - msize
* 0x40;
19353 off_val_1
= INTVAL (offset_1
);
19354 off_val_3
= INTVAL (offset_3
);
19356 /* The base offset is optimally half way between the two STP/LDP offsets. */
19358 base_off
= (off_val_1
+ off_val_3
) / 2;
19360 /* However, due to issues with negative LDP/STP offset generation for
19361 larger modes, for DF, DI and vector modes. we must not use negative
19362 addresses smaller than 9 signed unadjusted bits can store. This
19363 provides the most range in this case. */
19364 base_off
= off_val_1
;
19366 /* Adjust the base so that it is aligned with the addresses but still
19368 if (base_off
% msize
!= off_val_1
% msize
)
19369 /* Fix the offset, bearing in mind we want to make it bigger not
19371 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
19372 else if (msize
<= 4)
19373 /* The negative range of LDP/STP is one larger than the positive range. */
19376 /* Check if base offset is too big or too small. We can attempt to resolve
19377 this issue by setting it to the maximum value and seeing if the offsets
19379 if (base_off
>= 0x1000)
19381 base_off
= 0x1000 - 1;
19382 /* We must still make sure that the base offset is aligned with respect
19383 to the address. But it may may not be made any bigger. */
19384 base_off
-= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
19387 /* Likewise for the case where the base is too small. */
19388 if (base_off
<= -0x1000)
19390 base_off
= -0x1000 + 1;
19391 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
19394 /* Offset of the first STP/LDP. */
19395 new_off_1
= off_val_1
- base_off
;
19397 /* Offset of the second STP/LDP. */
19398 new_off_3
= off_val_3
- base_off
;
19400 /* The offsets must be within the range of the LDP/STP instructions. */
19401 if (new_off_1
> stp_off_upper_limit
|| new_off_1
< stp_off_lower_limit
19402 || new_off_3
> stp_off_upper_limit
|| new_off_3
< stp_off_lower_limit
)
19405 replace_equiv_address_nv (mem_1
, plus_constant (Pmode
, operands
[8],
19407 replace_equiv_address_nv (mem_2
, plus_constant (Pmode
, operands
[8],
19408 new_off_1
+ msize
), true);
19409 replace_equiv_address_nv (mem_3
, plus_constant (Pmode
, operands
[8],
19411 replace_equiv_address_nv (mem_4
, plus_constant (Pmode
, operands
[8],
19412 new_off_3
+ msize
), true);
19414 if (!aarch64_mem_pair_operand (mem_1
, mode
)
19415 || !aarch64_mem_pair_operand (mem_3
, mode
))
19418 if (code
== ZERO_EXTEND
)
19420 mem_1
= gen_rtx_ZERO_EXTEND (DImode
, mem_1
);
19421 mem_2
= gen_rtx_ZERO_EXTEND (DImode
, mem_2
);
19422 mem_3
= gen_rtx_ZERO_EXTEND (DImode
, mem_3
);
19423 mem_4
= gen_rtx_ZERO_EXTEND (DImode
, mem_4
);
19425 else if (code
== SIGN_EXTEND
)
19427 mem_1
= gen_rtx_SIGN_EXTEND (DImode
, mem_1
);
19428 mem_2
= gen_rtx_SIGN_EXTEND (DImode
, mem_2
);
19429 mem_3
= gen_rtx_SIGN_EXTEND (DImode
, mem_3
);
19430 mem_4
= gen_rtx_SIGN_EXTEND (DImode
, mem_4
);
19435 operands
[0] = temp_operands
[0];
19436 operands
[1] = mem_1
;
19437 operands
[2] = temp_operands
[2];
19438 operands
[3] = mem_2
;
19439 operands
[4] = temp_operands
[4];
19440 operands
[5] = mem_3
;
19441 operands
[6] = temp_operands
[6];
19442 operands
[7] = mem_4
;
19446 operands
[0] = mem_1
;
19447 operands
[1] = temp_operands
[1];
19448 operands
[2] = mem_2
;
19449 operands
[3] = temp_operands
[3];
19450 operands
[4] = mem_3
;
19451 operands
[5] = temp_operands
[5];
19452 operands
[6] = mem_4
;
19453 operands
[7] = temp_operands
[7];
19456 /* Emit adjusting instruction. */
19457 emit_insn (gen_rtx_SET (operands
[8], plus_constant (DImode
, base
, base_off
)));
19458 /* Emit ldp/stp instructions. */
19459 t1
= gen_rtx_SET (operands
[0], operands
[1]);
19460 t2
= gen_rtx_SET (operands
[2], operands
[3]);
19461 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
19462 t1
= gen_rtx_SET (operands
[4], operands
[5]);
19463 t2
= gen_rtx_SET (operands
[6], operands
[7]);
19464 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
19468 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
19469 it isn't worth branching around empty masked ops (including masked
19473 aarch64_empty_mask_is_expensive (unsigned)
19478 /* Return 1 if pseudo register should be created and used to hold
19479 GOT address for PIC code. */
19482 aarch64_use_pseudo_pic_reg (void)
19484 return aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
;
19487 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
19490 aarch64_unspec_may_trap_p (const_rtx x
, unsigned flags
)
19492 switch (XINT (x
, 1))
19494 case UNSPEC_GOTSMALLPIC
:
19495 case UNSPEC_GOTSMALLPIC28K
:
19496 case UNSPEC_GOTTINYPIC
:
19502 return default_unspec_may_trap_p (x
, flags
);
19506 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
19507 return the log2 of that value. Otherwise return -1. */
19510 aarch64_fpconst_pow_of_2 (rtx x
)
19512 const REAL_VALUE_TYPE
*r
;
19514 if (!CONST_DOUBLE_P (x
))
19517 r
= CONST_DOUBLE_REAL_VALUE (x
);
19519 if (REAL_VALUE_NEGATIVE (*r
)
19520 || REAL_VALUE_ISNAN (*r
)
19521 || REAL_VALUE_ISINF (*r
)
19522 || !real_isinteger (r
, DFmode
))
19525 return exact_log2 (real_to_integer (r
));
19528 /* If X is a vector of equal CONST_DOUBLE values and that value is
19529 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
19532 aarch64_vec_fpconst_pow_of_2 (rtx x
)
19535 if (GET_CODE (x
) != CONST_VECTOR
19536 || !CONST_VECTOR_NUNITS (x
).is_constant (&nelts
))
19539 if (GET_MODE_CLASS (GET_MODE (x
)) != MODE_VECTOR_FLOAT
)
19542 int firstval
= aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, 0));
19546 for (int i
= 1; i
< nelts
; i
++)
19547 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, i
)) != firstval
)
19553 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
19556 __fp16 always promotes through this hook.
19557 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
19558 through the generic excess precision logic rather than here. */
19561 aarch64_promoted_type (const_tree t
)
19563 if (SCALAR_FLOAT_TYPE_P (t
)
19564 && TYPE_MAIN_VARIANT (t
) == aarch64_fp16_type_node
)
19565 return float_type_node
;
19570 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
19573 aarch64_optab_supported_p (int op
, machine_mode mode1
, machine_mode
,
19574 optimization_type opt_type
)
19579 return opt_type
== OPTIMIZE_FOR_SPEED
&& use_rsqrt_p (mode1
);
19586 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
19588 static unsigned int
19589 aarch64_dwarf_poly_indeterminate_value (unsigned int i
, unsigned int *factor
,
19592 /* Polynomial invariant 1 == (VG / 2) - 1. */
19593 gcc_assert (i
== 1);
19596 return AARCH64_DWARF_VG
;
19599 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
19600 if MODE is HFmode, and punt to the generic implementation otherwise. */
19603 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode
)
19605 return (mode
== HFmode
19607 : default_libgcc_floating_mode_supported_p (mode
));
19610 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
19611 if MODE is HFmode, and punt to the generic implementation otherwise. */
19614 aarch64_scalar_mode_supported_p (scalar_mode mode
)
19616 return (mode
== HFmode
19618 : default_scalar_mode_supported_p (mode
));
19621 /* Set the value of FLT_EVAL_METHOD.
19622 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
19624 0: evaluate all operations and constants, whose semantic type has at
19625 most the range and precision of type float, to the range and
19626 precision of float; evaluate all other operations and constants to
19627 the range and precision of the semantic type;
19629 N, where _FloatN is a supported interchange floating type
19630 evaluate all operations and constants, whose semantic type has at
19631 most the range and precision of _FloatN type, to the range and
19632 precision of the _FloatN type; evaluate all other operations and
19633 constants to the range and precision of the semantic type;
19635 If we have the ARMv8.2-A extensions then we support _Float16 in native
19636 precision, so we should set this to 16. Otherwise, we support the type,
19637 but want to evaluate expressions in float precision, so set this to
19640 static enum flt_eval_method
19641 aarch64_excess_precision (enum excess_precision_type type
)
19645 case EXCESS_PRECISION_TYPE_FAST
:
19646 case EXCESS_PRECISION_TYPE_STANDARD
:
19647 /* We can calculate either in 16-bit range and precision or
19648 32-bit range and precision. Make that decision based on whether
19649 we have native support for the ARMv8.2-A 16-bit floating-point
19650 instructions or not. */
19651 return (TARGET_FP_F16INST
19652 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
19653 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
);
19654 case EXCESS_PRECISION_TYPE_IMPLICIT
:
19655 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
;
19657 gcc_unreachable ();
19659 return FLT_EVAL_METHOD_UNPREDICTABLE
;
19662 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
19663 scheduled for speculative execution. Reject the long-running division
19664 and square-root instructions. */
19667 aarch64_sched_can_speculate_insn (rtx_insn
*insn
)
/* Dispatch on the scheduling "type" attribute of INSN.  */
19669 switch (get_attr_type (insn
))
/* The visible case labels all name Advanced SIMD floating-point
   square-root and division variants, which per the header comment are
   the rejected types.  NOTE(review): the scalar sqrt/div cases, the
   default case and the return statements are not visible in this
   extract — confirm against the full file.  */
19677 case TYPE_NEON_FP_SQRT_S
:
19678 case TYPE_NEON_FP_SQRT_D
:
19679 case TYPE_NEON_FP_SQRT_S_Q
:
19680 case TYPE_NEON_FP_SQRT_D_Q
:
19681 case TYPE_NEON_FP_DIV_S
:
19682 case TYPE_NEON_FP_DIV_D
:
19683 case TYPE_NEON_FP_DIV_S_Q
:
19684 case TYPE_NEON_FP_DIV_D_Q
:
19691 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
19694 aarch64_compute_pressure_classes (reg_class
*classes
)
19697 classes
[i
++] = GENERAL_REGS
;
19698 classes
[i
++] = FP_REGS
;
19699 /* PR_REGS isn't a useful pressure class because many predicate pseudo
19700 registers need to go in PR_LO_REGS at some point during their
19701 lifetime. Splitting it into two halves has the effect of making
19702 all predicates count against PR_LO_REGS, so that we try whenever
19703 possible to restrict the number of live predicates to 8. This
19704 greatly reduces the amount of spilling in certain loops. */
19705 classes
[i
++] = PR_LO_REGS
;
19706 classes
[i
++] = PR_HI_REGS
;
19710 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
19713 aarch64_can_change_mode_class (machine_mode from
,
19714 machine_mode to
, reg_class_t
)
19716 if (BYTES_BIG_ENDIAN
)
19718 bool from_sve_p
= aarch64_sve_data_mode_p (from
);
19719 bool to_sve_p
= aarch64_sve_data_mode_p (to
);
19721 /* Don't allow changes between SVE data modes and non-SVE modes.
19722 See the comment at the head of aarch64-sve.md for details. */
19723 if (from_sve_p
!= to_sve_p
)
19726 /* Don't allow changes in element size: lane 0 of the new vector
19727 would not then be lane 0 of the old vector. See the comment
19728 above aarch64_maybe_expand_sve_subreg_move for a more detailed
19731 In the worst case, this forces a register to be spilled in
19732 one mode and reloaded in the other, which handles the
19733 endianness correctly. */
19734 if (from_sve_p
&& GET_MODE_UNIT_SIZE (from
) != GET_MODE_UNIT_SIZE (to
))
19740 /* Implement TARGET_EARLY_REMAT_MODES. */
19743 aarch64_select_early_remat_modes (sbitmap modes
)
19745 /* SVE values are not normally live across a call, so it should be
19746 worth doing early rematerialization even in VL-specific mode. */
19747 for (int i
= 0; i
< NUM_MACHINE_MODES
; ++i
)
19749 machine_mode mode
= (machine_mode
) i
;
19750 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
19751 if (vec_flags
& VEC_ANY_SVE
)
19752 bitmap_set_bit (modes
, i
);
19756 /* Override the default target speculation_safe_value. */
/* Emits a despeculation barrier copy of VAL into RESULT, with FAILVAL as
   the value used on the speculative path.  NOTE(review): a guard that
   skips the first copy_to_mode_reg when VAL is already a register is not
   visible in this extract — confirm against the full file.  */
19758 aarch64_speculation_safe_value (machine_mode mode
,
19759 rtx result
, rtx val
, rtx failval
)
19761 /* Maybe we should warn if falling back to hard barriers. They are
19762 likely to be noticeably more expensive than the alternative below. */
19763 if (!aarch64_track_speculation
)
19764 return default_speculation_safe_value (mode
, result
, val
, failval
);
/* Force VAL into a register so the despeculation pattern can use it.  */
19767 val
= copy_to_mode_reg (mode
, val
);
/* FAILVAL must be a register or zero for the pattern's constraints.  */
19769 if (!aarch64_reg_or_zero (failval
, mode
))
19770 failval
= copy_to_mode_reg (mode
, failval
);
/* Emit the mode-parameterized despeculate_copy instruction.  */
19772 emit_insn (gen_despeculate_copy (mode
, result
, val
, failval
));
19776 /* Implement TARGET_ESTIMATED_POLY_VALUE.
19777 Look into the tuning structure for an estimate.
19778 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
19779 Advanced SIMD 128 bits. */
19781 static HOST_WIDE_INT
19782 aarch64_estimated_poly_value (poly_int64 val
)
19784 enum aarch64_sve_vector_bits_enum width_source
19785 = aarch64_tune_params
.sve_width
;
19787 /* If we still don't have an estimate, use the default. */
19788 if (width_source
== SVE_SCALABLE
)
19789 return default_estimated_poly_value (val
);
19791 HOST_WIDE_INT over_128
= width_source
- 128;
19792 return val
.coeffs
[0] + val
.coeffs
[1] * over_128
/ 128;
19796 /* Return true for types that could be supported as SIMD return or
19800 supported_simd_type (tree t
)
19802 if (SCALAR_FLOAT_TYPE_P (t
) || INTEGRAL_TYPE_P (t
) || POINTER_TYPE_P (t
))
19804 HOST_WIDE_INT s
= tree_to_shwi (TYPE_SIZE_UNIT (t
));
19805 return s
== 1 || s
== 2 || s
== 4 || s
== 8;
19810 /* Return true for types that currently are supported as SIMD return
19811 or argument types. */
19814 currently_supported_simd_type (tree t
, tree b
)
19816 if (COMPLEX_FLOAT_TYPE_P (t
))
19819 if (TYPE_SIZE (t
) != TYPE_SIZE (b
))
19822 return supported_simd_type (t
);
19825 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
/* Validates the return/argument types of a "simd" function and fills in
   CLONEI's vector size, simdlen and mangling for an Advanced SIMD clone.
   NOTE(review): the early "return 0" paths after each warning and the
   enclosing braces are not visible in this extract — confirm against the
   full file before editing.  */
19828 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node
*node
,
19829 struct cgraph_simd_clone
*clonei
,
19830 tree base_type
, int num
)
19832 tree t
, ret_type
, arg_type
;
19833 unsigned int elt_bits
, vec_bits
, count
;
/* Reject an explicit simdlen that is not a power of two in [2, 1024].  */
19838 if (clonei
->simdlen
19839 && (clonei
->simdlen
< 2
19840 || clonei
->simdlen
> 1024
19841 || (clonei
->simdlen
& (clonei
->simdlen
- 1)) != 0))
19843 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
19844 "unsupported simdlen %d", clonei
->simdlen
);
/* Check that the return type is one we can vectorize.  */
19848 ret_type
= TREE_TYPE (TREE_TYPE (node
->decl
));
19849 if (TREE_CODE (ret_type
) != VOID_TYPE
19850 && !currently_supported_simd_type (ret_type
, base_type
))
19852 if (TYPE_SIZE (ret_type
) != TYPE_SIZE (base_type
))
19853 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
19854 "GCC does not currently support mixed size types "
19855 "for %<simd%> functions");
19856 else if (supported_simd_type (ret_type
))
19857 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
19858 "GCC does not currently support return type %qT "
19859 "for %<simd%> functions", ret_type
);
19861 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
19862 "unsupported return type %qT for %<simd%> functions",
/* Likewise each parameter type.  */
19867 for (t
= DECL_ARGUMENTS (node
->decl
); t
; t
= DECL_CHAIN (t
))
19869 arg_type
= TREE_TYPE (t
);
19871 if (!currently_supported_simd_type (arg_type
, base_type
))
19873 if (TYPE_SIZE (arg_type
) != TYPE_SIZE (base_type
))
19874 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
19875 "GCC does not currently support mixed size types "
19876 "for %<simd%> functions");
19878 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
19879 "GCC does not currently support argument type %qT "
19880 "for %<simd%> functions", arg_type
);
/* Record the Advanced SIMD mangling letter and no mask mode.  */
19885 clonei
->vecsize_mangle
= 'n';
19886 clonei
->mask_mode
= VOIDmode
;
19887 elt_bits
= GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type
));
/* With no explicit simdlen, derive it from the vector width: 64 bits for
   the first clone variant (num == 0), 128 bits otherwise.  */
19888 if (clonei
->simdlen
== 0)
19891 vec_bits
= (num
== 0 ? 64 : 128);
19892 clonei
->simdlen
= vec_bits
/ elt_bits
;
/* With an explicit simdlen, the implied vector width must be exactly
   64 or 128 bits.  */
19897 vec_bits
= clonei
->simdlen
* elt_bits
;
19898 if (vec_bits
!= 64 && vec_bits
!= 128)
19900 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
19901 "GCC does not currently support simdlen %d for type %qT",
19902 clonei
->simdlen
, base_type
);
19906 clonei
->vecsize_int
= vec_bits
;
19907 clonei
->vecsize_float
= vec_bits
;
19911 /* Implement TARGET_SIMD_CLONE_ADJUST. */
19914 aarch64_simd_clone_adjust (struct cgraph_node
*node
)
19916 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
19917 use the correct ABI. */
19919 tree t
= TREE_TYPE (node
->decl
);
19920 TYPE_ATTRIBUTES (t
) = make_attribute ("aarch64_vector_pcs", "default",
19921 TYPE_ATTRIBUTES (t
));
19924 /* Implement TARGET_SIMD_CLONE_USABLE. */
/* Decides whether the SIMD clone NODE may be used, keyed off the clone's
   vector-size mangling letter.  NOTE(review): the case bodies (e.g. for
   the 'n' mangling set by the hook above) are not visible in this
   extract; only the unreachable default survives.  */
19927 aarch64_simd_clone_usable (struct cgraph_node
*node
)
19929 switch (node
->simdclone
->vecsize_mangle
)
19936 gcc_unreachable ();
19940 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
/* Treats two function types as incompatible when exactly one of them
   carries the "aarch64_vector_pcs" attribute.  NOTE(review): the return
   statements for the match/mismatch cases are not visible in this
   extract — confirm against the full file.  */
19943 aarch64_comp_type_attributes (const_tree type1
, const_tree type2
)
19945 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1
))
19946 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2
)))
19951 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
19953 static const char *
19954 aarch64_get_multilib_abi_name (void)
19956 if (TARGET_BIG_END
)
19957 return TARGET_ILP32
? "aarch64_be_ilp32" : "aarch64_be";
19958 return TARGET_ILP32
? "aarch64_ilp32" : "aarch64";
19961 /* Implement TARGET_STACK_PROTECT_GUARD. In case of a
19962 global variable based guard use the default else
19963 return a null tree. */
19965 aarch64_stack_protect_guard (void)
19967 if (aarch64_stack_protector_guard
== SSP_GLOBAL
)
19968 return default_stack_protect_guard ();
19973 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
19974 section at the end if needed. */
19975 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
19976 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
19977 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
/* NOTE(review): the header comment says the note is emitted "if needed";
   the guard that skips emission when feature_1_and is zero is not visible
   in this extract — confirm against the full file.  */
19979 aarch64_file_end_indicate_exec_stack ()
19981 file_end_indicate_exec_stack ();
/* Collect the feature bits (BTI and/or PAC) enabled for this unit.  */
19983 unsigned feature_1_and
= 0;
19984 if (aarch64_bti_enabled ())
19985 feature_1_and
|= GNU_PROPERTY_AARCH64_FEATURE_1_BTI
;
19987 if (aarch64_ra_sign_scope
!= AARCH64_FUNCTION_NONE
)
19988 feature_1_and
|= GNU_PROPERTY_AARCH64_FEATURE_1_PAC
;
19992 /* Generate .note.gnu.property section. */
19993 switch_to_section (get_section (".note.gnu.property",
19994 SECTION_NOTYPE
, NULL
));
19996 /* PT_NOTE header: namesz, descsz, type.
19997 namesz = 4 ("GNU\0")
19998 descsz = 16 (Size of the program property array)
19999 [(12 + padding) * Number of array elements]
20000 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
20001 assemble_align (POINTER_SIZE
);
20002 assemble_integer (GEN_INT (4), 4, 32, 1);
20003 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES
)), 4, 32, 1);
20004 assemble_integer (GEN_INT (5), 4, 32, 1);
20006 /* PT_NOTE name. */
20007 assemble_string ("GNU", 4);
20009 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
20010 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
20012 data = feature_1_and. */
20013 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND
), 4, 32, 1);
20014 assemble_integer (GEN_INT (4), 4, 32, 1);
20015 assemble_integer (GEN_INT (feature_1_and
), 4, 32, 1);
20017 /* Pad the size of the note to the required alignment. */
20018 assemble_align (POINTER_SIZE
);
20021 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
20022 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
20023 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
20025 /* Target-specific selftests. */
20029 namespace selftest
{
20031 /* Selftest for the RTL loader.
20032 Verify that the RTL loader copes with a dump from
20033 print_rtx_function. This is essentially just a test that class
20034 function_reader can handle a real dump, but it also verifies
20035 that lookup_reg_by_dump_name correctly handles hard regs.
20036 The presence of hard reg names in the dump means that the test is
20037 target-specific, hence it is in this file. */
20040 aarch64_test_loading_full_dump ()
/* Load the canned dump; the rtl_dump_test object installs it as cfun.  */
20042 rtl_dump_test
t (SELFTEST_LOCATION
, locate_file ("aarch64/times-two.rtl"));
20044 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun
->decl
)));
/* Spot-check two insns reconstructed from the dump by uid.  */
20046 rtx_insn
*insn_1
= get_insn_by_uid (1);
20047 ASSERT_EQ (NOTE
, GET_CODE (insn_1
));
20049 rtx_insn
*insn_15
= get_insn_by_uid (15);
20050 ASSERT_EQ (INSN
, GET_CODE (insn_15
));
20051 ASSERT_EQ (USE
, GET_CODE (PATTERN (insn_15
)));
20053 /* Verify crtl->return_rtx. */
20054 ASSERT_EQ (REG
, GET_CODE (crtl
->return_rtx
));
20055 ASSERT_EQ (0, REGNO (crtl
->return_rtx
));
20056 ASSERT_EQ (SImode
, GET_MODE (crtl
->return_rtx
));
/* Run all target-specific selftests.  */

static void
aarch64_run_selftests (void)
{
  aarch64_test_loading_full_dump ();
}
20067 } // namespace selftest
20069 #endif /* #if CHECKING_P */
20071 #undef TARGET_STACK_PROTECT_GUARD
20072 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
20074 #undef TARGET_ADDRESS_COST
20075 #define TARGET_ADDRESS_COST aarch64_address_cost
20077 /* This hook determines whether unnamed bitfields affect the alignment
20078 of the containing structure. The hook returns true if the structure
20079 should inherit the alignment requirements of an unnamed bitfield's
20081 #undef TARGET_ALIGN_ANON_BITFIELD
20082 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
20084 #undef TARGET_ASM_ALIGNED_DI_OP
20085 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
20087 #undef TARGET_ASM_ALIGNED_HI_OP
20088 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
20090 #undef TARGET_ASM_ALIGNED_SI_OP
20091 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
20093 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
20094 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
20095 hook_bool_const_tree_hwi_hwi_const_tree_true
20097 #undef TARGET_ASM_FILE_START
20098 #define TARGET_ASM_FILE_START aarch64_start_file
20100 #undef TARGET_ASM_OUTPUT_MI_THUNK
20101 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
20103 #undef TARGET_ASM_SELECT_RTX_SECTION
20104 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
20106 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
20107 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
20109 #undef TARGET_BUILD_BUILTIN_VA_LIST
20110 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
20112 #undef TARGET_CALLEE_COPIES
20113 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
20115 #undef TARGET_CAN_ELIMINATE
20116 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
20118 #undef TARGET_CAN_INLINE_P
20119 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
20121 #undef TARGET_CANNOT_FORCE_CONST_MEM
20122 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
20124 #undef TARGET_CASE_VALUES_THRESHOLD
20125 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
20127 #undef TARGET_CONDITIONAL_REGISTER_USAGE
20128 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
20130 /* Only the least significant bit is used for initialization guard
20132 #undef TARGET_CXX_GUARD_MASK_BIT
20133 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
20135 #undef TARGET_C_MODE_FOR_SUFFIX
20136 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
20138 #ifdef TARGET_BIG_ENDIAN_DEFAULT
20139 #undef TARGET_DEFAULT_TARGET_FLAGS
20140 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
20143 #undef TARGET_CLASS_MAX_NREGS
20144 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
20146 #undef TARGET_BUILTIN_DECL
20147 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
20149 #undef TARGET_BUILTIN_RECIPROCAL
20150 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
20152 #undef TARGET_C_EXCESS_PRECISION
20153 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
20155 #undef TARGET_EXPAND_BUILTIN
20156 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
20158 #undef TARGET_EXPAND_BUILTIN_VA_START
20159 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
20161 #undef TARGET_FOLD_BUILTIN
20162 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
20164 #undef TARGET_FUNCTION_ARG
20165 #define TARGET_FUNCTION_ARG aarch64_function_arg
20167 #undef TARGET_FUNCTION_ARG_ADVANCE
20168 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
20170 #undef TARGET_FUNCTION_ARG_BOUNDARY
20171 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
20173 #undef TARGET_FUNCTION_ARG_PADDING
20174 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
20176 #undef TARGET_GET_RAW_RESULT_MODE
20177 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
20178 #undef TARGET_GET_RAW_ARG_MODE
20179 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
20181 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
20182 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
20184 #undef TARGET_FUNCTION_VALUE
20185 #define TARGET_FUNCTION_VALUE aarch64_function_value
20187 #undef TARGET_FUNCTION_VALUE_REGNO_P
20188 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
20190 #undef TARGET_GIMPLE_FOLD_BUILTIN
20191 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
20193 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
20194 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
20196 #undef TARGET_INIT_BUILTINS
20197 #define TARGET_INIT_BUILTINS aarch64_init_builtins
20199 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
20200 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
20201 aarch64_ira_change_pseudo_allocno_class
20203 #undef TARGET_LEGITIMATE_ADDRESS_P
20204 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
20206 #undef TARGET_LEGITIMATE_CONSTANT_P
20207 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
20209 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
20210 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
20211 aarch64_legitimize_address_displacement
20213 #undef TARGET_LIBGCC_CMP_RETURN_MODE
20214 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
20216 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
20217 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
20218 aarch64_libgcc_floating_mode_supported_p
20220 #undef TARGET_MANGLE_TYPE
20221 #define TARGET_MANGLE_TYPE aarch64_mangle_type
20223 #undef TARGET_MEMORY_MOVE_COST
20224 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
20226 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
20227 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
20229 #undef TARGET_MUST_PASS_IN_STACK
20230 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
20232 /* This target hook should return true if accesses to volatile bitfields
20233 should use the narrowest mode possible. It should return false if these
20234 accesses should use the bitfield container type. */
20235 #undef TARGET_NARROW_VOLATILE_BITFIELD
20236 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
20238 #undef TARGET_OPTION_OVERRIDE
20239 #define TARGET_OPTION_OVERRIDE aarch64_override_options
20241 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
20242 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
20243 aarch64_override_options_after_change
20245 #undef TARGET_OPTION_SAVE
20246 #define TARGET_OPTION_SAVE aarch64_option_save
20248 #undef TARGET_OPTION_RESTORE
20249 #define TARGET_OPTION_RESTORE aarch64_option_restore
20251 #undef TARGET_OPTION_PRINT
20252 #define TARGET_OPTION_PRINT aarch64_option_print
20254 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
20255 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
20257 #undef TARGET_SET_CURRENT_FUNCTION
20258 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
20260 #undef TARGET_PASS_BY_REFERENCE
20261 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
20263 #undef TARGET_PREFERRED_RELOAD_CLASS
20264 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
20266 #undef TARGET_SCHED_REASSOCIATION_WIDTH
20267 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
20269 #undef TARGET_PROMOTED_TYPE
20270 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
20272 #undef TARGET_SECONDARY_RELOAD
20273 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
20275 #undef TARGET_SHIFT_TRUNCATION_MASK
20276 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
20278 #undef TARGET_SETUP_INCOMING_VARARGS
20279 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
20281 #undef TARGET_STRUCT_VALUE_RTX
20282 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
20284 #undef TARGET_REGISTER_MOVE_COST
20285 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
20287 #undef TARGET_RETURN_IN_MEMORY
20288 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
20290 #undef TARGET_RETURN_IN_MSB
20291 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
20293 #undef TARGET_RTX_COSTS
20294 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
20296 #undef TARGET_SCALAR_MODE_SUPPORTED_P
20297 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
20299 #undef TARGET_SCHED_ISSUE_RATE
20300 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
20302 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
20303 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
20304 aarch64_sched_first_cycle_multipass_dfa_lookahead
20306 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
20307 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
20308 aarch64_first_cycle_multipass_dfa_lookahead_guard
20310 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
20311 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
20312 aarch64_get_separate_components
20314 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
20315 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
20316 aarch64_components_for_bb
20318 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
20319 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
20320 aarch64_disqualify_components
20322 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
20323 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
20324 aarch64_emit_prologue_components
20326 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
20327 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
20328 aarch64_emit_epilogue_components
20330 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
20331 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
20332 aarch64_set_handled_components
20334 #undef TARGET_TRAMPOLINE_INIT
20335 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
20337 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
20338 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
20340 #undef TARGET_VECTOR_MODE_SUPPORTED_P
20341 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
20343 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
20344 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
20345 aarch64_builtin_support_vector_misalignment
20347 #undef TARGET_ARRAY_MODE
20348 #define TARGET_ARRAY_MODE aarch64_array_mode
20350 #undef TARGET_ARRAY_MODE_SUPPORTED_P
20351 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
20353 #undef TARGET_VECTORIZE_ADD_STMT_COST
20354 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
20356 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
20357 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
20358 aarch64_builtin_vectorization_cost
20360 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
20361 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
20363 #undef TARGET_VECTORIZE_BUILTINS
20364 #define TARGET_VECTORIZE_BUILTINS
20366 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
20367 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
20368 aarch64_builtin_vectorized_function
20370 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
20371 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
20372 aarch64_autovectorize_vector_sizes
20374 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
20375 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
20376 aarch64_atomic_assign_expand_fenv
20378 /* Section anchor support. */
20380 #undef TARGET_MIN_ANCHOR_OFFSET
20381 #define TARGET_MIN_ANCHOR_OFFSET -256
20383 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
20384 byte offset; we can do much more for larger data types, but have no way
20385 to determine the size of the access. We assume accesses are aligned. */
20386 #undef TARGET_MAX_ANCHOR_OFFSET
20387 #define TARGET_MAX_ANCHOR_OFFSET 4095
20389 #undef TARGET_VECTOR_ALIGNMENT
20390 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
20392 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
20393 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
20394 aarch64_vectorize_preferred_vector_alignment
20395 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
20396 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
20397 aarch64_simd_vector_alignment_reachable
20399 /* vec_perm support. */
20401 #undef TARGET_VECTORIZE_VEC_PERM_CONST
20402 #define TARGET_VECTORIZE_VEC_PERM_CONST \
20403 aarch64_vectorize_vec_perm_const
20405 #undef TARGET_VECTORIZE_GET_MASK_MODE
20406 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
20407 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
20408 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
20409 aarch64_empty_mask_is_expensive
20410 #undef TARGET_PREFERRED_ELSE_VALUE
20411 #define TARGET_PREFERRED_ELSE_VALUE \
20412 aarch64_preferred_else_value
20414 #undef TARGET_INIT_LIBFUNCS
20415 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
20417 #undef TARGET_FIXED_CONDITION_CODE_REGS
20418 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
20420 #undef TARGET_FLAGS_REGNUM
20421 #define TARGET_FLAGS_REGNUM CC_REGNUM
20423 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
20424 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
20426 #undef TARGET_ASAN_SHADOW_OFFSET
20427 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
20429 #undef TARGET_LEGITIMIZE_ADDRESS
20430 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
20432 #undef TARGET_SCHED_CAN_SPECULATE_INSN
20433 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
20435 #undef TARGET_CAN_USE_DOLOOP_P
20436 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
20438 #undef TARGET_SCHED_ADJUST_PRIORITY
20439 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
20441 #undef TARGET_SCHED_MACRO_FUSION_P
20442 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
20444 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
20445 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
20447 #undef TARGET_SCHED_FUSION_PRIORITY
20448 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
20450 #undef TARGET_UNSPEC_MAY_TRAP_P
20451 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
20453 #undef TARGET_USE_PSEUDO_PIC_REG
20454 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
20456 #undef TARGET_PRINT_OPERAND
20457 #define TARGET_PRINT_OPERAND aarch64_print_operand
20459 #undef TARGET_PRINT_OPERAND_ADDRESS
20460 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
20462 #undef TARGET_OPTAB_SUPPORTED_P
20463 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
20465 #undef TARGET_OMIT_STRUCT_RETURN_REG
20466 #define TARGET_OMIT_STRUCT_RETURN_REG true
20468 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
20469 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
20470 aarch64_dwarf_poly_indeterminate_value
20472 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
20473 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
20474 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
20476 #undef TARGET_HARD_REGNO_NREGS
20477 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
20478 #undef TARGET_HARD_REGNO_MODE_OK
20479 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
20481 #undef TARGET_MODES_TIEABLE_P
20482 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
20484 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
20485 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
20486 aarch64_hard_regno_call_part_clobbered
20488 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
20489 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
20490 aarch64_remove_extra_call_preserved_regs
20492 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
20493 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
20494 aarch64_return_call_with_max_clobbers
20496 #undef TARGET_CONSTANT_ALIGNMENT
20497 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
20499 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
20500 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
20501 aarch64_stack_clash_protection_alloca_probe_range
20503 #undef TARGET_COMPUTE_PRESSURE_CLASSES
20504 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
20506 #undef TARGET_CAN_CHANGE_MODE_CLASS
20507 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
20509 #undef TARGET_SELECT_EARLY_REMAT_MODES
20510 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
20512 #undef TARGET_SPECULATION_SAFE_VALUE
20513 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
20515 #undef TARGET_ESTIMATED_POLY_VALUE
20516 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
20518 #undef TARGET_ATTRIBUTE_TABLE
20519 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
20521 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
20522 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
20523 aarch64_simd_clone_compute_vecsize_and_simdlen
20525 #undef TARGET_SIMD_CLONE_ADJUST
20526 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
20528 #undef TARGET_SIMD_CLONE_USABLE
20529 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
20531 #undef TARGET_COMP_TYPE_ATTRIBUTES
20532 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
20534 #undef TARGET_GET_MULTILIB_ABI_NAME
20535 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
20538 #undef TARGET_RUN_TARGET_SELFTESTS
20539 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
20540 #endif /* #if CHECKING_P */
20542 #undef TARGET_ASM_POST_CFI_STARTPROC
20543 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
/* The AArch64 target hook vector, assembled by target-def.h from the
   TARGET_* macros defined above.  */
20545 struct gcc_target targetm
= TARGET_INITIALIZER
;
20547 #include "gt-aarch64.h"