1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
24 #define INCLUDE_STRING
26 #include "coretypes.h"
37 #include "stringpool.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
55 #include "langhooks.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
75 /* This file should be included last. */
76 #include "target-def.h"
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
81 /* Classifies an address.
84 A simple base register plus immediate offset.
87 A base register indexed by immediate offset with writeback.
90 A base register indexed by (optionally scaled) register.
93 A base register indexed by (optionally scaled) zero-extended register.
96 A base register indexed by (optionally scaled) sign-extended register.
99 A LO_SUM rtx with a base register and "LO12" symbol relocation.
102 A constant symbolic address, in pc-relative literal pool. */
104 enum aarch64_address_type
{
114 struct aarch64_address_info
{
115 enum aarch64_address_type type
;
118 poly_int64 const_offset
;
120 enum aarch64_symbol_type symbol_type
;
123 /* Information about a legitimate vector immediate operand. */
124 struct simd_immediate_info
126 enum insn_type
{ MOV
, MVN
};
127 enum modifier_type
{ LSL
, MSL
};
129 simd_immediate_info () {}
130 simd_immediate_info (scalar_float_mode
, rtx
);
131 simd_immediate_info (scalar_int_mode
, unsigned HOST_WIDE_INT
,
132 insn_type
= MOV
, modifier_type
= LSL
,
134 simd_immediate_info (scalar_mode
, rtx
, rtx
);
136 /* The mode of the elements. */
137 scalar_mode elt_mode
;
139 /* The value of each element if all elements are the same, or the
140 first value if the constant is a series. */
143 /* The value of the step if the constant is a series, null otherwise. */
146 /* The instruction to use to move the immediate into a vector. */
149 /* The kind of shift modifier to use, and the number of bits to shift.
150 This is (LSL, 0) if no shift is needed. */
151 modifier_type modifier
;
155 /* Construct a floating-point immediate in which each element has mode
156 ELT_MODE_IN and value VALUE_IN. */
157 inline simd_immediate_info
158 ::simd_immediate_info (scalar_float_mode elt_mode_in
, rtx value_in
)
159 : elt_mode (elt_mode_in
), value (value_in
), step (NULL_RTX
), insn (MOV
),
160 modifier (LSL
), shift (0)
163 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
164 and value VALUE_IN. The other parameters are as for the structure
166 inline simd_immediate_info
167 ::simd_immediate_info (scalar_int_mode elt_mode_in
,
168 unsigned HOST_WIDE_INT value_in
,
169 insn_type insn_in
, modifier_type modifier_in
,
170 unsigned int shift_in
)
171 : elt_mode (elt_mode_in
), value (gen_int_mode (value_in
, elt_mode_in
)),
172 step (NULL_RTX
), insn (insn_in
), modifier (modifier_in
), shift (shift_in
)
175 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
176 and where element I is equal to VALUE_IN + I * STEP_IN. */
177 inline simd_immediate_info
178 ::simd_immediate_info (scalar_mode elt_mode_in
, rtx value_in
, rtx step_in
)
179 : elt_mode (elt_mode_in
), value (value_in
), step (step_in
), insn (MOV
),
180 modifier (LSL
), shift (0)
183 /* The current code model. */
184 enum aarch64_code_model aarch64_cmodel
;
186 /* The number of 64-bit elements in an SVE vector. */
187 poly_uint16 aarch64_sve_vg
;
190 #undef TARGET_HAVE_TLS
191 #define TARGET_HAVE_TLS 1
194 static bool aarch64_composite_type_p (const_tree
, machine_mode
);
195 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode
,
197 machine_mode
*, int *,
199 static void aarch64_elf_asm_constructor (rtx
, int) ATTRIBUTE_UNUSED
;
200 static void aarch64_elf_asm_destructor (rtx
, int) ATTRIBUTE_UNUSED
;
201 static void aarch64_override_options_after_change (void);
202 static bool aarch64_vector_mode_supported_p (machine_mode
);
203 static int aarch64_address_cost (rtx
, machine_mode
, addr_space_t
, bool);
204 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode
,
208 static machine_mode
aarch64_simd_container_mode (scalar_mode
, poly_int64
);
209 static bool aarch64_print_address_internal (FILE*, machine_mode
, rtx
,
210 aarch64_addr_query_type
);
212 /* Major revision number of the ARM Architecture implemented by the target. */
213 unsigned aarch64_architecture_version
;
215 /* The processor for which instructions should be scheduled. */
216 enum aarch64_processor aarch64_tune
= cortexa53
;
218 /* Mask to specify which instruction scheduling options should be used. */
219 unsigned long aarch64_tune_flags
= 0;
221 /* Global flag for PC relative loads. */
222 bool aarch64_pcrelative_literal_loads
;
224 /* Global flag for whether frame pointer is enabled. */
225 bool aarch64_use_frame_pointer
;
227 /* Support for command line parsing of boolean flags in the tuning
229 struct aarch64_flag_desc
235 #define AARCH64_FUSION_PAIR(name, internal_name) \
236 { name, AARCH64_FUSE_##internal_name },
237 static const struct aarch64_flag_desc aarch64_fusible_pairs
[] =
239 { "none", AARCH64_FUSE_NOTHING
},
240 #include "aarch64-fusion-pairs.def"
241 { "all", AARCH64_FUSE_ALL
},
242 { NULL
, AARCH64_FUSE_NOTHING
}
245 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
246 { name, AARCH64_EXTRA_TUNE_##internal_name },
247 static const struct aarch64_flag_desc aarch64_tuning_flags
[] =
249 { "none", AARCH64_EXTRA_TUNE_NONE
},
250 #include "aarch64-tuning-flags.def"
251 { "all", AARCH64_EXTRA_TUNE_ALL
},
252 { NULL
, AARCH64_EXTRA_TUNE_NONE
}
255 /* Tuning parameters. */
257 static const struct cpu_addrcost_table generic_addrcost_table
=
267 0, /* register_offset */
268 0, /* register_sextend */
269 0, /* register_zextend */
273 static const struct cpu_addrcost_table exynosm1_addrcost_table
=
283 1, /* register_offset */
284 1, /* register_sextend */
285 2, /* register_zextend */
289 static const struct cpu_addrcost_table xgene1_addrcost_table
=
299 0, /* register_offset */
300 1, /* register_sextend */
301 1, /* register_zextend */
305 static const struct cpu_addrcost_table thunderx2t99_addrcost_table
=
315 2, /* register_offset */
316 3, /* register_sextend */
317 3, /* register_zextend */
321 static const struct cpu_addrcost_table qdf24xx_addrcost_table
=
331 3, /* register_offset */
332 4, /* register_sextend */
333 3, /* register_zextend */
337 static const struct cpu_regmove_cost generic_regmove_cost
=
340 /* Avoid the use of slow int<->fp moves for spilling by setting
341 their cost higher than memmov_cost. */
347 static const struct cpu_regmove_cost cortexa57_regmove_cost
=
350 /* Avoid the use of slow int<->fp moves for spilling by setting
351 their cost higher than memmov_cost. */
357 static const struct cpu_regmove_cost cortexa53_regmove_cost
=
360 /* Avoid the use of slow int<->fp moves for spilling by setting
361 their cost higher than memmov_cost. */
367 static const struct cpu_regmove_cost exynosm1_regmove_cost
=
370 /* Avoid the use of slow int<->fp moves for spilling by setting
371 their cost higher than memmov_cost (actual, 4 and 9). */
377 static const struct cpu_regmove_cost thunderx_regmove_cost
=
385 static const struct cpu_regmove_cost xgene1_regmove_cost
=
388 /* Avoid the use of slow int<->fp moves for spilling by setting
389 their cost higher than memmov_cost. */
395 static const struct cpu_regmove_cost qdf24xx_regmove_cost
=
398 /* Avoid the use of int<->fp moves for spilling. */
404 static const struct cpu_regmove_cost thunderx2t99_regmove_cost
=
407 /* Avoid the use of int<->fp moves for spilling. */
413 /* Generic costs for vector insn classes. */
414 static const struct cpu_vector_cost generic_vector_cost
=
416 1, /* scalar_int_stmt_cost */
417 1, /* scalar_fp_stmt_cost */
418 1, /* scalar_load_cost */
419 1, /* scalar_store_cost */
420 1, /* vec_int_stmt_cost */
421 1, /* vec_fp_stmt_cost */
422 2, /* vec_permute_cost */
423 1, /* vec_to_scalar_cost */
424 1, /* scalar_to_vec_cost */
425 1, /* vec_align_load_cost */
426 1, /* vec_unalign_load_cost */
427 1, /* vec_unalign_store_cost */
428 1, /* vec_store_cost */
429 3, /* cond_taken_branch_cost */
430 1 /* cond_not_taken_branch_cost */
433 /* ThunderX costs for vector insn classes. */
434 static const struct cpu_vector_cost thunderx_vector_cost
=
436 1, /* scalar_int_stmt_cost */
437 1, /* scalar_fp_stmt_cost */
438 3, /* scalar_load_cost */
439 1, /* scalar_store_cost */
440 4, /* vec_int_stmt_cost */
441 1, /* vec_fp_stmt_cost */
442 4, /* vec_permute_cost */
443 2, /* vec_to_scalar_cost */
444 2, /* scalar_to_vec_cost */
445 3, /* vec_align_load_cost */
446 5, /* vec_unalign_load_cost */
447 5, /* vec_unalign_store_cost */
448 1, /* vec_store_cost */
449 3, /* cond_taken_branch_cost */
450 3 /* cond_not_taken_branch_cost */
453 /* Generic costs for vector insn classes. */
454 static const struct cpu_vector_cost cortexa57_vector_cost
=
456 1, /* scalar_int_stmt_cost */
457 1, /* scalar_fp_stmt_cost */
458 4, /* scalar_load_cost */
459 1, /* scalar_store_cost */
460 2, /* vec_int_stmt_cost */
461 2, /* vec_fp_stmt_cost */
462 3, /* vec_permute_cost */
463 8, /* vec_to_scalar_cost */
464 8, /* scalar_to_vec_cost */
465 4, /* vec_align_load_cost */
466 4, /* vec_unalign_load_cost */
467 1, /* vec_unalign_store_cost */
468 1, /* vec_store_cost */
469 1, /* cond_taken_branch_cost */
470 1 /* cond_not_taken_branch_cost */
473 static const struct cpu_vector_cost exynosm1_vector_cost
=
475 1, /* scalar_int_stmt_cost */
476 1, /* scalar_fp_stmt_cost */
477 5, /* scalar_load_cost */
478 1, /* scalar_store_cost */
479 3, /* vec_int_stmt_cost */
480 3, /* vec_fp_stmt_cost */
481 3, /* vec_permute_cost */
482 3, /* vec_to_scalar_cost */
483 3, /* scalar_to_vec_cost */
484 5, /* vec_align_load_cost */
485 5, /* vec_unalign_load_cost */
486 1, /* vec_unalign_store_cost */
487 1, /* vec_store_cost */
488 1, /* cond_taken_branch_cost */
489 1 /* cond_not_taken_branch_cost */
492 /* Generic costs for vector insn classes. */
493 static const struct cpu_vector_cost xgene1_vector_cost
=
495 1, /* scalar_int_stmt_cost */
496 1, /* scalar_fp_stmt_cost */
497 5, /* scalar_load_cost */
498 1, /* scalar_store_cost */
499 2, /* vec_int_stmt_cost */
500 2, /* vec_fp_stmt_cost */
501 2, /* vec_permute_cost */
502 4, /* vec_to_scalar_cost */
503 4, /* scalar_to_vec_cost */
504 10, /* vec_align_load_cost */
505 10, /* vec_unalign_load_cost */
506 2, /* vec_unalign_store_cost */
507 2, /* vec_store_cost */
508 2, /* cond_taken_branch_cost */
509 1 /* cond_not_taken_branch_cost */
512 /* Costs for vector insn classes for Vulcan. */
513 static const struct cpu_vector_cost thunderx2t99_vector_cost
=
515 1, /* scalar_int_stmt_cost */
516 6, /* scalar_fp_stmt_cost */
517 4, /* scalar_load_cost */
518 1, /* scalar_store_cost */
519 5, /* vec_int_stmt_cost */
520 6, /* vec_fp_stmt_cost */
521 3, /* vec_permute_cost */
522 6, /* vec_to_scalar_cost */
523 5, /* scalar_to_vec_cost */
524 8, /* vec_align_load_cost */
525 8, /* vec_unalign_load_cost */
526 4, /* vec_unalign_store_cost */
527 4, /* vec_store_cost */
528 2, /* cond_taken_branch_cost */
529 1 /* cond_not_taken_branch_cost */
532 /* Generic costs for branch instructions. */
533 static const struct cpu_branch_cost generic_branch_cost
=
535 1, /* Predictable. */
536 3 /* Unpredictable. */
539 /* Generic approximation modes. */
540 static const cpu_approx_modes generic_approx_modes
=
542 AARCH64_APPROX_NONE
, /* division */
543 AARCH64_APPROX_NONE
, /* sqrt */
544 AARCH64_APPROX_NONE
/* recip_sqrt */
547 /* Approximation modes for Exynos M1. */
548 static const cpu_approx_modes exynosm1_approx_modes
=
550 AARCH64_APPROX_NONE
, /* division */
551 AARCH64_APPROX_ALL
, /* sqrt */
552 AARCH64_APPROX_ALL
/* recip_sqrt */
555 /* Approximation modes for X-Gene 1. */
556 static const cpu_approx_modes xgene1_approx_modes
=
558 AARCH64_APPROX_NONE
, /* division */
559 AARCH64_APPROX_NONE
, /* sqrt */
560 AARCH64_APPROX_ALL
/* recip_sqrt */
563 /* Generic prefetch settings (which disable prefetch). */
564 static const cpu_prefetch_tune generic_prefetch_tune
=
567 -1, /* l1_cache_size */
568 -1, /* l1_cache_line_size */
569 -1, /* l2_cache_size */
570 true, /* prefetch_dynamic_strides */
571 -1, /* minimum_stride */
572 -1 /* default_opt_level */
575 static const cpu_prefetch_tune exynosm1_prefetch_tune
=
578 -1, /* l1_cache_size */
579 64, /* l1_cache_line_size */
580 -1, /* l2_cache_size */
581 true, /* prefetch_dynamic_strides */
582 -1, /* minimum_stride */
583 -1 /* default_opt_level */
586 static const cpu_prefetch_tune qdf24xx_prefetch_tune
=
589 32, /* l1_cache_size */
590 64, /* l1_cache_line_size */
591 512, /* l2_cache_size */
592 false, /* prefetch_dynamic_strides */
593 2048, /* minimum_stride */
594 3 /* default_opt_level */
597 static const cpu_prefetch_tune thunderxt88_prefetch_tune
=
600 32, /* l1_cache_size */
601 128, /* l1_cache_line_size */
602 16*1024, /* l2_cache_size */
603 true, /* prefetch_dynamic_strides */
604 -1, /* minimum_stride */
605 3 /* default_opt_level */
608 static const cpu_prefetch_tune thunderx_prefetch_tune
=
611 32, /* l1_cache_size */
612 128, /* l1_cache_line_size */
613 -1, /* l2_cache_size */
614 true, /* prefetch_dynamic_strides */
615 -1, /* minimum_stride */
616 -1 /* default_opt_level */
619 static const cpu_prefetch_tune thunderx2t99_prefetch_tune
=
622 32, /* l1_cache_size */
623 64, /* l1_cache_line_size */
624 256, /* l2_cache_size */
625 true, /* prefetch_dynamic_strides */
626 -1, /* minimum_stride */
627 -1 /* default_opt_level */
630 static const struct tune_params generic_tunings
=
632 &cortexa57_extra_costs
,
633 &generic_addrcost_table
,
634 &generic_regmove_cost
,
635 &generic_vector_cost
,
636 &generic_branch_cost
,
637 &generic_approx_modes
,
640 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
641 "8", /* function_align. */
642 "4", /* jump_align. */
643 "8", /* loop_align. */
644 2, /* int_reassoc_width. */
645 4, /* fp_reassoc_width. */
646 1, /* vec_reassoc_width. */
647 2, /* min_div_recip_mul_sf. */
648 2, /* min_div_recip_mul_df. */
649 0, /* max_case_values. */
650 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
651 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
652 &generic_prefetch_tune
655 static const struct tune_params cortexa35_tunings
=
657 &cortexa53_extra_costs
,
658 &generic_addrcost_table
,
659 &cortexa53_regmove_cost
,
660 &generic_vector_cost
,
661 &generic_branch_cost
,
662 &generic_approx_modes
,
665 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
666 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
667 "16", /* function_align. */
668 "4", /* jump_align. */
669 "8", /* loop_align. */
670 2, /* int_reassoc_width. */
671 4, /* fp_reassoc_width. */
672 1, /* vec_reassoc_width. */
673 2, /* min_div_recip_mul_sf. */
674 2, /* min_div_recip_mul_df. */
675 0, /* max_case_values. */
676 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
677 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
678 &generic_prefetch_tune
681 static const struct tune_params cortexa53_tunings
=
683 &cortexa53_extra_costs
,
684 &generic_addrcost_table
,
685 &cortexa53_regmove_cost
,
686 &generic_vector_cost
,
687 &generic_branch_cost
,
688 &generic_approx_modes
,
691 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
692 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
693 "16", /* function_align. */
694 "4", /* jump_align. */
695 "8", /* loop_align. */
696 2, /* int_reassoc_width. */
697 4, /* fp_reassoc_width. */
698 1, /* vec_reassoc_width. */
699 2, /* min_div_recip_mul_sf. */
700 2, /* min_div_recip_mul_df. */
701 0, /* max_case_values. */
702 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
703 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
704 &generic_prefetch_tune
707 static const struct tune_params cortexa57_tunings
=
709 &cortexa57_extra_costs
,
710 &generic_addrcost_table
,
711 &cortexa57_regmove_cost
,
712 &cortexa57_vector_cost
,
713 &generic_branch_cost
,
714 &generic_approx_modes
,
717 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
718 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
719 "16", /* function_align. */
720 "4", /* jump_align. */
721 "8", /* loop_align. */
722 2, /* int_reassoc_width. */
723 4, /* fp_reassoc_width. */
724 1, /* vec_reassoc_width. */
725 2, /* min_div_recip_mul_sf. */
726 2, /* min_div_recip_mul_df. */
727 0, /* max_case_values. */
728 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
729 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
), /* tune_flags. */
730 &generic_prefetch_tune
733 static const struct tune_params cortexa72_tunings
=
735 &cortexa57_extra_costs
,
736 &generic_addrcost_table
,
737 &cortexa57_regmove_cost
,
738 &cortexa57_vector_cost
,
739 &generic_branch_cost
,
740 &generic_approx_modes
,
743 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
744 | AARCH64_FUSE_MOVK_MOVK
), /* fusible_ops */
745 "16", /* function_align. */
746 "4", /* jump_align. */
747 "8", /* loop_align. */
748 2, /* int_reassoc_width. */
749 4, /* fp_reassoc_width. */
750 1, /* vec_reassoc_width. */
751 2, /* min_div_recip_mul_sf. */
752 2, /* min_div_recip_mul_df. */
753 0, /* max_case_values. */
754 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
755 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
756 &generic_prefetch_tune
759 static const struct tune_params cortexa73_tunings
=
761 &cortexa57_extra_costs
,
762 &generic_addrcost_table
,
763 &cortexa57_regmove_cost
,
764 &cortexa57_vector_cost
,
765 &generic_branch_cost
,
766 &generic_approx_modes
,
767 4, /* memmov_cost. */
769 (AARCH64_FUSE_AES_AESMC
| AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
770 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fusible_ops */
771 "16", /* function_align. */
772 "4", /* jump_align. */
773 "8", /* loop_align. */
774 2, /* int_reassoc_width. */
775 4, /* fp_reassoc_width. */
776 1, /* vec_reassoc_width. */
777 2, /* min_div_recip_mul_sf. */
778 2, /* min_div_recip_mul_df. */
779 0, /* max_case_values. */
780 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
781 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
782 &generic_prefetch_tune
787 static const struct tune_params exynosm1_tunings
=
789 &exynosm1_extra_costs
,
790 &exynosm1_addrcost_table
,
791 &exynosm1_regmove_cost
,
792 &exynosm1_vector_cost
,
793 &generic_branch_cost
,
794 &exynosm1_approx_modes
,
797 (AARCH64_FUSE_AES_AESMC
), /* fusible_ops */
798 "4", /* function_align. */
799 "4", /* jump_align. */
800 "4", /* loop_align. */
801 2, /* int_reassoc_width. */
802 4, /* fp_reassoc_width. */
803 1, /* vec_reassoc_width. */
804 2, /* min_div_recip_mul_sf. */
805 2, /* min_div_recip_mul_df. */
806 48, /* max_case_values. */
807 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
808 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
809 &exynosm1_prefetch_tune
812 static const struct tune_params thunderxt88_tunings
=
814 &thunderx_extra_costs
,
815 &generic_addrcost_table
,
816 &thunderx_regmove_cost
,
817 &thunderx_vector_cost
,
818 &generic_branch_cost
,
819 &generic_approx_modes
,
822 AARCH64_FUSE_CMP_BRANCH
, /* fusible_ops */
823 "8", /* function_align. */
824 "8", /* jump_align. */
825 "8", /* loop_align. */
826 2, /* int_reassoc_width. */
827 4, /* fp_reassoc_width. */
828 1, /* vec_reassoc_width. */
829 2, /* min_div_recip_mul_sf. */
830 2, /* min_div_recip_mul_df. */
831 0, /* max_case_values. */
832 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
833 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
), /* tune_flags. */
834 &thunderxt88_prefetch_tune
837 static const struct tune_params thunderx_tunings
=
839 &thunderx_extra_costs
,
840 &generic_addrcost_table
,
841 &thunderx_regmove_cost
,
842 &thunderx_vector_cost
,
843 &generic_branch_cost
,
844 &generic_approx_modes
,
847 AARCH64_FUSE_CMP_BRANCH
, /* fusible_ops */
848 "8", /* function_align. */
849 "8", /* jump_align. */
850 "8", /* loop_align. */
851 2, /* int_reassoc_width. */
852 4, /* fp_reassoc_width. */
853 1, /* vec_reassoc_width. */
854 2, /* min_div_recip_mul_sf. */
855 2, /* min_div_recip_mul_df. */
856 0, /* max_case_values. */
857 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
858 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
859 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
), /* tune_flags. */
860 &thunderx_prefetch_tune
863 static const struct tune_params xgene1_tunings
=
866 &xgene1_addrcost_table
,
867 &xgene1_regmove_cost
,
869 &generic_branch_cost
,
870 &xgene1_approx_modes
,
873 AARCH64_FUSE_NOTHING
, /* fusible_ops */
874 "16", /* function_align. */
875 "8", /* jump_align. */
876 "16", /* loop_align. */
877 2, /* int_reassoc_width. */
878 4, /* fp_reassoc_width. */
879 1, /* vec_reassoc_width. */
880 2, /* min_div_recip_mul_sf. */
881 2, /* min_div_recip_mul_df. */
882 0, /* max_case_values. */
883 tune_params::AUTOPREFETCHER_OFF
, /* autoprefetcher_model. */
884 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
), /* tune_flags. */
885 &generic_prefetch_tune
888 static const struct tune_params qdf24xx_tunings
=
890 &qdf24xx_extra_costs
,
891 &qdf24xx_addrcost_table
,
892 &qdf24xx_regmove_cost
,
893 &generic_vector_cost
,
894 &generic_branch_cost
,
895 &generic_approx_modes
,
898 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
899 | AARCH64_FUSE_MOVK_MOVK
), /* fuseable_ops */
900 "16", /* function_align. */
901 "8", /* jump_align. */
902 "16", /* loop_align. */
903 2, /* int_reassoc_width. */
904 4, /* fp_reassoc_width. */
905 1, /* vec_reassoc_width. */
906 2, /* min_div_recip_mul_sf. */
907 2, /* min_div_recip_mul_df. */
908 0, /* max_case_values. */
909 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
910 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
911 &qdf24xx_prefetch_tune
914 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
916 static const struct tune_params saphira_tunings
=
918 &generic_extra_costs
,
919 &generic_addrcost_table
,
920 &generic_regmove_cost
,
921 &generic_vector_cost
,
922 &generic_branch_cost
,
923 &generic_approx_modes
,
926 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
927 | AARCH64_FUSE_MOVK_MOVK
), /* fuseable_ops */
928 "16", /* function_align. */
929 "8", /* jump_align. */
930 "16", /* loop_align. */
931 2, /* int_reassoc_width. */
932 4, /* fp_reassoc_width. */
933 1, /* vec_reassoc_width. */
934 2, /* min_div_recip_mul_sf. */
935 2, /* min_div_recip_mul_df. */
936 0, /* max_case_values. */
937 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
938 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
939 &generic_prefetch_tune
942 static const struct tune_params thunderx2t99_tunings
=
944 &thunderx2t99_extra_costs
,
945 &thunderx2t99_addrcost_table
,
946 &thunderx2t99_regmove_cost
,
947 &thunderx2t99_vector_cost
,
948 &generic_branch_cost
,
949 &generic_approx_modes
,
950 4, /* memmov_cost. */
952 (AARCH64_FUSE_CMP_BRANCH
| AARCH64_FUSE_AES_AESMC
953 | AARCH64_FUSE_ALU_BRANCH
), /* fusible_ops */
954 "16", /* function_align. */
955 "8", /* jump_align. */
956 "16", /* loop_align. */
957 3, /* int_reassoc_width. */
958 2, /* fp_reassoc_width. */
959 2, /* vec_reassoc_width. */
960 2, /* min_div_recip_mul_sf. */
961 2, /* min_div_recip_mul_df. */
962 0, /* max_case_values. */
963 tune_params::AUTOPREFETCHER_WEAK
, /* autoprefetcher_model. */
964 (AARCH64_EXTRA_TUNE_NONE
), /* tune_flags. */
965 &thunderx2t99_prefetch_tune
968 /* Support for fine-grained override of the tuning structures. */
969 struct aarch64_tuning_override_function
972 void (*parse_override
)(const char*, struct tune_params
*);
975 static void aarch64_parse_fuse_string (const char*, struct tune_params
*);
976 static void aarch64_parse_tune_string (const char*, struct tune_params
*);
978 static const struct aarch64_tuning_override_function
979 aarch64_tuning_override_functions
[] =
981 { "fuse", aarch64_parse_fuse_string
},
982 { "tune", aarch64_parse_tune_string
},
986 /* A processor implementing AArch64. */
989 const char *const name
;
990 enum aarch64_processor ident
;
991 enum aarch64_processor sched_core
;
992 enum aarch64_arch arch
;
993 unsigned architecture_version
;
994 const unsigned long flags
;
995 const struct tune_params
*const tune
;
998 /* Architectures implementing AArch64. */
999 static const struct processor all_architectures
[] =
1001 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1002 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1003 #include "aarch64-arches.def"
1004 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
1007 /* Processor cores implementing AArch64. */
1008 static const struct processor all_cores
[] =
1010 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1011 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1012 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1013 FLAGS, &COSTS##_tunings},
1014 #include "aarch64-cores.def"
1015 {"generic", generic
, cortexa53
, AARCH64_ARCH_8A
, 8,
1016 AARCH64_FL_FOR_ARCH8
, &generic_tunings
},
1017 {NULL
, aarch64_none
, aarch64_none
, aarch64_no_arch
, 0, 0, NULL
}
1021 /* Target specification. These are populated by the -march, -mtune, -mcpu
1022 handling code or by target attributes. */
1023 static const struct processor
*selected_arch
;
1024 static const struct processor
*selected_cpu
;
1025 static const struct processor
*selected_tune
;
1027 /* The current tuning set. */
1028 struct tune_params aarch64_tune_params
= generic_tunings
;
1030 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1032 /* An ISA extension in the co-processor and main instruction set space. */
1033 struct aarch64_option_extension
1035 const char *const name
;
1036 const unsigned long flags_on
;
1037 const unsigned long flags_off
;
1040 typedef enum aarch64_cond_code
1042 AARCH64_EQ
= 0, AARCH64_NE
, AARCH64_CS
, AARCH64_CC
, AARCH64_MI
, AARCH64_PL
,
1043 AARCH64_VS
, AARCH64_VC
, AARCH64_HI
, AARCH64_LS
, AARCH64_GE
, AARCH64_LT
,
1044 AARCH64_GT
, AARCH64_LE
, AARCH64_AL
, AARCH64_NV
1048 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1050 /* The condition codes of the processor, and the inverse function. */
1051 static const char * const aarch64_condition_codes
[] =
1053 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1054 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1057 /* Generate code to enable conditional branches in functions over 1 MiB. */
1059 aarch64_gen_far_branch (rtx
* operands
, int pos_label
, const char * dest
,
1060 const char * branch_format
)
1062 rtx_code_label
* tmp_label
= gen_label_rtx ();
1063 char label_buf
[256];
1065 ASM_GENERATE_INTERNAL_LABEL (label_buf
, dest
,
1066 CODE_LABEL_NUMBER (tmp_label
));
1067 const char *label_ptr
= targetm
.strip_name_encoding (label_buf
);
1068 rtx dest_label
= operands
[pos_label
];
1069 operands
[pos_label
] = tmp_label
;
1071 snprintf (buffer
, sizeof (buffer
), "%s%s", branch_format
, label_ptr
);
1072 output_asm_insn (buffer
, operands
);
1074 snprintf (buffer
, sizeof (buffer
), "b\t%%l%d\n%s:", pos_label
, label_ptr
);
1075 operands
[pos_label
] = dest_label
;
1076 output_asm_insn (buffer
, operands
);
1081 aarch64_err_no_fpadvsimd (machine_mode mode
)
1083 if (TARGET_GENERAL_REGS_ONLY
)
1084 if (FLOAT_MODE_P (mode
))
1085 error ("%qs is incompatible with the use of floating-point types",
1086 "-mgeneral-regs-only");
1088 error ("%qs is incompatible with the use of vector types",
1089 "-mgeneral-regs-only");
1091 if (FLOAT_MODE_P (mode
))
1092 error ("%qs feature modifier is incompatible with the use of"
1093 " floating-point types", "+nofp");
1095 error ("%qs feature modifier is incompatible with the use of"
1096 " vector types", "+nofp");
1099 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1100 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1101 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1102 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1103 and GENERAL_REGS is lower than the memory cost (in this case the best class
1104 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its
1105 cost results in bad allocations with many redundant int<->FP moves which
1106 are expensive on various cores.
1107 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1108 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1109 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1110 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1111 The result of this is that it is no longer inefficient to have a higher
1112 memory move cost than the register move cost.
1116 aarch64_ira_change_pseudo_allocno_class (int regno
, reg_class_t allocno_class
,
1117 reg_class_t best_class
)
1121 if (!reg_class_subset_p (GENERAL_REGS
, allocno_class
)
1122 || !reg_class_subset_p (FP_REGS
, allocno_class
))
1123 return allocno_class
;
1125 if (!reg_class_subset_p (GENERAL_REGS
, best_class
)
1126 || !reg_class_subset_p (FP_REGS
, best_class
))
1129 mode
= PSEUDO_REGNO_MODE (regno
);
1130 return FLOAT_MODE_P (mode
) || VECTOR_MODE_P (mode
) ? FP_REGS
: GENERAL_REGS
;
1134 aarch64_min_divisions_for_recip_mul (machine_mode mode
)
1136 if (GET_MODE_UNIT_SIZE (mode
) == 4)
1137 return aarch64_tune_params
.min_div_recip_mul_sf
;
1138 return aarch64_tune_params
.min_div_recip_mul_df
;
1141 /* Return the reassociation width of treeop OPC with mode MODE. */
1143 aarch64_reassociation_width (unsigned opc
, machine_mode mode
)
1145 if (VECTOR_MODE_P (mode
))
1146 return aarch64_tune_params
.vec_reassoc_width
;
1147 if (INTEGRAL_MODE_P (mode
))
1148 return aarch64_tune_params
.int_reassoc_width
;
1149 /* Avoid reassociating floating point addition so we emit more FMAs. */
1150 if (FLOAT_MODE_P (mode
) && opc
!= PLUS_EXPR
)
1151 return aarch64_tune_params
.fp_reassoc_width
;
1155 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1157 aarch64_dbx_register_number (unsigned regno
)
1159 if (GP_REGNUM_P (regno
))
1160 return AARCH64_DWARF_R0
+ regno
- R0_REGNUM
;
1161 else if (regno
== SP_REGNUM
)
1162 return AARCH64_DWARF_SP
;
1163 else if (FP_REGNUM_P (regno
))
1164 return AARCH64_DWARF_V0
+ regno
- V0_REGNUM
;
1165 else if (PR_REGNUM_P (regno
))
1166 return AARCH64_DWARF_P0
+ regno
- P0_REGNUM
;
1167 else if (regno
== VG_REGNUM
)
1168 return AARCH64_DWARF_VG
;
1170 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1171 equivalent DWARF register. */
1172 return DWARF_FRAME_REGISTERS
;
1175 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1177 aarch64_advsimd_struct_mode_p (machine_mode mode
)
1180 && (mode
== OImode
|| mode
== CImode
|| mode
== XImode
));
1183 /* Return true if MODE is an SVE predicate mode. */
1185 aarch64_sve_pred_mode_p (machine_mode mode
)
1188 && (mode
== VNx16BImode
1189 || mode
== VNx8BImode
1190 || mode
== VNx4BImode
1191 || mode
== VNx2BImode
));
/* Three mutually-exclusive flags describing a vector or predicate type.  */
const unsigned int VEC_ADVSIMD  = 1;
const unsigned int VEC_SVE_DATA = 2;
const unsigned int VEC_SVE_PRED = 4;
/* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
   a structure of 2, 3 or 4 vectors.  */
const unsigned int VEC_STRUCT   = 8;
/* Useful combinations of the above.  */
const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1205 /* Return a set of flags describing the vector properties of mode MODE.
1206 Ignore modes that are not supported by the current target. */
1208 aarch64_classify_vector_mode (machine_mode mode
)
1210 if (aarch64_advsimd_struct_mode_p (mode
))
1211 return VEC_ADVSIMD
| VEC_STRUCT
;
1213 if (aarch64_sve_pred_mode_p (mode
))
1214 return VEC_SVE_PRED
;
1216 scalar_mode inner
= GET_MODE_INNER (mode
);
1217 if (VECTOR_MODE_P (mode
)
1224 || inner
== DFmode
))
1228 if (known_eq (GET_MODE_BITSIZE (mode
), BITS_PER_SVE_VECTOR
))
1229 return VEC_SVE_DATA
;
1230 if (known_eq (GET_MODE_BITSIZE (mode
), BITS_PER_SVE_VECTOR
* 2)
1231 || known_eq (GET_MODE_BITSIZE (mode
), BITS_PER_SVE_VECTOR
* 3)
1232 || known_eq (GET_MODE_BITSIZE (mode
), BITS_PER_SVE_VECTOR
* 4))
1233 return VEC_SVE_DATA
| VEC_STRUCT
;
1236 /* This includes V1DF but not V1DI (which doesn't exist). */
1238 && (known_eq (GET_MODE_BITSIZE (mode
), 64)
1239 || known_eq (GET_MODE_BITSIZE (mode
), 128)))
1246 /* Return true if MODE is any of the data vector modes, including
1249 aarch64_vector_data_mode_p (machine_mode mode
)
1251 return aarch64_classify_vector_mode (mode
) & VEC_ANY_DATA
;
1254 /* Return true if MODE is an SVE data vector mode; either a single vector
1255 or a structure of vectors. */
1257 aarch64_sve_data_mode_p (machine_mode mode
)
1259 return aarch64_classify_vector_mode (mode
) & VEC_SVE_DATA
;
1262 /* Implement target hook TARGET_ARRAY_MODE. */
1263 static opt_machine_mode
1264 aarch64_array_mode (machine_mode mode
, unsigned HOST_WIDE_INT nelems
)
1266 if (aarch64_classify_vector_mode (mode
) == VEC_SVE_DATA
1267 && IN_RANGE (nelems
, 2, 4))
1268 return mode_for_vector (GET_MODE_INNER (mode
),
1269 GET_MODE_NUNITS (mode
) * nelems
);
1271 return opt_machine_mode ();
1274 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1276 aarch64_array_mode_supported_p (machine_mode mode
,
1277 unsigned HOST_WIDE_INT nelems
)
1280 && (AARCH64_VALID_SIMD_QREG_MODE (mode
)
1281 || AARCH64_VALID_SIMD_DREG_MODE (mode
))
1282 && (nelems
>= 2 && nelems
<= 4))
1288 /* Return the SVE predicate mode to use for elements that have
1289 ELEM_NBYTES bytes, if such a mode exists. */
1292 aarch64_sve_pred_mode (unsigned int elem_nbytes
)
1296 if (elem_nbytes
== 1)
1298 if (elem_nbytes
== 2)
1300 if (elem_nbytes
== 4)
1302 if (elem_nbytes
== 8)
1305 return opt_machine_mode ();
1308 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1310 static opt_machine_mode
1311 aarch64_get_mask_mode (poly_uint64 nunits
, poly_uint64 nbytes
)
1313 if (TARGET_SVE
&& known_eq (nbytes
, BYTES_PER_SVE_VECTOR
))
1315 unsigned int elem_nbytes
= vector_element_size (nbytes
, nunits
);
1316 machine_mode pred_mode
;
1317 if (aarch64_sve_pred_mode (elem_nbytes
).exists (&pred_mode
))
1321 return default_get_mask_mode (nunits
, nbytes
);
1324 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1325 prefer to use the first arithmetic operand as the else value if
1326 the else value doesn't matter, since that exactly matches the SVE
1327 destructive merging form. For ternary operations we could either
1328 pick the first operand and use FMAD-like instructions or the last
1329 operand and use FMLA-like instructions; the latter seems more
1333 aarch64_preferred_else_value (unsigned, tree
, unsigned int nops
, tree
*ops
)
1335 return nops
== 3 ? ops
[2] : ops
[0];
1338 /* Implement TARGET_HARD_REGNO_NREGS. */
1341 aarch64_hard_regno_nregs (unsigned regno
, machine_mode mode
)
1343 /* ??? Logically we should only need to provide a value when
1344 HARD_REGNO_MODE_OK says that the combination is valid,
1345 but at the moment we need to handle all modes. Just ignore
1346 any runtime parts for registers that can't store them. */
1347 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
1348 switch (aarch64_regno_regclass (regno
))
1352 if (aarch64_sve_data_mode_p (mode
))
1353 return exact_div (GET_MODE_SIZE (mode
),
1354 BYTES_PER_SVE_VECTOR
).to_constant ();
1355 return CEIL (lowest_size
, UNITS_PER_VREG
);
1361 return CEIL (lowest_size
, UNITS_PER_WORD
);
1366 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1369 aarch64_hard_regno_mode_ok (unsigned regno
, machine_mode mode
)
1371 if (GET_MODE_CLASS (mode
) == MODE_CC
)
1372 return regno
== CC_REGNUM
;
1374 if (regno
== VG_REGNUM
)
1375 /* This must have the same size as _Unwind_Word. */
1376 return mode
== DImode
;
1378 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1379 if (vec_flags
& VEC_SVE_PRED
)
1380 return PR_REGNUM_P (regno
);
1382 if (PR_REGNUM_P (regno
))
1385 if (regno
== SP_REGNUM
)
1386 /* The purpose of comparing with ptr_mode is to support the
1387 global register variable associated with the stack pointer
1388 register via the syntax of asm ("wsp") in ILP32. */
1389 return mode
== Pmode
|| mode
== ptr_mode
;
1391 if (regno
== FRAME_POINTER_REGNUM
|| regno
== ARG_POINTER_REGNUM
)
1392 return mode
== Pmode
;
1394 if (GP_REGNUM_P (regno
) && known_le (GET_MODE_SIZE (mode
), 16))
1397 if (FP_REGNUM_P (regno
))
1399 if (vec_flags
& VEC_STRUCT
)
1400 return end_hard_regno (mode
, regno
) - 1 <= V31_REGNUM
;
1402 return !VECTOR_MODE_P (mode
) || vec_flags
!= 0;
1408 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1409 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1410 clobbers the top 64 bits when restoring the bottom 64 bits. */
1413 aarch64_hard_regno_call_part_clobbered (unsigned int regno
, machine_mode mode
)
1415 return FP_REGNUM_P (regno
) && maybe_gt (GET_MODE_SIZE (mode
), 8);
1418 /* Implement REGMODE_NATURAL_SIZE. */
1420 aarch64_regmode_natural_size (machine_mode mode
)
1422 /* The natural size for SVE data modes is one SVE data vector,
1423 and similarly for predicates. We can't independently modify
1424 anything smaller than that. */
1425 /* ??? For now, only do this for variable-width SVE registers.
1426 Doing it for constant-sized registers breaks lower-subreg.c. */
1427 /* ??? And once that's fixed, we should probably have similar
1428 code for Advanced SIMD. */
1429 if (!aarch64_sve_vg
.is_constant ())
1431 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
1432 if (vec_flags
& VEC_SVE_PRED
)
1433 return BYTES_PER_SVE_PRED
;
1434 if (vec_flags
& VEC_SVE_DATA
)
1435 return BYTES_PER_SVE_VECTOR
;
1437 return UNITS_PER_WORD
;
1440 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1442 aarch64_hard_regno_caller_save_mode (unsigned regno
, unsigned,
1445 /* The predicate mode determines which bits are significant and
1446 which are "don't care". Decreasing the number of lanes would
1447 lose data while increasing the number of lanes would make bits
1448 unnecessarily significant. */
1449 if (PR_REGNUM_P (regno
))
1451 if (known_ge (GET_MODE_SIZE (mode
), 4))
1457 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1458 that strcpy from constants will be faster. */
1460 static HOST_WIDE_INT
1461 aarch64_constant_alignment (const_tree exp
, HOST_WIDE_INT align
)
1463 if (TREE_CODE (exp
) == STRING_CST
&& !optimize_size
)
1464 return MAX (align
, BITS_PER_WORD
);
1468 /* Return true if calls to DECL should be treated as
1469 long-calls (ie called via a register). */
1471 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED
)
1476 /* Return true if calls to symbol-ref SYM should be treated as
1477 long-calls (ie called via a register). */
1479 aarch64_is_long_call_p (rtx sym
)
1481 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym
));
1484 /* Return true if calls to symbol-ref SYM should not go through
1488 aarch64_is_noplt_call_p (rtx sym
)
1490 const_tree decl
= SYMBOL_REF_DECL (sym
);
1495 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl
)))
1496 && !targetm
.binds_local_p (decl
))
1502 /* Return true if the offsets to a zero/sign-extract operation
1503 represent an expression that matches an extend operation. The
1504 operands represent the paramters from
1506 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1508 aarch64_is_extend_from_extract (scalar_int_mode mode
, rtx mult_imm
,
1511 HOST_WIDE_INT mult_val
, extract_val
;
1513 if (! CONST_INT_P (mult_imm
) || ! CONST_INT_P (extract_imm
))
1516 mult_val
= INTVAL (mult_imm
);
1517 extract_val
= INTVAL (extract_imm
);
1520 && extract_val
< GET_MODE_BITSIZE (mode
)
1521 && exact_log2 (extract_val
& ~7) > 0
1522 && (extract_val
& 7) <= 4
1523 && mult_val
== (1 << (extract_val
& 7)))
1529 /* Emit an insn that's a simple single-set. Both the operands must be
1530 known to be valid. */
1531 inline static rtx_insn
*
1532 emit_set_insn (rtx x
, rtx y
)
1534 return emit_insn (gen_rtx_SET (x
, y
));
1537 /* X and Y are two things to compare using CODE. Emit the compare insn and
1538 return the rtx for register 0 in the proper mode. */
1540 aarch64_gen_compare_reg (RTX_CODE code
, rtx x
, rtx y
)
1542 machine_mode mode
= SELECT_CC_MODE (code
, x
, y
);
1543 rtx cc_reg
= gen_rtx_REG (mode
, CC_REGNUM
);
1545 emit_set_insn (cc_reg
, gen_rtx_COMPARE (mode
, x
, y
));
1549 /* Build the SYMBOL_REF for __tls_get_addr. */
1551 static GTY(()) rtx tls_get_addr_libfunc
;
1554 aarch64_tls_get_addr (void)
1556 if (!tls_get_addr_libfunc
)
1557 tls_get_addr_libfunc
= init_one_libfunc ("__tls_get_addr");
1558 return tls_get_addr_libfunc
;
1561 /* Return the TLS model to use for ADDR. */
1563 static enum tls_model
1564 tls_symbolic_operand_type (rtx addr
)
1566 enum tls_model tls_kind
= TLS_MODEL_NONE
;
1567 if (GET_CODE (addr
) == CONST
)
1570 rtx sym
= strip_offset (addr
, &addend
);
1571 if (GET_CODE (sym
) == SYMBOL_REF
)
1572 tls_kind
= SYMBOL_REF_TLS_MODEL (sym
);
1574 else if (GET_CODE (addr
) == SYMBOL_REF
)
1575 tls_kind
= SYMBOL_REF_TLS_MODEL (addr
);
1580 /* We'll allow lo_sum's in addresses in our legitimate addresses
1581 so that combine would take care of combining addresses where
1582 necessary, but for generation purposes, we'll generate the address
1585 tmp = hi (symbol_ref); adrp x1, foo
1586 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1590 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1591 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1595 Load TLS symbol, depending on TLS mechanism and TLS access model.
1597 Global Dynamic - Traditional TLS:
1598 adrp tmp, :tlsgd:imm
1599 add dest, tmp, #:tlsgd_lo12:imm
1602 Global Dynamic - TLS Descriptors:
1603 adrp dest, :tlsdesc:imm
1604 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1605 add dest, dest, #:tlsdesc_lo12:imm
1612 adrp tmp, :gottprel:imm
1613 ldr dest, [tmp, #:gottprel_lo12:imm]
1618 add t0, tp, #:tprel_hi12:imm, lsl #12
1619 add t0, t0, #:tprel_lo12_nc:imm
1623 aarch64_load_symref_appropriately (rtx dest
, rtx imm
,
1624 enum aarch64_symbol_type type
)
1628 case SYMBOL_SMALL_ABSOLUTE
:
1630 /* In ILP32, the mode of dest can be either SImode or DImode. */
1632 machine_mode mode
= GET_MODE (dest
);
1634 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
1636 if (can_create_pseudo_p ())
1637 tmp_reg
= gen_reg_rtx (mode
);
1639 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
1640 emit_insn (gen_add_losym (dest
, tmp_reg
, imm
));
1644 case SYMBOL_TINY_ABSOLUTE
:
1645 emit_insn (gen_rtx_SET (dest
, imm
));
1648 case SYMBOL_SMALL_GOT_28K
:
1650 machine_mode mode
= GET_MODE (dest
);
1651 rtx gp_rtx
= pic_offset_table_rtx
;
1655 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1656 here before rtl expand. Tree IVOPT will generate rtl pattern to
1657 decide rtx costs, in which case pic_offset_table_rtx is not
1658 initialized. For that case no need to generate the first adrp
1659 instruction as the final cost for global variable access is
1663 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1664 using the page base as GOT base, the first page may be wasted,
1665 in the worst scenario, there is only 28K space for GOT).
1667 The generate instruction sequence for accessing global variable
1670 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1672 Only one instruction needed. But we must initialize
1673 pic_offset_table_rtx properly. We generate initialize insn for
1674 every global access, and allow CSE to remove all redundant.
1676 The final instruction sequences will look like the following
1677 for multiply global variables access.
1679 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1681 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1682 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1683 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1686 rtx s
= gen_rtx_SYMBOL_REF (Pmode
, "_GLOBAL_OFFSET_TABLE_");
1687 crtl
->uses_pic_offset_table
= 1;
1688 emit_move_insn (gp_rtx
, gen_rtx_HIGH (Pmode
, s
));
1690 if (mode
!= GET_MODE (gp_rtx
))
1691 gp_rtx
= gen_lowpart (mode
, gp_rtx
);
1695 if (mode
== ptr_mode
)
1698 insn
= gen_ldr_got_small_28k_di (dest
, gp_rtx
, imm
);
1700 insn
= gen_ldr_got_small_28k_si (dest
, gp_rtx
, imm
);
1702 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
1706 gcc_assert (mode
== Pmode
);
1708 insn
= gen_ldr_got_small_28k_sidi (dest
, gp_rtx
, imm
);
1709 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
1712 /* The operand is expected to be MEM. Whenever the related insn
1713 pattern changed, above code which calculate mem should be
1715 gcc_assert (GET_CODE (mem
) == MEM
);
1716 MEM_READONLY_P (mem
) = 1;
1717 MEM_NOTRAP_P (mem
) = 1;
1722 case SYMBOL_SMALL_GOT_4G
:
1724 /* In ILP32, the mode of dest can be either SImode or DImode,
1725 while the got entry is always of SImode size. The mode of
1726 dest depends on how dest is used: if dest is assigned to a
1727 pointer (e.g. in the memory), it has SImode; it may have
1728 DImode if dest is dereferenced to access the memeory.
1729 This is why we have to handle three different ldr_got_small
1730 patterns here (two patterns for ILP32). */
1735 machine_mode mode
= GET_MODE (dest
);
1737 if (can_create_pseudo_p ())
1738 tmp_reg
= gen_reg_rtx (mode
);
1740 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
1741 if (mode
== ptr_mode
)
1744 insn
= gen_ldr_got_small_di (dest
, tmp_reg
, imm
);
1746 insn
= gen_ldr_got_small_si (dest
, tmp_reg
, imm
);
1748 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
1752 gcc_assert (mode
== Pmode
);
1754 insn
= gen_ldr_got_small_sidi (dest
, tmp_reg
, imm
);
1755 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
1758 gcc_assert (GET_CODE (mem
) == MEM
);
1759 MEM_READONLY_P (mem
) = 1;
1760 MEM_NOTRAP_P (mem
) = 1;
1765 case SYMBOL_SMALL_TLSGD
:
1768 machine_mode mode
= GET_MODE (dest
);
1769 rtx result
= gen_rtx_REG (mode
, R0_REGNUM
);
1773 aarch64_emit_call_insn (gen_tlsgd_small_si (result
, imm
));
1775 aarch64_emit_call_insn (gen_tlsgd_small_di (result
, imm
));
1776 insns
= get_insns ();
1779 RTL_CONST_CALL_P (insns
) = 1;
1780 emit_libcall_block (insns
, dest
, result
, imm
);
1784 case SYMBOL_SMALL_TLSDESC
:
1786 machine_mode mode
= GET_MODE (dest
);
1787 rtx x0
= gen_rtx_REG (mode
, R0_REGNUM
);
1790 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
1792 /* In ILP32, the got entry is always of SImode size. Unlike
1793 small GOT, the dest is fixed at reg 0. */
1795 emit_insn (gen_tlsdesc_small_si (imm
));
1797 emit_insn (gen_tlsdesc_small_di (imm
));
1798 tp
= aarch64_load_tp (NULL
);
1801 tp
= gen_lowpart (mode
, tp
);
1803 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, x0
)));
1805 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1809 case SYMBOL_SMALL_TLSIE
:
1811 /* In ILP32, the mode of dest can be either SImode or DImode,
1812 while the got entry is always of SImode size. The mode of
1813 dest depends on how dest is used: if dest is assigned to a
1814 pointer (e.g. in the memory), it has SImode; it may have
1815 DImode if dest is dereferenced to access the memeory.
1816 This is why we have to handle three different tlsie_small
1817 patterns here (two patterns for ILP32). */
1818 machine_mode mode
= GET_MODE (dest
);
1819 rtx tmp_reg
= gen_reg_rtx (mode
);
1820 rtx tp
= aarch64_load_tp (NULL
);
1822 if (mode
== ptr_mode
)
1825 emit_insn (gen_tlsie_small_di (tmp_reg
, imm
));
1828 emit_insn (gen_tlsie_small_si (tmp_reg
, imm
));
1829 tp
= gen_lowpart (mode
, tp
);
1834 gcc_assert (mode
== Pmode
);
1835 emit_insn (gen_tlsie_small_sidi (tmp_reg
, imm
));
1838 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, tmp_reg
)));
1840 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1844 case SYMBOL_TLSLE12
:
1845 case SYMBOL_TLSLE24
:
1846 case SYMBOL_TLSLE32
:
1847 case SYMBOL_TLSLE48
:
1849 machine_mode mode
= GET_MODE (dest
);
1850 rtx tp
= aarch64_load_tp (NULL
);
1853 tp
= gen_lowpart (mode
, tp
);
1857 case SYMBOL_TLSLE12
:
1858 emit_insn ((mode
== DImode
? gen_tlsle12_di
: gen_tlsle12_si
)
1861 case SYMBOL_TLSLE24
:
1862 emit_insn ((mode
== DImode
? gen_tlsle24_di
: gen_tlsle24_si
)
1865 case SYMBOL_TLSLE32
:
1866 emit_insn ((mode
== DImode
? gen_tlsle32_di
: gen_tlsle32_si
)
1868 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
1871 case SYMBOL_TLSLE48
:
1872 emit_insn ((mode
== DImode
? gen_tlsle48_di
: gen_tlsle48_si
)
1874 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
1882 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1886 case SYMBOL_TINY_GOT
:
1887 emit_insn (gen_ldr_got_tiny (dest
, imm
));
1890 case SYMBOL_TINY_TLSIE
:
1892 machine_mode mode
= GET_MODE (dest
);
1893 rtx tp
= aarch64_load_tp (NULL
);
1895 if (mode
== ptr_mode
)
1898 emit_insn (gen_tlsie_tiny_di (dest
, imm
, tp
));
1901 tp
= gen_lowpart (mode
, tp
);
1902 emit_insn (gen_tlsie_tiny_si (dest
, imm
, tp
));
1907 gcc_assert (mode
== Pmode
);
1908 emit_insn (gen_tlsie_tiny_sidi (dest
, imm
, tp
));
1912 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
1921 /* Emit a move from SRC to DEST. Assume that the move expanders can
1922 handle all moves if !can_create_pseudo_p (). The distinction is
1923 important because, unlike emit_move_insn, the move expanders know
1924 how to force Pmode objects into the constant pool even when the
1925 constant pool address is not itself legitimate. */
1927 aarch64_emit_move (rtx dest
, rtx src
)
1929 return (can_create_pseudo_p ()
1930 ? emit_move_insn (dest
, src
)
1931 : emit_move_insn_1 (dest
, src
));
1934 /* Apply UNOPTAB to OP and store the result in DEST. */
1937 aarch64_emit_unop (rtx dest
, optab unoptab
, rtx op
)
1939 rtx tmp
= expand_unop (GET_MODE (dest
), unoptab
, op
, dest
, 0);
1941 emit_move_insn (dest
, tmp
);
1944 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
1947 aarch64_emit_binop (rtx dest
, optab binoptab
, rtx op0
, rtx op1
)
1949 rtx tmp
= expand_binop (GET_MODE (dest
), binoptab
, op0
, op1
, dest
, 0,
1952 emit_move_insn (dest
, tmp
);
1955 /* Split a 128-bit move operation into two 64-bit move operations,
1956 taking care to handle partial overlap of register to register
1957 copies. Special cases are needed when moving between GP regs and
1958 FP regs. SRC can be a register, constant or memory; DST a register
1959 or memory. If either operand is memory it must not have any side
1962 aarch64_split_128bit_move (rtx dst
, rtx src
)
1967 machine_mode mode
= GET_MODE (dst
);
1969 gcc_assert (mode
== TImode
|| mode
== TFmode
);
1970 gcc_assert (!(side_effects_p (src
) || side_effects_p (dst
)));
1971 gcc_assert (mode
== GET_MODE (src
) || GET_MODE (src
) == VOIDmode
);
1973 if (REG_P (dst
) && REG_P (src
))
1975 int src_regno
= REGNO (src
);
1976 int dst_regno
= REGNO (dst
);
1978 /* Handle FP <-> GP regs. */
1979 if (FP_REGNUM_P (dst_regno
) && GP_REGNUM_P (src_regno
))
1981 src_lo
= gen_lowpart (word_mode
, src
);
1982 src_hi
= gen_highpart (word_mode
, src
);
1984 emit_insn (gen_aarch64_movlow_di (mode
, dst
, src_lo
));
1985 emit_insn (gen_aarch64_movhigh_di (mode
, dst
, src_hi
));
1988 else if (GP_REGNUM_P (dst_regno
) && FP_REGNUM_P (src_regno
))
1990 dst_lo
= gen_lowpart (word_mode
, dst
);
1991 dst_hi
= gen_highpart (word_mode
, dst
);
1993 emit_insn (gen_aarch64_movdi_low (mode
, dst_lo
, src
));
1994 emit_insn (gen_aarch64_movdi_high (mode
, dst_hi
, src
));
1999 dst_lo
= gen_lowpart (word_mode
, dst
);
2000 dst_hi
= gen_highpart (word_mode
, dst
);
2001 src_lo
= gen_lowpart (word_mode
, src
);
2002 src_hi
= gen_highpart_mode (word_mode
, mode
, src
);
2004 /* At most one pairing may overlap. */
2005 if (reg_overlap_mentioned_p (dst_lo
, src_hi
))
2007 aarch64_emit_move (dst_hi
, src_hi
);
2008 aarch64_emit_move (dst_lo
, src_lo
);
2012 aarch64_emit_move (dst_lo
, src_lo
);
2013 aarch64_emit_move (dst_hi
, src_hi
);
2018 aarch64_split_128bit_move_p (rtx dst
, rtx src
)
2020 return (! REG_P (src
)
2021 || ! (FP_REGNUM_P (REGNO (dst
)) && FP_REGNUM_P (REGNO (src
))));
2024 /* Split a complex SIMD combine. */
2027 aarch64_split_simd_combine (rtx dst
, rtx src1
, rtx src2
)
2029 machine_mode src_mode
= GET_MODE (src1
);
2030 machine_mode dst_mode
= GET_MODE (dst
);
2032 gcc_assert (VECTOR_MODE_P (dst_mode
));
2033 gcc_assert (register_operand (dst
, dst_mode
)
2034 && register_operand (src1
, src_mode
)
2035 && register_operand (src2
, src_mode
));
2037 emit_insn (gen_aarch64_simd_combine (src_mode
, dst
, src1
, src2
));
2041 /* Split a complex SIMD move. */
2044 aarch64_split_simd_move (rtx dst
, rtx src
)
2046 machine_mode src_mode
= GET_MODE (src
);
2047 machine_mode dst_mode
= GET_MODE (dst
);
2049 gcc_assert (VECTOR_MODE_P (dst_mode
));
2051 if (REG_P (dst
) && REG_P (src
))
2053 gcc_assert (VECTOR_MODE_P (src_mode
));
2054 emit_insn (gen_aarch64_split_simd_mov (src_mode
, dst
, src
));
2059 aarch64_zero_extend_const_eq (machine_mode xmode
, rtx x
,
2060 machine_mode ymode
, rtx y
)
2062 rtx r
= simplify_const_unary_operation (ZERO_EXTEND
, xmode
, y
, ymode
);
2063 gcc_assert (r
!= NULL
);
2064 return rtx_equal_p (x
, r
);
2069 aarch64_force_temporary (machine_mode mode
, rtx x
, rtx value
)
2071 if (can_create_pseudo_p ())
2072 return force_reg (mode
, value
);
2076 aarch64_emit_move (x
, value
);
2081 /* Return true if we can move VALUE into a register using a single
2082 CNT[BHWD] instruction. */
2085 aarch64_sve_cnt_immediate_p (poly_int64 value
)
2087 HOST_WIDE_INT factor
= value
.coeffs
[0];
2088 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2089 return (value
.coeffs
[1] == factor
2090 && IN_RANGE (factor
, 2, 16 * 16)
2091 && (factor
& 1) == 0
2092 && factor
<= 16 * (factor
& -factor
));
2095 /* Likewise for rtx X. */
2098 aarch64_sve_cnt_immediate_p (rtx x
)
2101 return poly_int_rtx_p (x
, &value
) && aarch64_sve_cnt_immediate_p (value
);
2104 /* Return the asm string for an instruction with a CNT-like vector size
2105 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2106 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2107 first part of the operands template (the part that comes before the
2108 vector size itself). FACTOR is the number of quadwords.
2109 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2110 If it is zero, we can use any element size. */
2113 aarch64_output_sve_cnt_immediate (const char *prefix
, const char *operands
,
2114 unsigned int factor
,
2115 unsigned int nelts_per_vq
)
2117 static char buffer
[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2119 if (nelts_per_vq
== 0)
2120 /* There is some overlap in the ranges of the four CNT instructions.
2121 Here we always use the smallest possible element size, so that the
2122 multiplier is 1 whereever possible. */
2123 nelts_per_vq
= factor
& -factor
;
2124 int shift
= std::min (exact_log2 (nelts_per_vq
), 4);
2125 gcc_assert (IN_RANGE (shift
, 1, 4));
2126 char suffix
= "dwhb"[shift
- 1];
2129 unsigned int written
;
2131 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s",
2132 prefix
, suffix
, operands
);
2134 written
= snprintf (buffer
, sizeof (buffer
), "%s%c\t%s, all, mul #%d",
2135 prefix
, suffix
, operands
, factor
);
2136 gcc_assert (written
< sizeof (buffer
));
2140 /* Return the asm string for an instruction with a CNT-like vector size
2141 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2142 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2143 first part of the operands template (the part that comes before the
2144 vector size itself). X is the value of the vector size operand,
2145 as a polynomial integer rtx. */
2148 aarch64_output_sve_cnt_immediate (const char *prefix
, const char *operands
,
2151 poly_int64 value
= rtx_to_poly_int64 (x
);
2152 gcc_assert (aarch64_sve_cnt_immediate_p (value
));
2153 return aarch64_output_sve_cnt_immediate (prefix
, operands
,
2154 value
.coeffs
[1], 0);
2157 /* Return true if we can add VALUE to a register using a single ADDVL
2158 or ADDPL instruction. */
2161 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value
)
2163 HOST_WIDE_INT factor
= value
.coeffs
[0];
2164 if (factor
== 0 || value
.coeffs
[1] != factor
)
2166 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2167 and a value of 16 is one vector width. */
2168 return (((factor
& 15) == 0 && IN_RANGE (factor
, -32 * 16, 31 * 16))
2169 || ((factor
& 1) == 0 && IN_RANGE (factor
, -32 * 2, 31 * 2)));
2172 /* Likewise for rtx X. */
2175 aarch64_sve_addvl_addpl_immediate_p (rtx x
)
2178 return (poly_int_rtx_p (x
, &value
)
2179 && aarch64_sve_addvl_addpl_immediate_p (value
));
2182 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2183 and storing the result in operand 0. */
2186 aarch64_output_sve_addvl_addpl (rtx dest
, rtx base
, rtx offset
)
2188 static char buffer
[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2189 poly_int64 offset_value
= rtx_to_poly_int64 (offset
);
2190 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value
));
2192 /* Use INC or DEC if possible. */
2193 if (rtx_equal_p (dest
, base
) && GP_REGNUM_P (REGNO (dest
)))
2195 if (aarch64_sve_cnt_immediate_p (offset_value
))
2196 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2197 offset_value
.coeffs
[1], 0);
2198 if (aarch64_sve_cnt_immediate_p (-offset_value
))
2199 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2200 -offset_value
.coeffs
[1], 0);
2203 int factor
= offset_value
.coeffs
[1];
2204 if ((factor
& 15) == 0)
2205 snprintf (buffer
, sizeof (buffer
), "addvl\t%%x0, %%x1, #%d", factor
/ 16);
2207 snprintf (buffer
, sizeof (buffer
), "addpl\t%%x0, %%x1, #%d", factor
/ 2);
2211 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2212 instruction. If it is, store the number of elements in each vector
2213 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2214 factor in *FACTOR_OUT (if nonnull). */
2217 aarch64_sve_inc_dec_immediate_p (rtx x
, int *factor_out
,
2218 unsigned int *nelts_per_vq_out
)
2223 if (!const_vec_duplicate_p (x
, &elt
)
2224 || !poly_int_rtx_p (elt
, &value
))
2227 unsigned int nelts_per_vq
= 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x
));
2228 if (nelts_per_vq
!= 8 && nelts_per_vq
!= 4 && nelts_per_vq
!= 2)
2229 /* There's no vector INCB. */
2232 HOST_WIDE_INT factor
= value
.coeffs
[0];
2233 if (value
.coeffs
[1] != factor
)
2236 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2237 if ((factor
% nelts_per_vq
) != 0
2238 || !IN_RANGE (abs (factor
), nelts_per_vq
, 16 * nelts_per_vq
))
2242 *factor_out
= factor
;
2243 if (nelts_per_vq_out
)
2244 *nelts_per_vq_out
= nelts_per_vq
;
2248 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2252 aarch64_sve_inc_dec_immediate_p (rtx x
)
2254 return aarch64_sve_inc_dec_immediate_p (x
, NULL
, NULL
);
2257 /* Return the asm template for an SVE vector INC or DEC instruction.
2258 OPERANDS gives the operands before the vector count and X is the
2259 value of the vector count operand itself. */
2262 aarch64_output_sve_inc_dec_immediate (const char *operands
, rtx x
)
2265 unsigned int nelts_per_vq
;
2266 if (!aarch64_sve_inc_dec_immediate_p (x
, &factor
, &nelts_per_vq
))
2269 return aarch64_output_sve_cnt_immediate ("dec", operands
, -factor
,
2272 return aarch64_output_sve_cnt_immediate ("inc", operands
, factor
,
2277 aarch64_internal_mov_immediate (rtx dest
, rtx imm
, bool generate
,
2278 scalar_int_mode mode
)
2281 unsigned HOST_WIDE_INT val
, val2
, mask
;
2282 int one_match
, zero_match
;
2287 if (aarch64_move_imm (val
, mode
))
2290 emit_insn (gen_rtx_SET (dest
, imm
));
2294 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2295 (with XXXX non-zero). In that case check to see if the move can be done in
2297 val2
= val
& 0xffffffff;
2299 && aarch64_move_imm (val2
, SImode
)
2300 && (((val
>> 32) & 0xffff) == 0 || (val
>> 48) == 0))
2303 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
2305 /* Check if we have to emit a second instruction by checking to see
2306 if any of the upper 32 bits of the original DI mode value is set. */
2310 i
= (val
>> 48) ? 48 : 32;
2313 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
2314 GEN_INT ((val
>> i
) & 0xffff)));
2319 if ((val
>> 32) == 0 || mode
== SImode
)
2323 emit_insn (gen_rtx_SET (dest
, GEN_INT (val
& 0xffff)));
2325 emit_insn (gen_insv_immsi (dest
, GEN_INT (16),
2326 GEN_INT ((val
>> 16) & 0xffff)));
2328 emit_insn (gen_insv_immdi (dest
, GEN_INT (16),
2329 GEN_INT ((val
>> 16) & 0xffff)));
2334 /* Remaining cases are all for DImode. */
2337 zero_match
= ((val
& mask
) == 0) + ((val
& (mask
<< 16)) == 0) +
2338 ((val
& (mask
<< 32)) == 0) + ((val
& (mask
<< 48)) == 0);
2339 one_match
= ((~val
& mask
) == 0) + ((~val
& (mask
<< 16)) == 0) +
2340 ((~val
& (mask
<< 32)) == 0) + ((~val
& (mask
<< 48)) == 0);
2342 if (zero_match
!= 2 && one_match
!= 2)
2344 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2345 For a 64-bit bitmask try whether changing 16 bits to all ones or
2346 zeroes creates a valid bitmask. To check any repeated bitmask,
2347 try using 16 bits from the other 32-bit half of val. */
2349 for (i
= 0; i
< 64; i
+= 16, mask
<<= 16)
2352 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
2355 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
2357 val2
= val2
& ~mask
;
2358 val2
= val2
| (((val2
>> 32) | (val2
<< 32)) & mask
);
2359 if (val2
!= val
&& aarch64_bitmask_imm (val2
, mode
))
2366 emit_insn (gen_rtx_SET (dest
, GEN_INT (val2
)));
2367 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
2368 GEN_INT ((val
>> i
) & 0xffff)));
2374 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2375 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2376 otherwise skip zero bits. */
2380 val2
= one_match
> zero_match
? ~val
: val
;
2381 i
= (val2
& mask
) != 0 ? 0 : (val2
& (mask
<< 16)) != 0 ? 16 : 32;
2384 emit_insn (gen_rtx_SET (dest
, GEN_INT (one_match
> zero_match
2385 ? (val
| ~(mask
<< i
))
2386 : (val
& (mask
<< i
)))));
2387 for (i
+= 16; i
< 64; i
+= 16)
2389 if ((val2
& (mask
<< i
)) == 0)
2392 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
2393 GEN_INT ((val
>> i
) & 0xffff)));
2400 /* Return whether imm is a 128-bit immediate which is simple enough to
2403 aarch64_mov128_immediate (rtx imm
)
2405 if (GET_CODE (imm
) == CONST_INT
)
2408 gcc_assert (CONST_WIDE_INT_NUNITS (imm
) == 2);
2410 rtx lo
= GEN_INT (CONST_WIDE_INT_ELT (imm
, 0));
2411 rtx hi
= GEN_INT (CONST_WIDE_INT_ELT (imm
, 1));
2413 return aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, DImode
)
2414 + aarch64_internal_mov_immediate (NULL_RTX
, hi
, false, DImode
) <= 4;
2418 /* Return the number of temporary registers that aarch64_add_offset_1
2419 would need to add OFFSET to a register. */
2422 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset
)
2424 return abs_hwi (offset
) < 0x1000000 ? 0 : 1;
2427 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2428 a non-polynomial OFFSET. MODE is the mode of the addition.
2429 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2430 be set and CFA adjustments added to the generated instructions.
2432 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2433 temporary if register allocation is already complete. This temporary
2434 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2435 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2436 the immediate again.
2438 Since this function may be used to adjust the stack pointer, we must
2439 ensure that it cannot cause transient stack deallocation (for example
2440 by first incrementing SP and then decrementing when adjusting by a
2441 large immediate). */
2444 aarch64_add_offset_1 (scalar_int_mode mode
, rtx dest
,
2445 rtx src
, HOST_WIDE_INT offset
, rtx temp1
,
2446 bool frame_related_p
, bool emit_move_imm
)
2448 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
2449 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
2451 HOST_WIDE_INT moffset
= abs_hwi (offset
);
2456 if (!rtx_equal_p (dest
, src
))
2458 insn
= emit_insn (gen_rtx_SET (dest
, src
));
2459 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2464 /* Single instruction adjustment. */
2465 if (aarch64_uimm12_shift (moffset
))
2467 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (offset
)));
2468 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2472 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2475 a) the offset cannot be loaded by a 16-bit move or
2476 b) there is no spare register into which we can move it. */
2477 if (moffset
< 0x1000000
2478 && ((!temp1
&& !can_create_pseudo_p ())
2479 || !aarch64_move_imm (moffset
, mode
)))
2481 HOST_WIDE_INT low_off
= moffset
& 0xfff;
2483 low_off
= offset
< 0 ? -low_off
: low_off
;
2484 insn
= emit_insn (gen_add3_insn (dest
, src
, GEN_INT (low_off
)));
2485 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2486 insn
= emit_insn (gen_add2_insn (dest
, GEN_INT (offset
- low_off
)));
2487 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2491 /* Emit a move immediate if required and an addition/subtraction. */
2494 gcc_assert (temp1
!= NULL_RTX
|| can_create_pseudo_p ());
2495 temp1
= aarch64_force_temporary (mode
, temp1
, GEN_INT (moffset
));
2497 insn
= emit_insn (offset
< 0
2498 ? gen_sub3_insn (dest
, src
, temp1
)
2499 : gen_add3_insn (dest
, src
, temp1
));
2500 if (frame_related_p
)
2502 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2503 rtx adj
= plus_constant (mode
, src
, offset
);
2504 add_reg_note (insn
, REG_CFA_ADJUST_CFA
, gen_rtx_SET (dest
, adj
));
2508 /* Return the number of temporary registers that aarch64_add_offset
2509 would need to move OFFSET into a register or add OFFSET to a register;
2510 ADD_P is true if we want the latter rather than the former. */
2513 aarch64_offset_temporaries (bool add_p
, poly_int64 offset
)
2515 /* This follows the same structure as aarch64_add_offset. */
2516 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
2519 unsigned int count
= 0;
2520 HOST_WIDE_INT factor
= offset
.coeffs
[1];
2521 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
2522 poly_int64
poly_offset (factor
, factor
);
2523 if (add_p
&& aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
2524 /* Need one register for the ADDVL/ADDPL result. */
2526 else if (factor
!= 0)
2528 factor
= abs (factor
);
2529 if (factor
> 16 * (factor
& -factor
))
2530 /* Need one register for the CNT result and one for the multiplication
2531 factor. If necessary, the second temporary can be reused for the
2532 constant part of the offset. */
2534 /* Need one register for the CNT result (which might then
2538 return count
+ aarch64_add_offset_1_temporaries (constant
);
2541 /* If X can be represented as a poly_int64, return the number
2542 of temporaries that are required to add it to a register.
2543 Return -1 otherwise. */
2546 aarch64_add_offset_temporaries (rtx x
)
2549 if (!poly_int_rtx_p (x
, &offset
))
2551 return aarch64_offset_temporaries (true, offset
);
2554 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2555 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2556 be set and CFA adjustments added to the generated instructions.
2558 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2559 temporary if register allocation is already complete. This temporary
2560 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2561 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2562 false to avoid emitting the immediate again.
2564 TEMP2, if nonnull, is a second temporary register that doesn't
2565 overlap either DEST or REG.
2567 Since this function may be used to adjust the stack pointer, we must
2568 ensure that it cannot cause transient stack deallocation (for example
2569 by first incrementing SP and then decrementing when adjusting by a
2570 large immediate). */
2573 aarch64_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
2574 poly_int64 offset
, rtx temp1
, rtx temp2
,
2575 bool frame_related_p
, bool emit_move_imm
= true)
2577 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
2578 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
2579 gcc_assert (temp1
== NULL_RTX
2581 || !reg_overlap_mentioned_p (temp1
, dest
));
2582 gcc_assert (temp2
== NULL_RTX
|| !reg_overlap_mentioned_p (dest
, temp2
));
2584 /* Try using ADDVL or ADDPL to add the whole value. */
2585 if (src
!= const0_rtx
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
2587 rtx offset_rtx
= gen_int_mode (offset
, mode
);
2588 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
2589 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
2593 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2594 SVE vector register, over and above the minimum size of 128 bits.
2595 This is equivalent to half the value returned by CNTD with a
2596 vector shape of ALL. */
2597 HOST_WIDE_INT factor
= offset
.coeffs
[1];
2598 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
2600 /* Try using ADDVL or ADDPL to add the VG-based part. */
2601 poly_int64
poly_offset (factor
, factor
);
2602 if (src
!= const0_rtx
2603 && aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
2605 rtx offset_rtx
= gen_int_mode (poly_offset
, mode
);
2606 if (frame_related_p
)
2608 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
2609 RTX_FRAME_RELATED_P (insn
) = true;
2614 rtx addr
= gen_rtx_PLUS (mode
, src
, offset_rtx
);
2615 src
= aarch64_force_temporary (mode
, temp1
, addr
);
2620 /* Otherwise use a CNT-based sequence. */
2621 else if (factor
!= 0)
2623 /* Use a subtraction if we have a negative factor. */
2624 rtx_code code
= PLUS
;
2631 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2632 into the multiplication. */
2636 /* Use a right shift by 1. */
2640 HOST_WIDE_INT low_bit
= factor
& -factor
;
2641 if (factor
<= 16 * low_bit
)
2643 if (factor
> 16 * 8)
2645 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2646 the value with the minimum multiplier and shift it into
2648 int extra_shift
= exact_log2 (low_bit
);
2649 shift
+= extra_shift
;
2650 factor
>>= extra_shift
;
2652 val
= gen_int_mode (poly_int64 (factor
* 2, factor
* 2), mode
);
2656 /* Use CNTD, then multiply it by FACTOR. */
2657 val
= gen_int_mode (poly_int64 (2, 2), mode
);
2658 val
= aarch64_force_temporary (mode
, temp1
, val
);
2660 /* Go back to using a negative multiplication factor if we have
2661 no register from which to subtract. */
2662 if (code
== MINUS
&& src
== const0_rtx
)
2667 rtx coeff1
= gen_int_mode (factor
, mode
);
2668 coeff1
= aarch64_force_temporary (mode
, temp2
, coeff1
);
2669 val
= gen_rtx_MULT (mode
, val
, coeff1
);
2674 /* Multiply by 1 << SHIFT. */
2675 val
= aarch64_force_temporary (mode
, temp1
, val
);
2676 val
= gen_rtx_ASHIFT (mode
, val
, GEN_INT (shift
));
2678 else if (shift
== -1)
2681 val
= aarch64_force_temporary (mode
, temp1
, val
);
2682 val
= gen_rtx_ASHIFTRT (mode
, val
, const1_rtx
);
2685 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2686 if (src
!= const0_rtx
)
2688 val
= aarch64_force_temporary (mode
, temp1
, val
);
2689 val
= gen_rtx_fmt_ee (code
, mode
, src
, val
);
2691 else if (code
== MINUS
)
2693 val
= aarch64_force_temporary (mode
, temp1
, val
);
2694 val
= gen_rtx_NEG (mode
, val
);
2697 if (constant
== 0 || frame_related_p
)
2699 rtx_insn
*insn
= emit_insn (gen_rtx_SET (dest
, val
));
2700 if (frame_related_p
)
2702 RTX_FRAME_RELATED_P (insn
) = true;
2703 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
2704 gen_rtx_SET (dest
, plus_constant (Pmode
, src
,
2713 src
= aarch64_force_temporary (mode
, temp1
, val
);
2718 emit_move_imm
= true;
2721 aarch64_add_offset_1 (mode
, dest
, src
, constant
, temp1
,
2722 frame_related_p
, emit_move_imm
);
2725 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2726 than a poly_int64. */
2729 aarch64_split_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
2730 rtx offset_rtx
, rtx temp1
, rtx temp2
)
2732 aarch64_add_offset (mode
, dest
, src
, rtx_to_poly_int64 (offset_rtx
),
2733 temp1
, temp2
, false);
2736 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2737 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
2738 if TEMP1 already contains abs (DELTA). */
2741 aarch64_add_sp (rtx temp1
, rtx temp2
, poly_int64 delta
, bool emit_move_imm
)
2743 aarch64_add_offset (Pmode
, stack_pointer_rtx
, stack_pointer_rtx
, delta
,
2744 temp1
, temp2
, true, emit_move_imm
);
2747 /* Subtract DELTA from the stack pointer, marking the instructions
2748 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
2752 aarch64_sub_sp (rtx temp1
, rtx temp2
, poly_int64 delta
, bool frame_related_p
)
2754 aarch64_add_offset (Pmode
, stack_pointer_rtx
, stack_pointer_rtx
, -delta
,
2755 temp1
, temp2
, frame_related_p
);
2758 /* Set DEST to (vec_series BASE STEP). */
2761 aarch64_expand_vec_series (rtx dest
, rtx base
, rtx step
)
2763 machine_mode mode
= GET_MODE (dest
);
2764 scalar_mode inner
= GET_MODE_INNER (mode
);
2766 /* Each operand can be a register or an immediate in the range [-16, 15]. */
2767 if (!aarch64_sve_index_immediate_p (base
))
2768 base
= force_reg (inner
, base
);
2769 if (!aarch64_sve_index_immediate_p (step
))
2770 step
= force_reg (inner
, step
);
2772 emit_set_insn (dest
, gen_rtx_VEC_SERIES (mode
, base
, step
));
2775 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2776 integer of mode INT_MODE. Return true on success. */
2779 aarch64_expand_sve_widened_duplicate (rtx dest
, scalar_int_mode src_mode
,
2782 /* If the constant is smaller than 128 bits, we can do the move
2783 using a vector of SRC_MODEs. */
2784 if (src_mode
!= TImode
)
2786 poly_uint64 count
= exact_div (GET_MODE_SIZE (GET_MODE (dest
)),
2787 GET_MODE_SIZE (src_mode
));
2788 machine_mode dup_mode
= mode_for_vector (src_mode
, count
).require ();
2789 emit_move_insn (gen_lowpart (dup_mode
, dest
),
2790 gen_const_vec_duplicate (dup_mode
, src
));
2794 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
2795 src
= force_const_mem (src_mode
, src
);
2799 /* Make sure that the address is legitimate. */
2800 if (!aarch64_sve_ld1r_operand_p (src
))
2802 rtx addr
= force_reg (Pmode
, XEXP (src
, 0));
2803 src
= replace_equiv_address (src
, addr
);
2806 machine_mode mode
= GET_MODE (dest
);
2807 unsigned int elem_bytes
= GET_MODE_UNIT_SIZE (mode
);
2808 machine_mode pred_mode
= aarch64_sve_pred_mode (elem_bytes
).require ();
2809 rtx ptrue
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
2810 src
= gen_rtx_UNSPEC (mode
, gen_rtvec (2, ptrue
, src
), UNSPEC_LD1RQ
);
2811 emit_insn (gen_rtx_SET (dest
, src
));
2815 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2816 isn't a simple duplicate or series. */
2819 aarch64_expand_sve_const_vector (rtx dest
, rtx src
)
2821 machine_mode mode
= GET_MODE (src
);
2822 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (src
);
2823 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (src
);
2824 gcc_assert (npatterns
> 1);
2826 if (nelts_per_pattern
== 1)
2828 /* The constant is a repeating seqeuence of at least two elements,
2829 where the repeating elements occupy no more than 128 bits.
2830 Get an integer representation of the replicated value. */
2831 scalar_int_mode int_mode
;
2832 if (BYTES_BIG_ENDIAN
)
2833 /* For now, always use LD1RQ to load the value on big-endian
2834 targets, since the handling of smaller integers includes a
2835 subreg that is semantically an element reverse. */
2839 unsigned int int_bits
= GET_MODE_UNIT_BITSIZE (mode
) * npatterns
;
2840 gcc_assert (int_bits
<= 128);
2841 int_mode
= int_mode_for_size (int_bits
, 0).require ();
2843 rtx int_value
= simplify_gen_subreg (int_mode
, src
, mode
, 0);
2845 && aarch64_expand_sve_widened_duplicate (dest
, int_mode
, int_value
))
2849 /* Expand each pattern individually. */
2850 rtx_vector_builder builder
;
2851 auto_vec
<rtx
, 16> vectors (npatterns
);
2852 for (unsigned int i
= 0; i
< npatterns
; ++i
)
2854 builder
.new_vector (mode
, 1, nelts_per_pattern
);
2855 for (unsigned int j
= 0; j
< nelts_per_pattern
; ++j
)
2856 builder
.quick_push (CONST_VECTOR_ELT (src
, i
+ j
* npatterns
));
2857 vectors
.quick_push (force_reg (mode
, builder
.build ()));
2860 /* Use permutes to interleave the separate vectors. */
2861 while (npatterns
> 1)
2864 for (unsigned int i
= 0; i
< npatterns
; ++i
)
2866 rtx tmp
= (npatterns
== 1 ? dest
: gen_reg_rtx (mode
));
2867 rtvec v
= gen_rtvec (2, vectors
[i
], vectors
[i
+ npatterns
]);
2868 emit_set_insn (tmp
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
2872 gcc_assert (vectors
[0] == dest
);
2875 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
2876 is a pattern that can be used to set DEST to a replicated scalar
2880 aarch64_expand_mov_immediate (rtx dest
, rtx imm
,
2881 rtx (*gen_vec_duplicate
) (rtx
, rtx
))
2883 machine_mode mode
= GET_MODE (dest
);
2885 /* Check on what type of symbol it is. */
2886 scalar_int_mode int_mode
;
2887 if ((GET_CODE (imm
) == SYMBOL_REF
2888 || GET_CODE (imm
) == LABEL_REF
2889 || GET_CODE (imm
) == CONST
2890 || GET_CODE (imm
) == CONST_POLY_INT
)
2891 && is_a
<scalar_int_mode
> (mode
, &int_mode
))
2895 HOST_WIDE_INT const_offset
;
2896 enum aarch64_symbol_type sty
;
2898 /* If we have (const (plus symbol offset)), separate out the offset
2899 before we start classifying the symbol. */
2900 rtx base
= strip_offset (imm
, &offset
);
2902 /* We must always add an offset involving VL separately, rather than
2903 folding it into the relocation. */
2904 if (!offset
.is_constant (&const_offset
))
2906 if (base
== const0_rtx
&& aarch64_sve_cnt_immediate_p (offset
))
2907 emit_insn (gen_rtx_SET (dest
, imm
));
2910 /* Do arithmetic on 32-bit values if the result is smaller
2912 if (partial_subreg_p (int_mode
, SImode
))
2914 /* It is invalid to do symbol calculations in modes
2915 narrower than SImode. */
2916 gcc_assert (base
== const0_rtx
);
2917 dest
= gen_lowpart (SImode
, dest
);
2920 if (base
!= const0_rtx
)
2922 base
= aarch64_force_temporary (int_mode
, dest
, base
);
2923 aarch64_add_offset (int_mode
, dest
, base
, offset
,
2924 NULL_RTX
, NULL_RTX
, false);
2927 aarch64_add_offset (int_mode
, dest
, base
, offset
,
2928 dest
, NULL_RTX
, false);
2933 sty
= aarch64_classify_symbol (base
, const_offset
);
2936 case SYMBOL_FORCE_TO_MEM
:
2937 if (const_offset
!= 0
2938 && targetm
.cannot_force_const_mem (int_mode
, imm
))
2940 gcc_assert (can_create_pseudo_p ());
2941 base
= aarch64_force_temporary (int_mode
, dest
, base
);
2942 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
2943 NULL_RTX
, NULL_RTX
, false);
2947 mem
= force_const_mem (ptr_mode
, imm
);
2950 /* If we aren't generating PC relative literals, then
2951 we need to expand the literal pool access carefully.
2952 This is something that needs to be done in a number
2953 of places, so could well live as a separate function. */
2954 if (!aarch64_pcrelative_literal_loads
)
2956 gcc_assert (can_create_pseudo_p ());
2957 base
= gen_reg_rtx (ptr_mode
);
2958 aarch64_expand_mov_immediate (base
, XEXP (mem
, 0));
2959 if (ptr_mode
!= Pmode
)
2960 base
= convert_memory_address (Pmode
, base
);
2961 mem
= gen_rtx_MEM (ptr_mode
, base
);
2964 if (int_mode
!= ptr_mode
)
2965 mem
= gen_rtx_ZERO_EXTEND (int_mode
, mem
);
2967 emit_insn (gen_rtx_SET (dest
, mem
));
2971 case SYMBOL_SMALL_TLSGD
:
2972 case SYMBOL_SMALL_TLSDESC
:
2973 case SYMBOL_SMALL_TLSIE
:
2974 case SYMBOL_SMALL_GOT_28K
:
2975 case SYMBOL_SMALL_GOT_4G
:
2976 case SYMBOL_TINY_GOT
:
2977 case SYMBOL_TINY_TLSIE
:
2978 if (const_offset
!= 0)
2980 gcc_assert(can_create_pseudo_p ());
2981 base
= aarch64_force_temporary (int_mode
, dest
, base
);
2982 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
2983 NULL_RTX
, NULL_RTX
, false);
2988 case SYMBOL_SMALL_ABSOLUTE
:
2989 case SYMBOL_TINY_ABSOLUTE
:
2990 case SYMBOL_TLSLE12
:
2991 case SYMBOL_TLSLE24
:
2992 case SYMBOL_TLSLE32
:
2993 case SYMBOL_TLSLE48
:
2994 aarch64_load_symref_appropriately (dest
, imm
, sty
);
3002 if (!CONST_INT_P (imm
))
3004 rtx base
, step
, value
;
3005 if (GET_CODE (imm
) == HIGH
3006 || aarch64_simd_valid_immediate (imm
, NULL
))
3007 emit_insn (gen_rtx_SET (dest
, imm
));
3008 else if (const_vec_series_p (imm
, &base
, &step
))
3009 aarch64_expand_vec_series (dest
, base
, step
);
3010 else if (const_vec_duplicate_p (imm
, &value
))
3012 /* If the constant is out of range of an SVE vector move,
3013 load it from memory if we can, otherwise move it into
3014 a register and use a DUP. */
3015 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
3016 rtx op
= force_const_mem (inner_mode
, value
);
3018 op
= force_reg (inner_mode
, value
);
3019 else if (!aarch64_sve_ld1r_operand_p (op
))
3021 rtx addr
= force_reg (Pmode
, XEXP (op
, 0));
3022 op
= replace_equiv_address (op
, addr
);
3024 emit_insn (gen_vec_duplicate (dest
, op
));
3026 else if (GET_CODE (imm
) == CONST_VECTOR
3027 && !GET_MODE_NUNITS (GET_MODE (imm
)).is_constant ())
3028 aarch64_expand_sve_const_vector (dest
, imm
);
3031 rtx mem
= force_const_mem (mode
, imm
);
3033 emit_move_insn (dest
, mem
);
3039 aarch64_internal_mov_immediate (dest
, imm
, true,
3040 as_a
<scalar_int_mode
> (mode
));
3043 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3044 that is known to contain PTRUE. */
3047 aarch64_emit_sve_pred_move (rtx dest
, rtx pred
, rtx src
)
3049 emit_insn (gen_rtx_SET (dest
, gen_rtx_UNSPEC (GET_MODE (dest
),
3050 gen_rtvec (2, pred
, src
),
3051 UNSPEC_MERGE_PTRUE
)));
3054 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3055 operand is in memory. In this case we need to use the predicated LD1
3056 and ST1 instead of LDR and STR, both for correctness on big-endian
3057 targets and because LD1 and ST1 support a wider range of addressing modes.
3058 PRED_MODE is the mode of the predicate.
3060 See the comment at the head of aarch64-sve.md for details about the
3061 big-endian handling. */
3064 aarch64_expand_sve_mem_move (rtx dest
, rtx src
, machine_mode pred_mode
)
3066 machine_mode mode
= GET_MODE (dest
);
3067 rtx ptrue
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
3068 if (!register_operand (src
, mode
)
3069 && !register_operand (dest
, mode
))
3071 rtx tmp
= gen_reg_rtx (mode
);
3073 aarch64_emit_sve_pred_move (tmp
, ptrue
, src
);
3075 emit_move_insn (tmp
, src
);
3078 aarch64_emit_sve_pred_move (dest
, ptrue
, src
);
3081 /* Called only on big-endian targets. See whether an SVE vector move
3082 from SRC to DEST is effectively a REV[BHW] instruction, because at
3083 least one operand is a subreg of an SVE vector that has wider or
3084 narrower elements. Return true and emit the instruction if so.
3088 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3090 represents a VIEW_CONVERT between the following vectors, viewed
3093 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3094 R1: { [0], [1], [2], [3], ... }
3096 The high part of lane X in R2 should therefore correspond to lane X*2
3097 of R1, but the register representations are:
3100 R2: ...... [1].high [1].low [0].high [0].low
3101 R1: ...... [3] [2] [1] [0]
3103 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3104 We therefore need a reverse operation to swap the high and low values
3107 This is purely an optimization. Without it we would spill the
3108 subreg operand to the stack in one mode and reload it in the
3109 other mode, which has the same effect as the REV. */
3112 aarch64_maybe_expand_sve_subreg_move (rtx dest
, rtx src
)
3114 gcc_assert (BYTES_BIG_ENDIAN
);
3115 if (GET_CODE (dest
) == SUBREG
)
3116 dest
= SUBREG_REG (dest
);
3117 if (GET_CODE (src
) == SUBREG
)
3118 src
= SUBREG_REG (src
);
3120 /* The optimization handles two single SVE REGs with different element
3124 || aarch64_classify_vector_mode (GET_MODE (dest
)) != VEC_SVE_DATA
3125 || aarch64_classify_vector_mode (GET_MODE (src
)) != VEC_SVE_DATA
3126 || (GET_MODE_UNIT_SIZE (GET_MODE (dest
))
3127 == GET_MODE_UNIT_SIZE (GET_MODE (src
))))
3130 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3131 rtx ptrue
= force_reg (VNx16BImode
, CONSTM1_RTX (VNx16BImode
));
3132 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (dest
), gen_rtvec (2, ptrue
, src
),
3134 emit_insn (gen_rtx_SET (dest
, unspec
));
3138 /* Return a copy of X with mode MODE, without changing its other
3139 attributes. Unlike gen_lowpart, this doesn't care whether the
3140 mode change is valid. */
3143 aarch64_replace_reg_mode (rtx x
, machine_mode mode
)
3145 if (GET_MODE (x
) == mode
)
3148 x
= shallow_copy_rtx (x
);
3149 set_mode_and_regno (x
, mode
, REGNO (x
));
3153 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3157 aarch64_split_sve_subreg_move (rtx dest
, rtx ptrue
, rtx src
)
3159 /* Decide which REV operation we need. The mode with narrower elements
3160 determines the mode of the operands and the mode with the wider
3161 elements determines the reverse width. */
3162 machine_mode mode_with_wider_elts
= GET_MODE (dest
);
3163 machine_mode mode_with_narrower_elts
= GET_MODE (src
);
3164 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts
)
3165 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts
))
3166 std::swap (mode_with_wider_elts
, mode_with_narrower_elts
);
3168 unsigned int wider_bytes
= GET_MODE_UNIT_SIZE (mode_with_wider_elts
);
3169 unsigned int unspec
;
3170 if (wider_bytes
== 8)
3171 unspec
= UNSPEC_REV64
;
3172 else if (wider_bytes
== 4)
3173 unspec
= UNSPEC_REV32
;
3174 else if (wider_bytes
== 2)
3175 unspec
= UNSPEC_REV16
;
3178 machine_mode pred_mode
= aarch64_sve_pred_mode (wider_bytes
).require ();
3182 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3183 UNSPEC_MERGE_PTRUE))
3185 with the appropriate modes. */
3186 ptrue
= gen_lowpart (pred_mode
, ptrue
);
3187 dest
= aarch64_replace_reg_mode (dest
, mode_with_narrower_elts
);
3188 src
= aarch64_replace_reg_mode (src
, mode_with_narrower_elts
);
3189 src
= gen_rtx_UNSPEC (mode_with_narrower_elts
, gen_rtvec (1, src
), unspec
);
3190 src
= gen_rtx_UNSPEC (mode_with_narrower_elts
, gen_rtvec (2, ptrue
, src
),
3191 UNSPEC_MERGE_PTRUE
);
3192 emit_insn (gen_rtx_SET (dest
, src
));
3196 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED
,
3197 tree exp ATTRIBUTE_UNUSED
)
3199 /* Currently, always true. */
3203 /* Implement TARGET_PASS_BY_REFERENCE. */
3206 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED
,
3209 bool named ATTRIBUTE_UNUSED
)
3212 machine_mode dummymode
;
3215 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3216 if (mode
== BLKmode
&& type
)
3217 size
= int_size_in_bytes (type
);
3219 /* No frontends can create types with variable-sized modes, so we
3220 shouldn't be asked to pass or return them. */
3221 size
= GET_MODE_SIZE (mode
).to_constant ();
3223 /* Aggregates are passed by reference based on their size. */
3224 if (type
&& AGGREGATE_TYPE_P (type
))
3226 size
= int_size_in_bytes (type
);
3229 /* Variable sized arguments are always returned by reference. */
3233 /* Can this be a candidate to be passed in fp/simd register(s)? */
3234 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
3239 /* Arguments which are variable sized or larger than 2 registers are
3240 passed by reference unless they are a homogenous floating point
3242 return size
> 2 * UNITS_PER_WORD
;
3245 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3247 aarch64_return_in_msb (const_tree valtype
)
3249 machine_mode dummy_mode
;
3252 /* Never happens in little-endian mode. */
3253 if (!BYTES_BIG_ENDIAN
)
3256 /* Only composite types smaller than or equal to 16 bytes can
3257 be potentially returned in registers. */
3258 if (!aarch64_composite_type_p (valtype
, TYPE_MODE (valtype
))
3259 || int_size_in_bytes (valtype
) <= 0
3260 || int_size_in_bytes (valtype
) > 16)
3263 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3264 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3265 is always passed/returned in the least significant bits of fp/simd
3267 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype
), valtype
,
3268 &dummy_mode
, &dummy_int
, NULL
))
3274 /* Implement TARGET_FUNCTION_VALUE.
3275 Define how to find the value returned by a function. */
3278 aarch64_function_value (const_tree type
, const_tree func
,
3279 bool outgoing ATTRIBUTE_UNUSED
)
3284 machine_mode ag_mode
;
3286 mode
= TYPE_MODE (type
);
3287 if (INTEGRAL_TYPE_P (type
))
3288 mode
= promote_function_mode (type
, mode
, &unsignedp
, func
, 1);
3290 if (aarch64_return_in_msb (type
))
3292 HOST_WIDE_INT size
= int_size_in_bytes (type
);
3294 if (size
% UNITS_PER_WORD
!= 0)
3296 size
+= UNITS_PER_WORD
- size
% UNITS_PER_WORD
;
3297 mode
= int_mode_for_size (size
* BITS_PER_UNIT
, 0).require ();
3301 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
3302 &ag_mode
, &count
, NULL
))
3304 if (!aarch64_composite_type_p (type
, mode
))
3306 gcc_assert (count
== 1 && mode
== ag_mode
);
3307 return gen_rtx_REG (mode
, V0_REGNUM
);
3314 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (count
));
3315 for (i
= 0; i
< count
; i
++)
3317 rtx tmp
= gen_rtx_REG (ag_mode
, V0_REGNUM
+ i
);
3318 rtx offset
= gen_int_mode (i
* GET_MODE_SIZE (ag_mode
), Pmode
);
3319 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
3320 XVECEXP (par
, 0, i
) = tmp
;
3326 return gen_rtx_REG (mode
, R0_REGNUM
);
3329 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3330 Return true if REGNO is the number of a hard register in which the values
3331 of called function may come back. */
3334 aarch64_function_value_regno_p (const unsigned int regno
)
3336 /* Maximum of 16 bytes can be returned in the general registers. Examples
3337 of 16-byte return values are: 128-bit integers and 16-byte small
3338 structures (excluding homogeneous floating-point aggregates). */
3339 if (regno
== R0_REGNUM
|| regno
== R1_REGNUM
)
3342 /* Up to four fp/simd registers can return a function value, e.g. a
3343 homogeneous floating-point aggregate having four members. */
3344 if (regno
>= V0_REGNUM
&& regno
< V0_REGNUM
+ HA_MAX_NUM_FLDS
)
3345 return TARGET_FLOAT
;
3350 /* Implement TARGET_RETURN_IN_MEMORY.
3352 If the type T of the result of a function is such that
3354 would require that arg be passed as a value in a register (or set of
3355 registers) according to the parameter passing rules, then the result
3356 is returned in the same registers as would be used for such an
3360 aarch64_return_in_memory (const_tree type
, const_tree fndecl ATTRIBUTE_UNUSED
)
3363 machine_mode ag_mode
;
3366 if (!AGGREGATE_TYPE_P (type
)
3367 && TREE_CODE (type
) != COMPLEX_TYPE
3368 && TREE_CODE (type
) != VECTOR_TYPE
)
3369 /* Simple scalar types always returned in registers. */
3372 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
),
3379 /* Types larger than 2 registers returned in memory. */
3380 size
= int_size_in_bytes (type
);
3381 return (size
< 0 || size
> 2 * UNITS_PER_WORD
);
3385 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v
, machine_mode mode
,
3386 const_tree type
, int *nregs
)
3388 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
3389 return aarch64_vfp_is_call_or_return_candidate (mode
,
3391 &pcum
->aapcs_vfp_rmode
,
3396 /* Given MODE and TYPE of a function argument, return the alignment in
3397 bits. The idea is to suppress any stronger alignment requested by
3398 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3399 This is a helper function for local use only. */
3402 aarch64_function_arg_alignment (machine_mode mode
, const_tree type
)
3405 return GET_MODE_ALIGNMENT (mode
);
3407 if (integer_zerop (TYPE_SIZE (type
)))
3410 gcc_assert (TYPE_MODE (type
) == mode
);
3412 if (!AGGREGATE_TYPE_P (type
))
3413 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type
));
3415 if (TREE_CODE (type
) == ARRAY_TYPE
)
3416 return TYPE_ALIGN (TREE_TYPE (type
));
3418 unsigned int alignment
= 0;
3419 for (tree field
= TYPE_FIELDS (type
); field
; field
= DECL_CHAIN (field
))
3420 if (TREE_CODE (field
) == FIELD_DECL
)
3421 alignment
= std::max (alignment
, DECL_ALIGN (field
));
3426 /* Layout a function argument according to the AAPCS64 rules. The rule
3427 numbers refer to the rule numbers in the AAPCS64. */
3430 aarch64_layout_arg (cumulative_args_t pcum_v
, machine_mode mode
,
3432 bool named ATTRIBUTE_UNUSED
)
3434 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
3435 int ncrn
, nvrn
, nregs
;
3436 bool allocate_ncrn
, allocate_nvrn
;
3439 /* We need to do this once per argument. */
3440 if (pcum
->aapcs_arg_processed
)
3443 pcum
->aapcs_arg_processed
= true;
3445 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
3447 size
= int_size_in_bytes (type
);
3449 /* No frontends can create types with variable-sized modes, so we
3450 shouldn't be asked to pass or return them. */
3451 size
= GET_MODE_SIZE (mode
).to_constant ();
3452 size
= ROUND_UP (size
, UNITS_PER_WORD
);
3454 allocate_ncrn
= (type
) ? !(FLOAT_TYPE_P (type
)) : !FLOAT_MODE_P (mode
);
3455 allocate_nvrn
= aarch64_vfp_is_call_candidate (pcum_v
,
3460 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
3461 The following code thus handles passing by SIMD/FP registers first. */
3463 nvrn
= pcum
->aapcs_nvrn
;
3465 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
3466 and homogenous short-vector aggregates (HVA). */
3470 aarch64_err_no_fpadvsimd (mode
);
3472 if (nvrn
+ nregs
<= NUM_FP_ARG_REGS
)
3474 pcum
->aapcs_nextnvrn
= nvrn
+ nregs
;
3475 if (!aarch64_composite_type_p (type
, mode
))
3477 gcc_assert (nregs
== 1);
3478 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
3484 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
3485 for (i
= 0; i
< nregs
; i
++)
3487 rtx tmp
= gen_rtx_REG (pcum
->aapcs_vfp_rmode
,
3488 V0_REGNUM
+ nvrn
+ i
);
3489 rtx offset
= gen_int_mode
3490 (i
* GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
), Pmode
);
3491 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
3492 XVECEXP (par
, 0, i
) = tmp
;
3494 pcum
->aapcs_reg
= par
;
3500 /* C.3 NSRN is set to 8. */
3501 pcum
->aapcs_nextnvrn
= NUM_FP_ARG_REGS
;
3506 ncrn
= pcum
->aapcs_ncrn
;
3507 nregs
= size
/ UNITS_PER_WORD
;
3509 /* C6 - C9. though the sign and zero extension semantics are
3510 handled elsewhere. This is the case where the argument fits
3511 entirely general registers. */
3512 if (allocate_ncrn
&& (ncrn
+ nregs
<= NUM_ARG_REGS
))
3515 gcc_assert (nregs
== 0 || nregs
== 1 || nregs
== 2);
3517 /* C.8 if the argument has an alignment of 16 then the NGRN is
3518 rounded up to the next even number. */
3521 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3522 comparison is there because for > 16 * BITS_PER_UNIT
3523 alignment nregs should be > 2 and therefore it should be
3524 passed by reference rather than value. */
3525 && aarch64_function_arg_alignment (mode
, type
) == 16 * BITS_PER_UNIT
)
3528 gcc_assert (ncrn
+ nregs
<= NUM_ARG_REGS
);
3531 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3532 A reg is still generated for it, but the caller should be smart
3533 enough not to use it. */
3534 if (nregs
== 0 || nregs
== 1 || GET_MODE_CLASS (mode
) == MODE_INT
)
3535 pcum
->aapcs_reg
= gen_rtx_REG (mode
, R0_REGNUM
+ ncrn
);
3541 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
3542 for (i
= 0; i
< nregs
; i
++)
3544 rtx tmp
= gen_rtx_REG (word_mode
, R0_REGNUM
+ ncrn
+ i
);
3545 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
3546 GEN_INT (i
* UNITS_PER_WORD
));
3547 XVECEXP (par
, 0, i
) = tmp
;
3549 pcum
->aapcs_reg
= par
;
3552 pcum
->aapcs_nextncrn
= ncrn
+ nregs
;
3557 pcum
->aapcs_nextncrn
= NUM_ARG_REGS
;
3559 /* The argument is passed on stack; record the needed number of words for
3560 this argument and align the total size if necessary. */
3562 pcum
->aapcs_stack_words
= size
/ UNITS_PER_WORD
;
3564 if (aarch64_function_arg_alignment (mode
, type
) == 16 * BITS_PER_UNIT
)
3565 pcum
->aapcs_stack_size
= ROUND_UP (pcum
->aapcs_stack_size
,
3566 16 / UNITS_PER_WORD
);
3570 /* Implement TARGET_FUNCTION_ARG. */
3573 aarch64_function_arg (cumulative_args_t pcum_v
, machine_mode mode
,
3574 const_tree type
, bool named
)
3576 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
3577 gcc_assert (pcum
->pcs_variant
== ARM_PCS_AAPCS64
);
3579 if (mode
== VOIDmode
)
3582 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
3583 return pcum
->aapcs_reg
;
3587 aarch64_init_cumulative_args (CUMULATIVE_ARGS
*pcum
,
3588 const_tree fntype ATTRIBUTE_UNUSED
,
3589 rtx libname ATTRIBUTE_UNUSED
,
3590 const_tree fndecl ATTRIBUTE_UNUSED
,
3591 unsigned n_named ATTRIBUTE_UNUSED
)
3593 pcum
->aapcs_ncrn
= 0;
3594 pcum
->aapcs_nvrn
= 0;
3595 pcum
->aapcs_nextncrn
= 0;
3596 pcum
->aapcs_nextnvrn
= 0;
3597 pcum
->pcs_variant
= ARM_PCS_AAPCS64
;
3598 pcum
->aapcs_reg
= NULL_RTX
;
3599 pcum
->aapcs_arg_processed
= false;
3600 pcum
->aapcs_stack_words
= 0;
3601 pcum
->aapcs_stack_size
= 0;
3604 && fndecl
&& TREE_PUBLIC (fndecl
)
3605 && fntype
&& fntype
!= error_mark_node
)
3607 const_tree type
= TREE_TYPE (fntype
);
3608 machine_mode mode ATTRIBUTE_UNUSED
; /* To pass pointer as argument. */
3609 int nregs ATTRIBUTE_UNUSED
; /* Likewise. */
3610 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
), type
,
3611 &mode
, &nregs
, NULL
))
3612 aarch64_err_no_fpadvsimd (TYPE_MODE (type
));
3618 aarch64_function_arg_advance (cumulative_args_t pcum_v
,
3623 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
3624 if (pcum
->pcs_variant
== ARM_PCS_AAPCS64
)
3626 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
3627 gcc_assert ((pcum
->aapcs_reg
!= NULL_RTX
)
3628 != (pcum
->aapcs_stack_words
!= 0));
3629 pcum
->aapcs_arg_processed
= false;
3630 pcum
->aapcs_ncrn
= pcum
->aapcs_nextncrn
;
3631 pcum
->aapcs_nvrn
= pcum
->aapcs_nextnvrn
;
3632 pcum
->aapcs_stack_size
+= pcum
->aapcs_stack_words
;
3633 pcum
->aapcs_stack_words
= 0;
3634 pcum
->aapcs_reg
= NULL_RTX
;
3639 aarch64_function_arg_regno_p (unsigned regno
)
3641 return ((GP_REGNUM_P (regno
) && regno
< R0_REGNUM
+ NUM_ARG_REGS
)
3642 || (FP_REGNUM_P (regno
) && regno
< V0_REGNUM
+ NUM_FP_ARG_REGS
));
3645 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3646 PARM_BOUNDARY bits of alignment, but will be given anything up
3647 to STACK_BOUNDARY bits if the type requires it. This makes sure
3648 that both before and after the layout of each argument, the Next
3649 Stacked Argument Address (NSAA) will have a minimum alignment of
3653 aarch64_function_arg_boundary (machine_mode mode
, const_tree type
)
3655 unsigned int alignment
= aarch64_function_arg_alignment (mode
, type
);
3656 return MIN (MAX (alignment
, PARM_BOUNDARY
), STACK_BOUNDARY
);
3659 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3661 static fixed_size_mode
3662 aarch64_get_reg_raw_mode (int regno
)
3664 if (TARGET_SVE
&& FP_REGNUM_P (regno
))
3665 /* Don't use the SVE part of the register for __builtin_apply and
3666 __builtin_return. The SVE registers aren't used by the normal PCS,
3667 so using them there would be a waste of time. The PCS extensions
3668 for SVE types are fundamentally incompatible with the
3669 __builtin_return/__builtin_apply interface. */
3670 return as_a
<fixed_size_mode
> (V16QImode
);
3671 return default_get_reg_raw_mode (regno
);
3674 /* Implement TARGET_FUNCTION_ARG_PADDING.
3676 Small aggregate types are placed in the lowest memory address.
3678 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3680 static pad_direction
3681 aarch64_function_arg_padding (machine_mode mode
, const_tree type
)
3683 /* On little-endian targets, the least significant byte of every stack
3684 argument is passed at the lowest byte address of the stack slot. */
3685 if (!BYTES_BIG_ENDIAN
)
3688 /* Otherwise, integral, floating-point and pointer types are padded downward:
3689 the least significant byte of a stack argument is passed at the highest
3690 byte address of the stack slot. */
3692 ? (INTEGRAL_TYPE_P (type
) || SCALAR_FLOAT_TYPE_P (type
)
3693 || POINTER_TYPE_P (type
))
3694 : (SCALAR_INT_MODE_P (mode
) || SCALAR_FLOAT_MODE_P (mode
)))
3695 return PAD_DOWNWARD
;
3697 /* Everything else padded upward, i.e. data in first byte of stack slot. */
3701 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3703 It specifies padding for the last (may also be the only)
3704 element of a block move between registers and memory. If
3705 assuming the block is in the memory, padding upward means that
3706 the last element is padded after its highest significant byte,
3707 while in downward padding, the last element is padded at the
3708 its least significant byte side.
3710 Small aggregates and small complex types are always padded
3713 We don't need to worry about homogeneous floating-point or
3714 short-vector aggregates; their move is not affected by the
3715 padding direction determined here. Regardless of endianness,
3716 each element of such an aggregate is put in the least
3717 significant bits of a fp/simd register.
3719 Return !BYTES_BIG_ENDIAN if the least significant byte of the
3720 register has useful data, and return the opposite if the most
3721 significant byte does. */
3724 aarch64_pad_reg_upward (machine_mode mode
, const_tree type
,
3725 bool first ATTRIBUTE_UNUSED
)
3728 /* Small composite types are always padded upward. */
3729 if (BYTES_BIG_ENDIAN
&& aarch64_composite_type_p (type
, mode
))
3733 size
= int_size_in_bytes (type
);
3735 /* No frontends can create types with variable-sized modes, so we
3736 shouldn't be asked to pass or return them. */
3737 size
= GET_MODE_SIZE (mode
).to_constant ();
3738 if (size
< 2 * UNITS_PER_WORD
)
3742 /* Otherwise, use the default padding. */
3743 return !BYTES_BIG_ENDIAN
;
3746 static scalar_int_mode
3747 aarch64_libgcc_cmp_return_mode (void)
3752 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3754 /* We use the 12-bit shifted immediate arithmetic instructions so values
3755 must be multiple of (1 << 12), i.e. 4096. */
3756 #define ARITH_FACTOR 4096
3758 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3759 #error Cannot use simple address calculation for stack probing
3762 /* The pair of scratch registers used for stack probing. */
3763 #define PROBE_STACK_FIRST_REG 9
3764 #define PROBE_STACK_SECOND_REG 10
3766 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3767 inclusive. These are offsets from the current stack pointer. */
3770 aarch64_emit_probe_stack_range (HOST_WIDE_INT first
, poly_int64 poly_size
)
3773 if (!poly_size
.is_constant (&size
))
3775 sorry ("stack probes for SVE frames");
3779 rtx reg1
= gen_rtx_REG (Pmode
, PROBE_STACK_FIRST_REG
);
3781 /* See the same assertion on PROBE_INTERVAL above. */
3782 gcc_assert ((first
% ARITH_FACTOR
) == 0);
3784 /* See if we have a constant small number of probes to generate. If so,
3785 that's the easy case. */
3786 if (size
<= PROBE_INTERVAL
)
3788 const HOST_WIDE_INT base
= ROUND_UP (size
, ARITH_FACTOR
);
3790 emit_set_insn (reg1
,
3791 plus_constant (Pmode
,
3792 stack_pointer_rtx
, -(first
+ base
)));
3793 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- size
));
3796 /* The run-time loop is made up of 8 insns in the generic case while the
3797 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
3798 else if (size
<= 4 * PROBE_INTERVAL
)
3800 HOST_WIDE_INT i
, rem
;
3802 emit_set_insn (reg1
,
3803 plus_constant (Pmode
,
3805 -(first
+ PROBE_INTERVAL
)));
3806 emit_stack_probe (reg1
);
3808 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3809 it exceeds SIZE. If only two probes are needed, this will not
3810 generate any code. Then probe at FIRST + SIZE. */
3811 for (i
= 2 * PROBE_INTERVAL
; i
< size
; i
+= PROBE_INTERVAL
)
3813 emit_set_insn (reg1
,
3814 plus_constant (Pmode
, reg1
, -PROBE_INTERVAL
));
3815 emit_stack_probe (reg1
);
3818 rem
= size
- (i
- PROBE_INTERVAL
);
3821 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
3823 emit_set_insn (reg1
, plus_constant (Pmode
, reg1
, -base
));
3824 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- rem
));
3827 emit_stack_probe (plus_constant (Pmode
, reg1
, -rem
));
3830 /* Otherwise, do the same as above, but in a loop. Note that we must be
3831 extra careful with variables wrapping around because we might be at
3832 the very top (or the very bottom) of the address space and we have
3833 to be able to handle this case properly; in particular, we use an
3834 equality test for the loop condition. */
3837 rtx reg2
= gen_rtx_REG (Pmode
, PROBE_STACK_SECOND_REG
);
3839 /* Step 1: round SIZE to the previous multiple of the interval. */
3841 HOST_WIDE_INT rounded_size
= size
& -PROBE_INTERVAL
;
3844 /* Step 2: compute initial and final value of the loop counter. */
3846 /* TEST_ADDR = SP + FIRST. */
3847 emit_set_insn (reg1
,
3848 plus_constant (Pmode
, stack_pointer_rtx
, -first
));
3850 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
3851 HOST_WIDE_INT adjustment
= - (first
+ rounded_size
);
3852 if (! aarch64_uimm12_shift (adjustment
))
3854 aarch64_internal_mov_immediate (reg2
, GEN_INT (adjustment
),
3856 emit_set_insn (reg2
, gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, reg2
));
3859 emit_set_insn (reg2
,
3860 plus_constant (Pmode
, stack_pointer_rtx
, adjustment
));
3866 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3869 while (TEST_ADDR != LAST_ADDR)
3871 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3872 until it is equal to ROUNDED_SIZE. */
3874 emit_insn (gen_probe_stack_range (reg1
, reg1
, reg2
));
3877 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3878 that SIZE is equal to ROUNDED_SIZE. */
3880 if (size
!= rounded_size
)
3882 HOST_WIDE_INT rem
= size
- rounded_size
;
3886 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
3888 emit_set_insn (reg2
, plus_constant (Pmode
, reg2
, -base
));
3889 emit_stack_probe (plus_constant (Pmode
, reg2
, base
- rem
));
3892 emit_stack_probe (plus_constant (Pmode
, reg2
, -rem
));
3896 /* Make sure nothing is scheduled before we are done. */
3897 emit_insn (gen_blockage ());
3900 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
3901 absolute addresses. */
3904 aarch64_output_probe_stack_range (rtx reg1
, rtx reg2
)
3906 static int labelno
= 0;
3910 ASM_GENERATE_INTERNAL_LABEL (loop_lab
, "LPSRL", labelno
++);
3913 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file
, loop_lab
);
3915 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
3917 xops
[1] = GEN_INT (PROBE_INTERVAL
);
3918 output_asm_insn ("sub\t%0, %0, %1", xops
);
3920 /* Probe at TEST_ADDR. */
3921 output_asm_insn ("str\txzr, [%0]", xops
);
3923 /* Test if TEST_ADDR == LAST_ADDR. */
3925 output_asm_insn ("cmp\t%0, %1", xops
);
3928 fputs ("\tb.ne\t", asm_out_file
);
3929 assemble_name_raw (asm_out_file
, loop_lab
);
3930 fputc ('\n', asm_out_file
);
3935 /* Determine whether a frame chain needs to be generated. */
3937 aarch64_needs_frame_chain (void)
3939 /* Force a frame chain for EH returns so the return address is at FP+8. */
3940 if (frame_pointer_needed
|| crtl
->calls_eh_return
)
3943 /* A leaf function cannot have calls or write LR. */
3944 bool is_leaf
= crtl
->is_leaf
&& !df_regs_ever_live_p (LR_REGNUM
);
3946 /* Don't use a frame chain in leaf functions if leaf frame pointers
3948 if (flag_omit_leaf_frame_pointer
&& is_leaf
)
3951 return aarch64_use_frame_pointer
;
3954 /* Mark the registers that need to be saved by the callee and calculate
3955 the size of the callee-saved registers area and frame record (both FP
3956 and LR may be omitted). */
3958 aarch64_layout_frame (void)
3960 HOST_WIDE_INT offset
= 0;
3961 int regno
, last_fp_reg
= INVALID_REGNUM
;
3963 if (reload_completed
&& cfun
->machine
->frame
.laid_out
)
3966 cfun
->machine
->frame
.emit_frame_chain
= aarch64_needs_frame_chain ();
3968 #define SLOT_NOT_REQUIRED (-2)
3969 #define SLOT_REQUIRED (-1)
3971 cfun
->machine
->frame
.wb_candidate1
= INVALID_REGNUM
;
3972 cfun
->machine
->frame
.wb_candidate2
= INVALID_REGNUM
;
3974 /* First mark all the registers that really need to be saved... */
3975 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
3976 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
3978 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
3979 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
3981 /* ... that includes the eh data registers (if needed)... */
3982 if (crtl
->calls_eh_return
)
3983 for (regno
= 0; EH_RETURN_DATA_REGNO (regno
) != INVALID_REGNUM
; regno
++)
3984 cfun
->machine
->frame
.reg_offset
[EH_RETURN_DATA_REGNO (regno
)]
3987 /* ... and any callee saved register that dataflow says is live. */
3988 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
3989 if (df_regs_ever_live_p (regno
)
3990 && (regno
== R30_REGNUM
3991 || !call_used_regs
[regno
]))
3992 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
3994 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
3995 if (df_regs_ever_live_p (regno
)
3996 && !call_used_regs
[regno
])
3998 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
3999 last_fp_reg
= regno
;
4002 if (cfun
->machine
->frame
.emit_frame_chain
)
4004 /* FP and LR are placed in the linkage record. */
4005 cfun
->machine
->frame
.reg_offset
[R29_REGNUM
] = 0;
4006 cfun
->machine
->frame
.wb_candidate1
= R29_REGNUM
;
4007 cfun
->machine
->frame
.reg_offset
[R30_REGNUM
] = UNITS_PER_WORD
;
4008 cfun
->machine
->frame
.wb_candidate2
= R30_REGNUM
;
4009 offset
= 2 * UNITS_PER_WORD
;
4012 /* Now assign stack slots for them. */
4013 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
4014 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
4016 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
4017 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
4018 cfun
->machine
->frame
.wb_candidate1
= regno
;
4019 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
)
4020 cfun
->machine
->frame
.wb_candidate2
= regno
;
4021 offset
+= UNITS_PER_WORD
;
4024 HOST_WIDE_INT max_int_offset
= offset
;
4025 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
4026 bool has_align_gap
= offset
!= max_int_offset
;
4028 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
4029 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
4031 /* If there is an alignment gap between integer and fp callee-saves,
4032 allocate the last fp register to it if possible. */
4033 if (regno
== last_fp_reg
&& has_align_gap
&& (offset
& 8) == 0)
4035 cfun
->machine
->frame
.reg_offset
[regno
] = max_int_offset
;
4039 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
4040 if (cfun
->machine
->frame
.wb_candidate1
== INVALID_REGNUM
)
4041 cfun
->machine
->frame
.wb_candidate1
= regno
;
4042 else if (cfun
->machine
->frame
.wb_candidate2
== INVALID_REGNUM
4043 && cfun
->machine
->frame
.wb_candidate1
>= V0_REGNUM
)
4044 cfun
->machine
->frame
.wb_candidate2
= regno
;
4045 offset
+= UNITS_PER_WORD
;
4048 offset
= ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
4050 cfun
->machine
->frame
.saved_regs_size
= offset
;
4052 HOST_WIDE_INT varargs_and_saved_regs_size
4053 = offset
+ cfun
->machine
->frame
.saved_varargs_size
;
4055 cfun
->machine
->frame
.hard_fp_offset
4056 = aligned_upper_bound (varargs_and_saved_regs_size
4057 + get_frame_size (),
4058 STACK_BOUNDARY
/ BITS_PER_UNIT
);
4060 /* Both these values are already aligned. */
4061 gcc_assert (multiple_p (crtl
->outgoing_args_size
,
4062 STACK_BOUNDARY
/ BITS_PER_UNIT
));
4063 cfun
->machine
->frame
.frame_size
4064 = (cfun
->machine
->frame
.hard_fp_offset
4065 + crtl
->outgoing_args_size
);
4067 cfun
->machine
->frame
.locals_offset
= cfun
->machine
->frame
.saved_varargs_size
;
4069 cfun
->machine
->frame
.initial_adjust
= 0;
4070 cfun
->machine
->frame
.final_adjust
= 0;
4071 cfun
->machine
->frame
.callee_adjust
= 0;
4072 cfun
->machine
->frame
.callee_offset
= 0;
4074 HOST_WIDE_INT max_push_offset
= 0;
4075 if (cfun
->machine
->frame
.wb_candidate2
!= INVALID_REGNUM
)
4076 max_push_offset
= 512;
4077 else if (cfun
->machine
->frame
.wb_candidate1
!= INVALID_REGNUM
)
4078 max_push_offset
= 256;
4080 HOST_WIDE_INT const_size
, const_fp_offset
;
4081 if (cfun
->machine
->frame
.frame_size
.is_constant (&const_size
)
4082 && const_size
< max_push_offset
4083 && known_eq (crtl
->outgoing_args_size
, 0))
4085 /* Simple, small frame with no outgoing arguments:
4086 stp reg1, reg2, [sp, -frame_size]!
4087 stp reg3, reg4, [sp, 16] */
4088 cfun
->machine
->frame
.callee_adjust
= const_size
;
4090 else if (known_lt (crtl
->outgoing_args_size
4091 + cfun
->machine
->frame
.saved_regs_size
, 512)
4092 && !(cfun
->calls_alloca
4093 && known_lt (cfun
->machine
->frame
.hard_fp_offset
,
4096 /* Frame with small outgoing arguments:
4097 sub sp, sp, frame_size
4098 stp reg1, reg2, [sp, outgoing_args_size]
4099 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4100 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.frame_size
;
4101 cfun
->machine
->frame
.callee_offset
4102 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.hard_fp_offset
;
4104 else if (cfun
->machine
->frame
.hard_fp_offset
.is_constant (&const_fp_offset
)
4105 && const_fp_offset
< max_push_offset
)
4107 /* Frame with large outgoing arguments but a small local area:
4108 stp reg1, reg2, [sp, -hard_fp_offset]!
4109 stp reg3, reg4, [sp, 16]
4110 sub sp, sp, outgoing_args_size */
4111 cfun
->machine
->frame
.callee_adjust
= const_fp_offset
;
4112 cfun
->machine
->frame
.final_adjust
4113 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.callee_adjust
;
4117 /* Frame with large local area and outgoing arguments using frame pointer:
4118 sub sp, sp, hard_fp_offset
4119 stp x29, x30, [sp, 0]
4121 stp reg3, reg4, [sp, 16]
4122 sub sp, sp, outgoing_args_size */
4123 cfun
->machine
->frame
.initial_adjust
= cfun
->machine
->frame
.hard_fp_offset
;
4124 cfun
->machine
->frame
.final_adjust
4125 = cfun
->machine
->frame
.frame_size
- cfun
->machine
->frame
.initial_adjust
;
4128 cfun
->machine
->frame
.laid_out
= true;
4131 /* Return true if the register REGNO is saved on entry to
4132 the current function. */
4135 aarch64_register_saved_on_entry (int regno
)
4137 return cfun
->machine
->frame
.reg_offset
[regno
] >= 0;
4140 /* Return the next register up from REGNO up to LIMIT for the callee
4144 aarch64_next_callee_save (unsigned regno
, unsigned limit
)
4146 while (regno
<= limit
&& !aarch64_register_saved_on_entry (regno
))
4151 /* Push the register number REGNO of mode MODE to the stack with write-back
4152 adjusting the stack by ADJUSTMENT. */
4155 aarch64_pushwb_single_reg (machine_mode mode
, unsigned regno
,
4156 HOST_WIDE_INT adjustment
)
4158 rtx base_rtx
= stack_pointer_rtx
;
4161 reg
= gen_rtx_REG (mode
, regno
);
4162 mem
= gen_rtx_PRE_MODIFY (Pmode
, base_rtx
,
4163 plus_constant (Pmode
, base_rtx
, -adjustment
));
4164 mem
= gen_frame_mem (mode
, mem
);
4166 insn
= emit_move_insn (mem
, reg
);
4167 RTX_FRAME_RELATED_P (insn
) = 1;
4170 /* Generate and return an instruction to store the pair of registers
4171 REG and REG2 of mode MODE to location BASE with write-back adjusting
4172 the stack location BASE by ADJUSTMENT. */
4175 aarch64_gen_storewb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
4176 HOST_WIDE_INT adjustment
)
4181 return gen_storewb_pairdi_di (base
, base
, reg
, reg2
,
4182 GEN_INT (-adjustment
),
4183 GEN_INT (UNITS_PER_WORD
- adjustment
));
4185 return gen_storewb_pairdf_di (base
, base
, reg
, reg2
,
4186 GEN_INT (-adjustment
),
4187 GEN_INT (UNITS_PER_WORD
- adjustment
));
4193 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4194 stack pointer by ADJUSTMENT. */
4197 aarch64_push_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
)
4200 machine_mode mode
= (regno1
<= R30_REGNUM
) ? E_DImode
: E_DFmode
;
4202 if (regno2
== INVALID_REGNUM
)
4203 return aarch64_pushwb_single_reg (mode
, regno1
, adjustment
);
4205 rtx reg1
= gen_rtx_REG (mode
, regno1
);
4206 rtx reg2
= gen_rtx_REG (mode
, regno2
);
4208 insn
= emit_insn (aarch64_gen_storewb_pair (mode
, stack_pointer_rtx
, reg1
,
4210 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 2)) = 1;
4211 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
4212 RTX_FRAME_RELATED_P (insn
) = 1;
4215 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
4216 adjusting it by ADJUSTMENT afterwards. */
4219 aarch64_gen_loadwb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
4220 HOST_WIDE_INT adjustment
)
4225 return gen_loadwb_pairdi_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
4226 GEN_INT (UNITS_PER_WORD
));
4228 return gen_loadwb_pairdf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
4229 GEN_INT (UNITS_PER_WORD
));
4235 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4236 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4240 aarch64_pop_regs (unsigned regno1
, unsigned regno2
, HOST_WIDE_INT adjustment
,
4243 machine_mode mode
= (regno1
<= R30_REGNUM
) ? E_DImode
: E_DFmode
;
4244 rtx reg1
= gen_rtx_REG (mode
, regno1
);
4246 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg1
, *cfi_ops
);
4248 if (regno2
== INVALID_REGNUM
)
4250 rtx mem
= plus_constant (Pmode
, stack_pointer_rtx
, adjustment
);
4251 mem
= gen_rtx_POST_MODIFY (Pmode
, stack_pointer_rtx
, mem
);
4252 emit_move_insn (reg1
, gen_frame_mem (mode
, mem
));
4256 rtx reg2
= gen_rtx_REG (mode
, regno2
);
4257 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
4258 emit_insn (aarch64_gen_loadwb_pair (mode
, stack_pointer_rtx
, reg1
,
4263 /* Generate and return a store pair instruction of mode MODE to store
4264 register REG1 to MEM1 and register REG2 to MEM2. */
4267 aarch64_gen_store_pair (machine_mode mode
, rtx mem1
, rtx reg1
, rtx mem2
,
4273 return gen_store_pair_dw_didi (mem1
, reg1
, mem2
, reg2
);
4276 return gen_store_pair_dw_dfdf (mem1
, reg1
, mem2
, reg2
);
4283 /* Generate and regurn a load pair isntruction of mode MODE to load register
4284 REG1 from MEM1 and register REG2 from MEM2. */
4287 aarch64_gen_load_pair (machine_mode mode
, rtx reg1
, rtx mem1
, rtx reg2
,
4293 return gen_load_pair_dw_didi (reg1
, mem1
, reg2
, mem2
);
4296 return gen_load_pair_dw_dfdf (reg1
, mem1
, reg2
, mem2
);
4303 /* Return TRUE if return address signing should be enabled for the current
4304 function, otherwise return FALSE. */
4307 aarch64_return_address_signing_enabled (void)
4309 /* This function should only be called after frame laid out. */
4310 gcc_assert (cfun
->machine
->frame
.laid_out
);
4312 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4313 if it's LR is pushed onto stack. */
4314 return (aarch64_ra_sign_scope
== AARCH64_FUNCTION_ALL
4315 || (aarch64_ra_sign_scope
== AARCH64_FUNCTION_NON_LEAF
4316 && cfun
->machine
->frame
.reg_offset
[LR_REGNUM
] >= 0));
4319 /* Emit code to save the callee-saved registers from register number START
4320 to LIMIT to the stack at the location starting at offset START_OFFSET,
4321 skipping any write-back candidates if SKIP_WB is true. */
4324 aarch64_save_callee_saves (machine_mode mode
, poly_int64 start_offset
,
4325 unsigned start
, unsigned limit
, bool skip_wb
)
4331 for (regno
= aarch64_next_callee_save (start
, limit
);
4333 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
4339 && (regno
== cfun
->machine
->frame
.wb_candidate1
4340 || regno
== cfun
->machine
->frame
.wb_candidate2
))
4343 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
4346 reg
= gen_rtx_REG (mode
, regno
);
4347 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
4348 mem
= gen_frame_mem (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
4351 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
4354 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
4355 && ((cfun
->machine
->frame
.reg_offset
[regno
] + UNITS_PER_WORD
)
4356 == cfun
->machine
->frame
.reg_offset
[regno2
]))
4359 rtx reg2
= gen_rtx_REG (mode
, regno2
);
4362 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
4363 mem2
= gen_frame_mem (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
4365 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
,
4368 /* The first part of a frame-related parallel insn is
4369 always assumed to be relevant to the frame
4370 calculations; subsequent parts, are only
4371 frame-related if explicitly marked. */
4372 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
4376 insn
= emit_move_insn (mem
, reg
);
4378 RTX_FRAME_RELATED_P (insn
) = 1;
4382 /* Emit code to restore the callee registers of mode MODE from register
4383 number START up to and including LIMIT. Restore from the stack offset
4384 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4385 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4388 aarch64_restore_callee_saves (machine_mode mode
,
4389 poly_int64 start_offset
, unsigned start
,
4390 unsigned limit
, bool skip_wb
, rtx
*cfi_ops
)
4392 rtx base_rtx
= stack_pointer_rtx
;
4397 for (regno
= aarch64_next_callee_save (start
, limit
);
4399 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
4401 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
4407 && (regno
== cfun
->machine
->frame
.wb_candidate1
4408 || regno
== cfun
->machine
->frame
.wb_candidate2
))
4411 reg
= gen_rtx_REG (mode
, regno
);
4412 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
4413 mem
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
4415 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
4418 && !cfun
->machine
->reg_is_wrapped_separately
[regno2
]
4419 && ((cfun
->machine
->frame
.reg_offset
[regno
] + UNITS_PER_WORD
)
4420 == cfun
->machine
->frame
.reg_offset
[regno2
]))
4422 rtx reg2
= gen_rtx_REG (mode
, regno2
);
4425 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
4426 mem2
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
4427 emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
4429 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
4433 emit_move_insn (reg
, mem
);
4434 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg
, *cfi_ops
);
4438 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4442 offset_4bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
4444 HOST_WIDE_INT multiple
;
4445 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
4446 && IN_RANGE (multiple
, -8, 7));
4449 /* Return true if OFFSET is a unsigned 6-bit value multiplied by the size
4453 offset_6bit_unsigned_scaled_p (machine_mode mode
, poly_int64 offset
)
4455 HOST_WIDE_INT multiple
;
4456 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
4457 && IN_RANGE (multiple
, 0, 63));
4460 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4464 aarch64_offset_7bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
4466 HOST_WIDE_INT multiple
;
4467 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
4468 && IN_RANGE (multiple
, -64, 63));
4471 /* Return true if OFFSET is a signed 9-bit value. */
4474 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED
,
4477 HOST_WIDE_INT const_offset
;
4478 return (offset
.is_constant (&const_offset
)
4479 && IN_RANGE (const_offset
, -256, 255));
4482 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4486 offset_9bit_signed_scaled_p (machine_mode mode
, poly_int64 offset
)
4488 HOST_WIDE_INT multiple
;
4489 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
4490 && IN_RANGE (multiple
, -256, 255));
4493 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4497 offset_12bit_unsigned_scaled_p (machine_mode mode
, poly_int64 offset
)
4499 HOST_WIDE_INT multiple
;
4500 return (constant_multiple_p (offset
, GET_MODE_SIZE (mode
), &multiple
)
4501 && IN_RANGE (multiple
, 0, 4095));
4504 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4507 aarch64_get_separate_components (void)
4509 aarch64_layout_frame ();
4511 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
4512 bitmap_clear (components
);
4514 /* The registers we need saved to the frame. */
4515 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
4516 if (aarch64_register_saved_on_entry (regno
))
4518 poly_int64 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
4519 if (!frame_pointer_needed
)
4520 offset
+= cfun
->machine
->frame
.frame_size
4521 - cfun
->machine
->frame
.hard_fp_offset
;
4522 /* Check that we can access the stack slot of the register with one
4523 direct load with no adjustments needed. */
4524 if (offset_12bit_unsigned_scaled_p (DImode
, offset
))
4525 bitmap_set_bit (components
, regno
);
4528 /* Don't mess with the hard frame pointer. */
4529 if (frame_pointer_needed
)
4530 bitmap_clear_bit (components
, HARD_FRAME_POINTER_REGNUM
);
4532 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
4533 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
4534 /* If aarch64_layout_frame has chosen registers to store/restore with
4535 writeback don't interfere with them to avoid having to output explicit
4536 stack adjustment instructions. */
4537 if (reg2
!= INVALID_REGNUM
)
4538 bitmap_clear_bit (components
, reg2
);
4539 if (reg1
!= INVALID_REGNUM
)
4540 bitmap_clear_bit (components
, reg1
);
4542 bitmap_clear_bit (components
, LR_REGNUM
);
4543 bitmap_clear_bit (components
, SP_REGNUM
);
4548 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4551 aarch64_components_for_bb (basic_block bb
)
4553 bitmap in
= DF_LIVE_IN (bb
);
4554 bitmap gen
= &DF_LIVE_BB_INFO (bb
)->gen
;
4555 bitmap kill
= &DF_LIVE_BB_INFO (bb
)->kill
;
4557 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
4558 bitmap_clear (components
);
4560 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
4561 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
4562 if ((!call_used_regs
[regno
])
4563 && (bitmap_bit_p (in
, regno
)
4564 || bitmap_bit_p (gen
, regno
)
4565 || bitmap_bit_p (kill
, regno
)))
4567 unsigned regno2
, offset
, offset2
;
4568 bitmap_set_bit (components
, regno
);
4570 /* If there is a callee-save at an adjacent offset, add it too
4571 to increase the use of LDP/STP. */
4572 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
4573 regno2
= ((offset
& 8) == 0) ? regno
+ 1 : regno
- 1;
4575 if (regno2
<= LAST_SAVED_REGNUM
)
4577 offset2
= cfun
->machine
->frame
.reg_offset
[regno2
];
4578 if ((offset
& ~8) == (offset2
& ~8))
4579 bitmap_set_bit (components
, regno2
);
4586 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4587 Nothing to do for aarch64. */
4590 aarch64_disqualify_components (sbitmap
, edge
, sbitmap
, bool)
4594 /* Return the next set bit in BMP from START onwards. Return the total number
4595 of bits in BMP if no set bit is found at or after START. */
4598 aarch64_get_next_set_bit (sbitmap bmp
, unsigned int start
)
4600 unsigned int nbits
= SBITMAP_SIZE (bmp
);
4604 gcc_assert (start
< nbits
);
4605 for (unsigned int i
= start
; i
< nbits
; i
++)
4606 if (bitmap_bit_p (bmp
, i
))
4612 /* Do the work for aarch64_emit_prologue_components and
4613 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
4614 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4615 for these components or the epilogue sequence. That is, it determines
4616 whether we should emit stores or loads and what kind of CFA notes to attach
4617 to the insns. Otherwise the logic for the two sequences is very
4621 aarch64_process_components (sbitmap components
, bool prologue_p
)
4623 rtx ptr_reg
= gen_rtx_REG (Pmode
, frame_pointer_needed
4624 ? HARD_FRAME_POINTER_REGNUM
4625 : STACK_POINTER_REGNUM
);
4627 unsigned last_regno
= SBITMAP_SIZE (components
);
4628 unsigned regno
= aarch64_get_next_set_bit (components
, R0_REGNUM
);
4629 rtx_insn
*insn
= NULL
;
4631 while (regno
!= last_regno
)
4633 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
4634 so DFmode for the vector registers is enough. */
4635 machine_mode mode
= GP_REGNUM_P (regno
) ? E_DImode
: E_DFmode
;
4636 rtx reg
= gen_rtx_REG (mode
, regno
);
4637 poly_int64 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
4638 if (!frame_pointer_needed
)
4639 offset
+= cfun
->machine
->frame
.frame_size
4640 - cfun
->machine
->frame
.hard_fp_offset
;
4641 rtx addr
= plus_constant (Pmode
, ptr_reg
, offset
);
4642 rtx mem
= gen_frame_mem (mode
, addr
);
4644 rtx set
= prologue_p
? gen_rtx_SET (mem
, reg
) : gen_rtx_SET (reg
, mem
);
4645 unsigned regno2
= aarch64_get_next_set_bit (components
, regno
+ 1);
4646 /* No more registers to handle after REGNO.
4647 Emit a single save/restore and exit. */
4648 if (regno2
== last_regno
)
4650 insn
= emit_insn (set
);
4651 RTX_FRAME_RELATED_P (insn
) = 1;
4653 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
4655 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
4659 poly_int64 offset2
= cfun
->machine
->frame
.reg_offset
[regno2
];
4660 /* The next register is not of the same class or its offset is not
4661 mergeable with the current one into a pair. */
4662 if (!satisfies_constraint_Ump (mem
)
4663 || GP_REGNUM_P (regno
) != GP_REGNUM_P (regno2
)
4664 || maybe_ne ((offset2
- cfun
->machine
->frame
.reg_offset
[regno
]),
4665 GET_MODE_SIZE (mode
)))
4667 insn
= emit_insn (set
);
4668 RTX_FRAME_RELATED_P (insn
) = 1;
4670 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
4672 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
4678 /* REGNO2 can be saved/restored in a pair with REGNO. */
4679 rtx reg2
= gen_rtx_REG (mode
, regno2
);
4680 if (!frame_pointer_needed
)
4681 offset2
+= cfun
->machine
->frame
.frame_size
4682 - cfun
->machine
->frame
.hard_fp_offset
;
4683 rtx addr2
= plus_constant (Pmode
, ptr_reg
, offset2
);
4684 rtx mem2
= gen_frame_mem (mode
, addr2
);
4685 rtx set2
= prologue_p
? gen_rtx_SET (mem2
, reg2
)
4686 : gen_rtx_SET (reg2
, mem2
);
4689 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
, reg2
));
4691 insn
= emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
4693 RTX_FRAME_RELATED_P (insn
) = 1;
4696 add_reg_note (insn
, REG_CFA_OFFSET
, set
);
4697 add_reg_note (insn
, REG_CFA_OFFSET
, set2
);
4701 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
4702 add_reg_note (insn
, REG_CFA_RESTORE
, reg2
);
4705 regno
= aarch64_get_next_set_bit (components
, regno2
+ 1);
4709 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
4712 aarch64_emit_prologue_components (sbitmap components
)
4714 aarch64_process_components (components
, true);
4717 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
4720 aarch64_emit_epilogue_components (sbitmap components
)
4722 aarch64_process_components (components
, false);
4725 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
4728 aarch64_set_handled_components (sbitmap components
)
4730 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
4731 if (bitmap_bit_p (components
, regno
))
4732 cfun
->machine
->reg_is_wrapped_separately
[regno
] = true;
4735 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4736 is saved at BASE + OFFSET. */
4739 aarch64_add_cfa_expression (rtx_insn
*insn
, unsigned int reg
,
4740 rtx base
, poly_int64 offset
)
4742 rtx mem
= gen_frame_mem (DImode
, plus_constant (Pmode
, base
, offset
));
4743 add_reg_note (insn
, REG_CFA_EXPRESSION
,
4744 gen_rtx_SET (mem
, regno_reg_rtx
[reg
]));
/* AArch64 stack frames generated by this compiler look like:

	+-------------------------------+
	|                               |
	|  incoming stack arguments     |
	|                               |
	+-------------------------------+
	|                               | <-- incoming stack pointer (aligned)
	|  callee-allocated save area   |
	|  for register varargs         |
	|                               |
	+-------------------------------+
	|  local variables              | <-- frame_pointer_rtx
	|                               |
	+-------------------------------+
	|  padding0                     | \
	+-------------------------------+  |
	|  callee-saved registers       |  | frame.saved_regs_size
	+-------------------------------+  |
	|  LR'                          |  |
	+-------------------------------+  |
	|  FP'                          | / <- hard_frame_pointer_rtx (aligned)
	+-------------------------------+
	|  dynamic allocation           |
	+-------------------------------+
	|  padding                      |
	+-------------------------------+
	|  outgoing stack arguments     | <-- arg_pointer
	|                               |
	+-------------------------------+
	|                               | <-- stack_pointer_rtx (aligned)

   Dynamic stack allocations via alloca() decrease stack_pointer_rtx
   but leave frame_pointer_rtx and hard_frame_pointer_rtx
   unchanged.  */
4783 /* Generate the prologue instructions for entry into a function.
4784 Establish the stack frame by decreasing the stack pointer with a
4785 properly calculated size and, if necessary, create a frame record
4786 filled with the values of LR and previous frame pointer. The
4787 current FP is also set up if it is in use. */
4790 aarch64_expand_prologue (void)
4792 aarch64_layout_frame ();
4794 poly_int64 frame_size
= cfun
->machine
->frame
.frame_size
;
4795 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
4796 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
4797 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
4798 poly_int64 callee_offset
= cfun
->machine
->frame
.callee_offset
;
4799 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
4800 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
4801 bool emit_frame_chain
= cfun
->machine
->frame
.emit_frame_chain
;
4804 /* Sign return address for functions. */
4805 if (aarch64_return_address_signing_enabled ())
4807 insn
= emit_insn (gen_pacisp ());
4808 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
4809 RTX_FRAME_RELATED_P (insn
) = 1;
4812 if (flag_stack_usage_info
)
4813 current_function_static_stack_size
= constant_lower_bound (frame_size
);
4815 if (flag_stack_check
== STATIC_BUILTIN_STACK_CHECK
)
4817 if (crtl
->is_leaf
&& !cfun
->calls_alloca
)
4819 if (maybe_gt (frame_size
, PROBE_INTERVAL
)
4820 && maybe_gt (frame_size
, get_stack_check_protect ()))
4821 aarch64_emit_probe_stack_range (get_stack_check_protect (),
4823 - get_stack_check_protect ()));
4825 else if (maybe_gt (frame_size
, 0))
4826 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size
);
4829 rtx ip0_rtx
= gen_rtx_REG (Pmode
, IP0_REGNUM
);
4830 rtx ip1_rtx
= gen_rtx_REG (Pmode
, IP1_REGNUM
);
4832 aarch64_sub_sp (ip0_rtx
, ip1_rtx
, initial_adjust
, true);
4834 if (callee_adjust
!= 0)
4835 aarch64_push_regs (reg1
, reg2
, callee_adjust
);
4837 if (emit_frame_chain
)
4839 poly_int64 reg_offset
= callee_adjust
;
4840 if (callee_adjust
== 0)
4844 reg_offset
= callee_offset
;
4845 aarch64_save_callee_saves (DImode
, reg_offset
, reg1
, reg2
, false);
4847 aarch64_add_offset (Pmode
, hard_frame_pointer_rtx
,
4848 stack_pointer_rtx
, callee_offset
,
4849 ip1_rtx
, ip0_rtx
, frame_pointer_needed
);
4850 if (frame_pointer_needed
&& !frame_size
.is_constant ())
4852 /* Variable-sized frames need to describe the save slot
4853 address using DW_CFA_expression rather than DW_CFA_offset.
4854 This means that, without taking further action, the
4855 locations of the registers that we've already saved would
4856 remain based on the stack pointer even after we redefine
4857 the CFA based on the frame pointer. We therefore need new
4858 DW_CFA_expressions to re-express the save slots with addresses
4859 based on the frame pointer. */
4860 rtx_insn
*insn
= get_last_insn ();
4861 gcc_assert (RTX_FRAME_RELATED_P (insn
));
4863 /* Add an explicit CFA definition if this was previously
4865 if (!find_reg_note (insn
, REG_CFA_ADJUST_CFA
, NULL_RTX
))
4867 rtx src
= plus_constant (Pmode
, stack_pointer_rtx
,
4869 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
4870 gen_rtx_SET (hard_frame_pointer_rtx
, src
));
4873 /* Change the save slot expressions for the registers that
4874 we've already saved. */
4875 reg_offset
-= callee_offset
;
4876 aarch64_add_cfa_expression (insn
, reg2
, hard_frame_pointer_rtx
,
4877 reg_offset
+ UNITS_PER_WORD
);
4878 aarch64_add_cfa_expression (insn
, reg1
, hard_frame_pointer_rtx
,
4881 emit_insn (gen_stack_tie (stack_pointer_rtx
, hard_frame_pointer_rtx
));
4884 aarch64_save_callee_saves (DImode
, callee_offset
, R0_REGNUM
, R30_REGNUM
,
4885 callee_adjust
!= 0 || emit_frame_chain
);
4886 aarch64_save_callee_saves (DFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
4887 callee_adjust
!= 0 || emit_frame_chain
);
4888 aarch64_sub_sp (ip1_rtx
, ip0_rtx
, final_adjust
, !frame_pointer_needed
);
4891 /* Return TRUE if we can use a simple_return insn.
4893 This function checks whether the callee saved stack is empty, which
4894 means no restore actions are need. The pro_and_epilogue will use
4895 this to check whether shrink-wrapping opt is feasible. */
4898 aarch64_use_return_insn_p (void)
4900 if (!reload_completed
)
4906 aarch64_layout_frame ();
4908 return known_eq (cfun
->machine
->frame
.frame_size
, 0);
4911 /* Generate the epilogue instructions for returning from a function.
4912 This is almost exactly the reverse of the prolog sequence, except
4913 that we need to insert barriers to avoid scheduling loads that read
4914 from a deallocated stack, and we optimize the unwind records by
4915 emitting them all together if possible. */
4917 aarch64_expand_epilogue (bool for_sibcall
)
4919 aarch64_layout_frame ();
4921 poly_int64 initial_adjust
= cfun
->machine
->frame
.initial_adjust
;
4922 HOST_WIDE_INT callee_adjust
= cfun
->machine
->frame
.callee_adjust
;
4923 poly_int64 final_adjust
= cfun
->machine
->frame
.final_adjust
;
4924 poly_int64 callee_offset
= cfun
->machine
->frame
.callee_offset
;
4925 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
4926 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
4929 /* A stack clash protection prologue may not have left IP0_REGNUM or
4930 IP1_REGNUM in a usable state. The same is true for allocations
4931 with an SVE component, since we then need both temporary registers
4932 for each allocation. */
4933 bool can_inherit_p
= (initial_adjust
.is_constant ()
4934 && final_adjust
.is_constant ()
4935 && !flag_stack_clash_protection
);
4937 /* We need to add memory barrier to prevent read from deallocated stack. */
4939 = maybe_ne (get_frame_size ()
4940 + cfun
->machine
->frame
.saved_varargs_size
, 0);
4942 /* Emit a barrier to prevent loads from a deallocated stack. */
4943 if (maybe_gt (final_adjust
, crtl
->outgoing_args_size
)
4944 || cfun
->calls_alloca
4945 || crtl
->calls_eh_return
)
4947 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
4948 need_barrier_p
= false;
4951 /* Restore the stack pointer from the frame pointer if it may not
4952 be the same as the stack pointer. */
4953 rtx ip0_rtx
= gen_rtx_REG (Pmode
, IP0_REGNUM
);
4954 rtx ip1_rtx
= gen_rtx_REG (Pmode
, IP1_REGNUM
);
4955 if (frame_pointer_needed
4956 && (maybe_ne (final_adjust
, 0) || cfun
->calls_alloca
))
4957 /* If writeback is used when restoring callee-saves, the CFA
4958 is restored on the instruction doing the writeback. */
4959 aarch64_add_offset (Pmode
, stack_pointer_rtx
,
4960 hard_frame_pointer_rtx
, -callee_offset
,
4961 ip1_rtx
, ip0_rtx
, callee_adjust
== 0);
4963 aarch64_add_sp (ip1_rtx
, ip0_rtx
, final_adjust
,
4964 !can_inherit_p
|| df_regs_ever_live_p (IP1_REGNUM
));
4966 aarch64_restore_callee_saves (DImode
, callee_offset
, R0_REGNUM
, R30_REGNUM
,
4967 callee_adjust
!= 0, &cfi_ops
);
4968 aarch64_restore_callee_saves (DFmode
, callee_offset
, V0_REGNUM
, V31_REGNUM
,
4969 callee_adjust
!= 0, &cfi_ops
);
4972 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
4974 if (callee_adjust
!= 0)
4975 aarch64_pop_regs (reg1
, reg2
, callee_adjust
, &cfi_ops
);
4977 if (callee_adjust
!= 0 || maybe_gt (initial_adjust
, 65536))
4979 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
4980 insn
= get_last_insn ();
4981 rtx new_cfa
= plus_constant (Pmode
, stack_pointer_rtx
, initial_adjust
);
4982 REG_NOTES (insn
) = alloc_reg_note (REG_CFA_DEF_CFA
, new_cfa
, cfi_ops
);
4983 RTX_FRAME_RELATED_P (insn
) = 1;
4987 aarch64_add_sp (ip0_rtx
, ip1_rtx
, initial_adjust
,
4988 !can_inherit_p
|| df_regs_ever_live_p (IP0_REGNUM
));
4992 /* Emit delayed restores and reset the CFA to be SP. */
4993 insn
= get_last_insn ();
4994 cfi_ops
= alloc_reg_note (REG_CFA_DEF_CFA
, stack_pointer_rtx
, cfi_ops
);
4995 REG_NOTES (insn
) = cfi_ops
;
4996 RTX_FRAME_RELATED_P (insn
) = 1;
4999 /* We prefer to emit the combined return/authenticate instruction RETAA,
5000 however there are three cases in which we must instead emit an explicit
5001 authentication instruction.
5003 1) Sibcalls don't return in a normal way, so if we're about to call one
5004 we must authenticate.
5006 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5007 generating code for !TARGET_ARMV8_3 we can't use it and must
5008 explicitly authenticate.
5010 3) On an eh_return path we make extra stack adjustments to update the
5011 canonical frame address to be the exception handler's CFA. We want
5012 to authenticate using the CFA of the function which calls eh_return.
5014 if (aarch64_return_address_signing_enabled ()
5015 && (for_sibcall
|| !TARGET_ARMV8_3
|| crtl
->calls_eh_return
))
5017 insn
= emit_insn (gen_autisp ());
5018 add_reg_note (insn
, REG_CFA_TOGGLE_RA_MANGLE
, const0_rtx
);
5019 RTX_FRAME_RELATED_P (insn
) = 1;
5022 /* Stack adjustment for exception handler. */
5023 if (crtl
->calls_eh_return
)
5025 /* We need to unwind the stack by the offset computed by
5026 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5027 to be SP; letting the CFA move during this adjustment
5028 is just as correct as retaining the CFA from the body
5029 of the function. Therefore, do nothing special. */
5030 emit_insn (gen_add2_insn (stack_pointer_rtx
, EH_RETURN_STACKADJ_RTX
));
5033 emit_use (gen_rtx_REG (DImode
, LR_REGNUM
));
5035 emit_jump_insn (ret_rtx
);
5038 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5039 normally or return to a previous frame after unwinding.
5041 An EH return uses a single shared return sequence. The epilogue is
5042 exactly like a normal epilogue except that it has an extra input
5043 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5044 that must be applied after the frame has been destroyed. An extra label
5045 is inserted before the epilogue which initializes this register to zero,
5046 and this is the entry point for a normal return.
5048 An actual EH return updates the return address, initializes the stack
5049 adjustment and jumps directly into the epilogue (bypassing the zeroing
5050 of the adjustment). Since the return address is typically saved on the
5051 stack when a function makes a call, the saved LR must be updated outside
5054 This poses problems as the store is generated well before the epilogue,
5055 so the offset of LR is not known yet. Also optimizations will remove the
5056 store as it appears dead, even after the epilogue is generated (as the
5057 base or offset for loading LR is different in many cases).
5059 To avoid these problems this implementation forces the frame pointer
5060 in eh_return functions so that the location of LR is fixed and known early.
5061 It also marks the store volatile, so no optimization is permitted to
5062 remove the store. */
5064 aarch64_eh_return_handler_rtx (void)
5066 rtx tmp
= gen_frame_mem (Pmode
,
5067 plus_constant (Pmode
, hard_frame_pointer_rtx
, UNITS_PER_WORD
));
5069 /* Mark the store volatile, so no optimization is permitted to remove it. */
5070 MEM_VOLATILE_P (tmp
) = true;
5074 /* Output code to add DELTA to the first argument, and then jump
5075 to FUNCTION. Used for C++ multiple inheritance. */
5077 aarch64_output_mi_thunk (FILE *file
, tree thunk ATTRIBUTE_UNUSED
,
5078 HOST_WIDE_INT delta
,
5079 HOST_WIDE_INT vcall_offset
,
5082 /* The this pointer is always in x0. Note that this differs from
5083 Arm where the this pointer maybe bumped to r1 if r0 is required
5084 to return a pointer to an aggregate. On AArch64 a result value
5085 pointer will be in x8. */
5086 int this_regno
= R0_REGNUM
;
5087 rtx this_rtx
, temp0
, temp1
, addr
, funexp
;
5090 reload_completed
= 1;
5091 emit_note (NOTE_INSN_PROLOGUE_END
);
5093 this_rtx
= gen_rtx_REG (Pmode
, this_regno
);
5094 temp0
= gen_rtx_REG (Pmode
, IP0_REGNUM
);
5095 temp1
= gen_rtx_REG (Pmode
, IP1_REGNUM
);
5097 if (vcall_offset
== 0)
5098 aarch64_add_offset (Pmode
, this_rtx
, this_rtx
, delta
, temp1
, temp0
, false);
5101 gcc_assert ((vcall_offset
& (POINTER_BYTES
- 1)) == 0);
5106 if (delta
>= -256 && delta
< 256)
5107 addr
= gen_rtx_PRE_MODIFY (Pmode
, this_rtx
,
5108 plus_constant (Pmode
, this_rtx
, delta
));
5110 aarch64_add_offset (Pmode
, this_rtx
, this_rtx
, delta
,
5111 temp1
, temp0
, false);
5114 if (Pmode
== ptr_mode
)
5115 aarch64_emit_move (temp0
, gen_rtx_MEM (ptr_mode
, addr
));
5117 aarch64_emit_move (temp0
,
5118 gen_rtx_ZERO_EXTEND (Pmode
,
5119 gen_rtx_MEM (ptr_mode
, addr
)));
5121 if (vcall_offset
>= -256 && vcall_offset
< 4096 * POINTER_BYTES
)
5122 addr
= plus_constant (Pmode
, temp0
, vcall_offset
);
5125 aarch64_internal_mov_immediate (temp1
, GEN_INT (vcall_offset
), true,
5127 addr
= gen_rtx_PLUS (Pmode
, temp0
, temp1
);
5130 if (Pmode
== ptr_mode
)
5131 aarch64_emit_move (temp1
, gen_rtx_MEM (ptr_mode
,addr
));
5133 aarch64_emit_move (temp1
,
5134 gen_rtx_SIGN_EXTEND (Pmode
,
5135 gen_rtx_MEM (ptr_mode
, addr
)));
5137 emit_insn (gen_add2_insn (this_rtx
, temp1
));
5140 /* Generate a tail call to the target function. */
5141 if (!TREE_USED (function
))
5143 assemble_external (function
);
5144 TREE_USED (function
) = 1;
5146 funexp
= XEXP (DECL_RTL (function
), 0);
5147 funexp
= gen_rtx_MEM (FUNCTION_MODE
, funexp
);
5148 insn
= emit_call_insn (gen_sibcall (funexp
, const0_rtx
, NULL_RTX
));
5149 SIBLING_CALL_P (insn
) = 1;
5151 insn
= get_insns ();
5152 shorten_branches (insn
);
5153 final_start_function (insn
, file
, 1);
5154 final (insn
, file
, 1);
5155 final_end_function ();
5157 /* Stop pretending to be a post-reload pass. */
5158 reload_completed
= 0;
5162 aarch64_tls_referenced_p (rtx x
)
5164 if (!TARGET_HAVE_TLS
)
5166 subrtx_iterator::array_type array
;
5167 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
5169 const_rtx x
= *iter
;
5170 if (GET_CODE (x
) == SYMBOL_REF
&& SYMBOL_REF_TLS_MODEL (x
) != 0)
5172 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5173 TLS offsets, not real symbol references. */
5174 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
5175 iter
.skip_subrtxes ();
5181 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5182 a left shift of 0 or 12 bits. */
5184 aarch64_uimm12_shift (HOST_WIDE_INT val
)
5186 return ((val
& (((HOST_WIDE_INT
) 0xfff) << 0)) == val
5187 || (val
& (((HOST_WIDE_INT
) 0xfff) << 12)) == val
5192 /* Return true if val is an immediate that can be loaded into a
5193 register by a MOVZ instruction. */
5195 aarch64_movw_imm (HOST_WIDE_INT val
, scalar_int_mode mode
)
5197 if (GET_MODE_SIZE (mode
) > 4)
5199 if ((val
& (((HOST_WIDE_INT
) 0xffff) << 32)) == val
5200 || (val
& (((HOST_WIDE_INT
) 0xffff) << 48)) == val
)
5205 /* Ignore sign extension. */
5206 val
&= (HOST_WIDE_INT
) 0xffffffff;
5208 return ((val
& (((HOST_WIDE_INT
) 0xffff) << 0)) == val
5209 || (val
& (((HOST_WIDE_INT
) 0xffff) << 16)) == val
);
5212 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
5213 64-bit (DImode) integer. */
5215 static unsigned HOST_WIDE_INT
5216 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val
, machine_mode mode
)
5218 unsigned int size
= GET_MODE_UNIT_PRECISION (mode
);
5221 val
&= (HOST_WIDE_INT_1U
<< size
) - 1;
5228 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5230 static const unsigned HOST_WIDE_INT bitmask_imm_mul
[] =
5232 0x0000000100000001ull
,
5233 0x0001000100010001ull
,
5234 0x0101010101010101ull
,
5235 0x1111111111111111ull
,
5236 0x5555555555555555ull
,
5240 /* Return true if val is a valid bitmask immediate. */
5243 aarch64_bitmask_imm (HOST_WIDE_INT val_in
, machine_mode mode
)
5245 unsigned HOST_WIDE_INT val
, tmp
, mask
, first_one
, next_one
;
5248 /* Check for a single sequence of one bits and return quickly if so.
5249 The special cases of all ones and all zeroes returns false. */
5250 val
= aarch64_replicate_bitmask_imm (val_in
, mode
);
5251 tmp
= val
+ (val
& -val
);
5253 if (tmp
== (tmp
& -tmp
))
5254 return (val
+ 1) > 1;
5256 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
5258 val
= (val
<< 32) | (val
& 0xffffffff);
5260 /* Invert if the immediate doesn't start with a zero bit - this means we
5261 only need to search for sequences of one bits. */
5265 /* Find the first set bit and set tmp to val with the first sequence of one
5266 bits removed. Return success if there is a single sequence of ones. */
5267 first_one
= val
& -val
;
5268 tmp
= val
& (val
+ first_one
);
5273 /* Find the next set bit and compute the difference in bit position. */
5274 next_one
= tmp
& -tmp
;
5275 bits
= clz_hwi (first_one
) - clz_hwi (next_one
);
5278 /* Check the bit position difference is a power of 2, and that the first
5279 sequence of one bits fits within 'bits' bits. */
5280 if ((mask
>> bits
) != 0 || bits
!= (bits
& -bits
))
5283 /* Check the sequence of one bits is repeated 64/bits times. */
5284 return val
== mask
* bitmask_imm_mul
[__builtin_clz (bits
) - 26];
5287 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
5288 Assumed precondition: VAL_IN Is not zero. */
5290 unsigned HOST_WIDE_INT
5291 aarch64_and_split_imm1 (HOST_WIDE_INT val_in
)
5293 int lowest_bit_set
= ctz_hwi (val_in
);
5294 int highest_bit_set
= floor_log2 (val_in
);
5295 gcc_assert (val_in
!= 0);
5297 return ((HOST_WIDE_INT_UC (2) << highest_bit_set
) -
5298 (HOST_WIDE_INT_1U
<< lowest_bit_set
));
5301 /* Create constant where bits outside of lowest bit set to highest bit set
5304 unsigned HOST_WIDE_INT
5305 aarch64_and_split_imm2 (HOST_WIDE_INT val_in
)
5307 return val_in
| ~aarch64_and_split_imm1 (val_in
);
5310 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
5313 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in
, machine_mode mode
)
5315 scalar_int_mode int_mode
;
5316 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
5319 if (aarch64_bitmask_imm (val_in
, int_mode
))
5322 if (aarch64_move_imm (val_in
, int_mode
))
5325 unsigned HOST_WIDE_INT imm2
= aarch64_and_split_imm2 (val_in
);
5327 return aarch64_bitmask_imm (imm2
, int_mode
);
5330 /* Return true if val is an immediate that can be loaded into a
5331 register in a single instruction. */
5333 aarch64_move_imm (HOST_WIDE_INT val
, machine_mode mode
)
5335 scalar_int_mode int_mode
;
5336 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
5339 if (aarch64_movw_imm (val
, int_mode
) || aarch64_movw_imm (~val
, int_mode
))
5341 return aarch64_bitmask_imm (val
, int_mode
);
5345 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED
, rtx x
)
5349 if (GET_CODE (x
) == HIGH
)
5352 /* There's no way to calculate VL-based values using relocations. */
5353 subrtx_iterator::array_type array
;
5354 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
5355 if (GET_CODE (*iter
) == CONST_POLY_INT
)
5358 split_const (x
, &base
, &offset
);
5359 if (GET_CODE (base
) == SYMBOL_REF
|| GET_CODE (base
) == LABEL_REF
)
5361 if (aarch64_classify_symbol (base
, INTVAL (offset
))
5362 != SYMBOL_FORCE_TO_MEM
)
5365 /* Avoid generating a 64-bit relocation in ILP32; leave
5366 to aarch64_expand_mov_immediate to handle it properly. */
5367 return mode
!= ptr_mode
;
5370 return aarch64_tls_referenced_p (x
);
5373 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5374 The expansion for a table switch is quite expensive due to the number
5375 of instructions, the table lookup and hard to predict indirect jump.
5376 When optimizing for speed, and -O3 enabled, use the per-core tuning if
5377 set, otherwise use tables for > 16 cases as a tradeoff between size and
5378 performance. When optimizing for size, use the default setting. */
5381 aarch64_case_values_threshold (void)
5383 /* Use the specified limit for the number of cases before using jump
5384 tables at higher optimization levels. */
5386 && selected_cpu
->tune
->max_case_values
!= 0)
5387 return selected_cpu
->tune
->max_case_values
;
5389 return optimize_size
? default_case_values_threshold () : 17;
5392 /* Return true if register REGNO is a valid index register.
5393 STRICT_P is true if REG_OK_STRICT is in effect. */
5396 aarch64_regno_ok_for_index_p (int regno
, bool strict_p
)
5398 if (!HARD_REGISTER_NUM_P (regno
))
5406 regno
= reg_renumber
[regno
];
5408 return GP_REGNUM_P (regno
);
5411 /* Return true if register REGNO is a valid base register for mode MODE.
5412 STRICT_P is true if REG_OK_STRICT is in effect. */
5415 aarch64_regno_ok_for_base_p (int regno
, bool strict_p
)
5417 if (!HARD_REGISTER_NUM_P (regno
))
5425 regno
= reg_renumber
[regno
];
5428 /* The fake registers will be eliminated to either the stack or
5429 hard frame pointer, both of which are usually valid base registers.
5430 Reload deals with the cases where the eliminated form isn't valid. */
5431 return (GP_REGNUM_P (regno
)
5432 || regno
== SP_REGNUM
5433 || regno
== FRAME_POINTER_REGNUM
5434 || regno
== ARG_POINTER_REGNUM
);
5437 /* Return true if X is a valid base register for mode MODE.
5438 STRICT_P is true if REG_OK_STRICT is in effect. */
5441 aarch64_base_register_rtx_p (rtx x
, bool strict_p
)
5444 && GET_CODE (x
) == SUBREG
5445 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (x
))])
5448 return (REG_P (x
) && aarch64_regno_ok_for_base_p (REGNO (x
), strict_p
));
5451 /* Return true if address offset is a valid index. If it is, fill in INFO
5452 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
5455 aarch64_classify_index (struct aarch64_address_info
*info
, rtx x
,
5456 machine_mode mode
, bool strict_p
)
5458 enum aarch64_address_type type
;
5463 if ((REG_P (x
) || GET_CODE (x
) == SUBREG
)
5464 && GET_MODE (x
) == Pmode
)
5466 type
= ADDRESS_REG_REG
;
5470 /* (sign_extend:DI (reg:SI)) */
5471 else if ((GET_CODE (x
) == SIGN_EXTEND
5472 || GET_CODE (x
) == ZERO_EXTEND
)
5473 && GET_MODE (x
) == DImode
5474 && GET_MODE (XEXP (x
, 0)) == SImode
)
5476 type
= (GET_CODE (x
) == SIGN_EXTEND
)
5477 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
5478 index
= XEXP (x
, 0);
5481 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5482 else if (GET_CODE (x
) == MULT
5483 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
5484 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
5485 && GET_MODE (XEXP (x
, 0)) == DImode
5486 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
5487 && CONST_INT_P (XEXP (x
, 1)))
5489 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
5490 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
5491 index
= XEXP (XEXP (x
, 0), 0);
5492 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
5494 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5495 else if (GET_CODE (x
) == ASHIFT
5496 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
5497 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
5498 && GET_MODE (XEXP (x
, 0)) == DImode
5499 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
5500 && CONST_INT_P (XEXP (x
, 1)))
5502 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
5503 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
5504 index
= XEXP (XEXP (x
, 0), 0);
5505 shift
= INTVAL (XEXP (x
, 1));
5507 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5508 else if ((GET_CODE (x
) == SIGN_EXTRACT
5509 || GET_CODE (x
) == ZERO_EXTRACT
)
5510 && GET_MODE (x
) == DImode
5511 && GET_CODE (XEXP (x
, 0)) == MULT
5512 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
5513 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
5515 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
5516 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
5517 index
= XEXP (XEXP (x
, 0), 0);
5518 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
5519 if (INTVAL (XEXP (x
, 1)) != 32 + shift
5520 || INTVAL (XEXP (x
, 2)) != 0)
5523 /* (and:DI (mult:DI (reg:DI) (const_int scale))
5524 (const_int 0xffffffff<<shift)) */
5525 else if (GET_CODE (x
) == AND
5526 && GET_MODE (x
) == DImode
5527 && GET_CODE (XEXP (x
, 0)) == MULT
5528 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
5529 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
5530 && CONST_INT_P (XEXP (x
, 1)))
5532 type
= ADDRESS_REG_UXTW
;
5533 index
= XEXP (XEXP (x
, 0), 0);
5534 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
5535 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
5538 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5539 else if ((GET_CODE (x
) == SIGN_EXTRACT
5540 || GET_CODE (x
) == ZERO_EXTRACT
)
5541 && GET_MODE (x
) == DImode
5542 && GET_CODE (XEXP (x
, 0)) == ASHIFT
5543 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
5544 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
5546 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
5547 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
5548 index
= XEXP (XEXP (x
, 0), 0);
5549 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
5550 if (INTVAL (XEXP (x
, 1)) != 32 + shift
5551 || INTVAL (XEXP (x
, 2)) != 0)
5554 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5555 (const_int 0xffffffff<<shift)) */
5556 else if (GET_CODE (x
) == AND
5557 && GET_MODE (x
) == DImode
5558 && GET_CODE (XEXP (x
, 0)) == ASHIFT
5559 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
5560 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
5561 && CONST_INT_P (XEXP (x
, 1)))
5563 type
= ADDRESS_REG_UXTW
;
5564 index
= XEXP (XEXP (x
, 0), 0);
5565 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
5566 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
5569 /* (mult:P (reg:P) (const_int scale)) */
5570 else if (GET_CODE (x
) == MULT
5571 && GET_MODE (x
) == Pmode
5572 && GET_MODE (XEXP (x
, 0)) == Pmode
5573 && CONST_INT_P (XEXP (x
, 1)))
5575 type
= ADDRESS_REG_REG
;
5576 index
= XEXP (x
, 0);
5577 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
5579 /* (ashift:P (reg:P) (const_int shift)) */
5580 else if (GET_CODE (x
) == ASHIFT
5581 && GET_MODE (x
) == Pmode
5582 && GET_MODE (XEXP (x
, 0)) == Pmode
5583 && CONST_INT_P (XEXP (x
, 1)))
5585 type
= ADDRESS_REG_REG
;
5586 index
= XEXP (x
, 0);
5587 shift
= INTVAL (XEXP (x
, 1));
5593 && GET_CODE (index
) == SUBREG
5594 && contains_reg_of_mode
[GENERAL_REGS
][GET_MODE (SUBREG_REG (index
))])
5595 index
= SUBREG_REG (index
);
5597 if (aarch64_sve_data_mode_p (mode
))
5599 if (type
!= ADDRESS_REG_REG
5600 || (1 << shift
) != GET_MODE_UNIT_SIZE (mode
))
5606 && !(IN_RANGE (shift
, 1, 3)
5607 && known_eq (1 << shift
, GET_MODE_SIZE (mode
))))
5612 && aarch64_regno_ok_for_index_p (REGNO (index
), strict_p
))
5615 info
->offset
= index
;
5616 info
->shift
= shift
;
5623 /* Return true if MODE is one of the modes for which we
5624 support LDP/STP operations. */
5627 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode
)
5629 return mode
== SImode
|| mode
== DImode
5630 || mode
== SFmode
|| mode
== DFmode
5631 || (aarch64_vector_mode_supported_p (mode
)
5632 && (known_eq (GET_MODE_SIZE (mode
), 8)
5633 || (known_eq (GET_MODE_SIZE (mode
), 16)
5634 && (aarch64_tune_params
.extra_tuning_flags
5635 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
) == 0)));
5638 /* Return true if REGNO is a virtual pointer register, or an eliminable
5639 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
5640 include stack_pointer or hard_frame_pointer. */
5642 virt_or_elim_regno_p (unsigned regno
)
5644 return ((regno
>= FIRST_VIRTUAL_REGISTER
5645 && regno
<= LAST_VIRTUAL_POINTER_REGISTER
)
5646 || regno
== FRAME_POINTER_REGNUM
5647 || regno
== ARG_POINTER_REGNUM
);
5650 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5651 If it is, fill in INFO appropriately. STRICT_P is true if
5652 REG_OK_STRICT is in effect. */
5655 aarch64_classify_address (struct aarch64_address_info
*info
,
5656 rtx x
, machine_mode mode
, bool strict_p
,
5657 aarch64_addr_query_type type
= ADDR_QUERY_M
)
5659 enum rtx_code code
= GET_CODE (x
);
5663 HOST_WIDE_INT const_size
;
5665 /* On BE, we use load/store pair for all large int mode load/stores.
5666 TI/TFmode may also use a load/store pair. */
5667 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
5668 bool advsimd_struct_p
= (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
));
5669 bool load_store_pair_p
= (type
== ADDR_QUERY_LDP_STP
5670 || type
== ADDR_QUERY_LDP_STP_N
5673 || (BYTES_BIG_ENDIAN
&& advsimd_struct_p
));
5675 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
5676 corresponds to the actual size of the memory being loaded/stored and the
5677 mode of the corresponding addressing mode is half of that. */
5678 if (type
== ADDR_QUERY_LDP_STP_N
5679 && known_eq (GET_MODE_SIZE (mode
), 16))
5682 bool allow_reg_index_p
= (!load_store_pair_p
5683 && (known_lt (GET_MODE_SIZE (mode
), 16)
5684 || vec_flags
== VEC_ADVSIMD
5685 || vec_flags
== VEC_SVE_DATA
));
5687 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5688 [Rn, #offset, MUL VL]. */
5689 if ((vec_flags
& (VEC_SVE_DATA
| VEC_SVE_PRED
)) != 0
5690 && (code
!= REG
&& code
!= PLUS
))
5693 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5695 if (advsimd_struct_p
5696 && !BYTES_BIG_ENDIAN
5697 && (code
!= POST_INC
&& code
!= REG
))
5700 gcc_checking_assert (GET_MODE (x
) == VOIDmode
5701 || SCALAR_INT_MODE_P (GET_MODE (x
)));
5707 info
->type
= ADDRESS_REG_IMM
;
5709 info
->offset
= const0_rtx
;
5710 info
->const_offset
= 0;
5711 return aarch64_base_register_rtx_p (x
, strict_p
);
5719 && virt_or_elim_regno_p (REGNO (op0
))
5720 && poly_int_rtx_p (op1
, &offset
))
5722 info
->type
= ADDRESS_REG_IMM
;
5725 info
->const_offset
= offset
;
5730 if (maybe_ne (GET_MODE_SIZE (mode
), 0)
5731 && aarch64_base_register_rtx_p (op0
, strict_p
)
5732 && poly_int_rtx_p (op1
, &offset
))
5734 info
->type
= ADDRESS_REG_IMM
;
5737 info
->const_offset
= offset
;
5739 /* TImode and TFmode values are allowed in both pairs of X
5740 registers and individual Q registers. The available
5742 X,X: 7-bit signed scaled offset
5743 Q: 9-bit signed offset
5744 We conservatively require an offset representable in either mode.
5745 When performing the check for pairs of X registers i.e. LDP/STP
5746 pass down DImode since that is the natural size of the LDP/STP
5747 instruction memory accesses. */
5748 if (mode
== TImode
|| mode
== TFmode
)
5749 return (aarch64_offset_7bit_signed_scaled_p (DImode
, offset
)
5750 && (offset_9bit_signed_unscaled_p (mode
, offset
)
5751 || offset_12bit_unsigned_scaled_p (mode
, offset
)));
5753 /* A 7bit offset check because OImode will emit a ldp/stp
5754 instruction (only big endian will get here).
5755 For ldp/stp instructions, the offset is scaled for the size of a
5756 single element of the pair. */
5758 return aarch64_offset_7bit_signed_scaled_p (TImode
, offset
);
5760 /* Three 9/12 bit offsets checks because CImode will emit three
5761 ldr/str instructions (only big endian will get here). */
5763 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
5764 && (offset_9bit_signed_unscaled_p (V16QImode
, offset
+ 32)
5765 || offset_12bit_unsigned_scaled_p (V16QImode
,
5768 /* Two 7bit offsets checks because XImode will emit two ldp/stp
5769 instructions (only big endian will get here). */
5771 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
5772 && aarch64_offset_7bit_signed_scaled_p (TImode
,
5775 /* Make "m" use the LD1 offset range for SVE data modes, so
5776 that pre-RTL optimizers like ivopts will work to that
5777 instead of the wider LDR/STR range. */
5778 if (vec_flags
== VEC_SVE_DATA
)
5779 return (type
== ADDR_QUERY_M
5780 ? offset_4bit_signed_scaled_p (mode
, offset
)
5781 : offset_9bit_signed_scaled_p (mode
, offset
));
5783 if (vec_flags
== (VEC_SVE_DATA
| VEC_STRUCT
))
5785 poly_int64 end_offset
= (offset
5786 + GET_MODE_SIZE (mode
)
5787 - BYTES_PER_SVE_VECTOR
);
5788 return (type
== ADDR_QUERY_M
5789 ? offset_4bit_signed_scaled_p (mode
, offset
)
5790 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE
, offset
)
5791 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE
,
5795 if (vec_flags
== VEC_SVE_PRED
)
5796 return offset_9bit_signed_scaled_p (mode
, offset
);
5798 if (load_store_pair_p
)
5799 return ((known_eq (GET_MODE_SIZE (mode
), 4)
5800 || known_eq (GET_MODE_SIZE (mode
), 8)
5801 || known_eq (GET_MODE_SIZE (mode
), 16))
5802 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
5804 return (offset_9bit_signed_unscaled_p (mode
, offset
)
5805 || offset_12bit_unsigned_scaled_p (mode
, offset
));
5808 if (allow_reg_index_p
)
5810 /* Look for base + (scaled/extended) index register. */
5811 if (aarch64_base_register_rtx_p (op0
, strict_p
)
5812 && aarch64_classify_index (info
, op1
, mode
, strict_p
))
5817 if (aarch64_base_register_rtx_p (op1
, strict_p
)
5818 && aarch64_classify_index (info
, op0
, mode
, strict_p
))
5831 info
->type
= ADDRESS_REG_WB
;
5832 info
->base
= XEXP (x
, 0);
5833 info
->offset
= NULL_RTX
;
5834 return aarch64_base_register_rtx_p (info
->base
, strict_p
);
5838 info
->type
= ADDRESS_REG_WB
;
5839 info
->base
= XEXP (x
, 0);
5840 if (GET_CODE (XEXP (x
, 1)) == PLUS
5841 && poly_int_rtx_p (XEXP (XEXP (x
, 1), 1), &offset
)
5842 && rtx_equal_p (XEXP (XEXP (x
, 1), 0), info
->base
)
5843 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
5845 info
->offset
= XEXP (XEXP (x
, 1), 1);
5846 info
->const_offset
= offset
;
5848 /* TImode and TFmode values are allowed in both pairs of X
5849 registers and individual Q registers. The available
5851 X,X: 7-bit signed scaled offset
5852 Q: 9-bit signed offset
5853 We conservatively require an offset representable in either mode.
5855 if (mode
== TImode
|| mode
== TFmode
)
5856 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
5857 && offset_9bit_signed_unscaled_p (mode
, offset
));
5859 if (load_store_pair_p
)
5860 return ((known_eq (GET_MODE_SIZE (mode
), 4)
5861 || known_eq (GET_MODE_SIZE (mode
), 8)
5862 || known_eq (GET_MODE_SIZE (mode
), 16))
5863 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
5865 return offset_9bit_signed_unscaled_p (mode
, offset
);
5872 /* load literal: pc-relative constant pool entry. Only supported
5873 for SI mode or larger. */
5874 info
->type
= ADDRESS_SYMBOLIC
;
5876 if (!load_store_pair_p
5877 && GET_MODE_SIZE (mode
).is_constant (&const_size
)
5882 split_const (x
, &sym
, &addend
);
5883 return ((GET_CODE (sym
) == LABEL_REF
5884 || (GET_CODE (sym
) == SYMBOL_REF
5885 && CONSTANT_POOL_ADDRESS_P (sym
)
5886 && aarch64_pcrelative_literal_loads
)));
5891 info
->type
= ADDRESS_LO_SUM
;
5892 info
->base
= XEXP (x
, 0);
5893 info
->offset
= XEXP (x
, 1);
5894 if (allow_reg_index_p
5895 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
5898 split_const (info
->offset
, &sym
, &offs
);
5899 if (GET_CODE (sym
) == SYMBOL_REF
5900 && (aarch64_classify_symbol (sym
, INTVAL (offs
))
5901 == SYMBOL_SMALL_ABSOLUTE
))
5903 /* The symbol and offset must be aligned to the access size. */
5906 if (CONSTANT_POOL_ADDRESS_P (sym
))
5907 align
= GET_MODE_ALIGNMENT (get_pool_mode (sym
));
5908 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym
))
5910 tree exp
= SYMBOL_REF_DECL (sym
);
5911 align
= TYPE_ALIGN (TREE_TYPE (exp
));
5912 align
= aarch64_constant_alignment (exp
, align
);
5914 else if (SYMBOL_REF_DECL (sym
))
5915 align
= DECL_ALIGN (SYMBOL_REF_DECL (sym
));
5916 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym
)
5917 && SYMBOL_REF_BLOCK (sym
) != NULL
)
5918 align
= SYMBOL_REF_BLOCK (sym
)->alignment
;
5920 align
= BITS_PER_UNIT
;
5922 poly_int64 ref_size
= GET_MODE_SIZE (mode
);
5923 if (known_eq (ref_size
, 0))
5924 ref_size
= GET_MODE_SIZE (DImode
);
5926 return (multiple_p (INTVAL (offs
), ref_size
)
5927 && multiple_p (align
/ BITS_PER_UNIT
, ref_size
));
5937 /* Return true if the address X is valid for a PRFM instruction.
5938 STRICT_P is true if we should do strict checking with
5939 aarch64_classify_address. */
5942 aarch64_address_valid_for_prefetch_p (rtx x
, bool strict_p
)
5944 struct aarch64_address_info addr
;
5946 /* PRFM accepts the same addresses as DImode... */
5947 bool res
= aarch64_classify_address (&addr
, x
, DImode
, strict_p
);
5951 /* ... except writeback forms. */
5952 return addr
.type
!= ADDRESS_REG_WB
;
5956 aarch64_symbolic_address_p (rtx x
)
5960 split_const (x
, &x
, &offset
);
5961 return GET_CODE (x
) == SYMBOL_REF
|| GET_CODE (x
) == LABEL_REF
;
5964 /* Classify the base of symbolic expression X. */
5966 enum aarch64_symbol_type
5967 aarch64_classify_symbolic_expression (rtx x
)
5971 split_const (x
, &x
, &offset
);
5972 return aarch64_classify_symbol (x
, INTVAL (offset
));
5976 /* Return TRUE if X is a legitimate address for accessing memory in
5979 aarch64_legitimate_address_hook_p (machine_mode mode
, rtx x
, bool strict_p
)
5981 struct aarch64_address_info addr
;
5983 return aarch64_classify_address (&addr
, x
, mode
, strict_p
);
5986 /* Return TRUE if X is a legitimate address of type TYPE for accessing
5987 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
5989 aarch64_legitimate_address_p (machine_mode mode
, rtx x
, bool strict_p
,
5990 aarch64_addr_query_type type
)
5992 struct aarch64_address_info addr
;
5994 return aarch64_classify_address (&addr
, x
, mode
, strict_p
, type
);
5997 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
6000 aarch64_legitimize_address_displacement (rtx
*offset1
, rtx
*offset2
,
6001 poly_int64 orig_offset
,
6005 if (GET_MODE_SIZE (mode
).is_constant (&size
))
6007 HOST_WIDE_INT const_offset
, second_offset
;
6009 /* A general SVE offset is A * VQ + B. Remove the A component from
6010 coefficient 0 in order to get the constant B. */
6011 const_offset
= orig_offset
.coeffs
[0] - orig_offset
.coeffs
[1];
6013 /* Split an out-of-range address displacement into a base and
6014 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
6015 range otherwise to increase opportunities for sharing the base
6016 address of different sizes. Unaligned accesses use the signed
6017 9-bit range, TImode/TFmode use the intersection of signed
6018 scaled 7-bit and signed 9-bit offset. */
6019 if (mode
== TImode
|| mode
== TFmode
)
6020 second_offset
= ((const_offset
+ 0x100) & 0x1f8) - 0x100;
6021 else if ((const_offset
& (size
- 1)) != 0)
6022 second_offset
= ((const_offset
+ 0x100) & 0x1ff) - 0x100;
6024 second_offset
= const_offset
& (size
< 4 ? 0xfff : 0x3ffc);
6026 if (second_offset
== 0 || known_eq (orig_offset
, second_offset
))
6029 /* Split the offset into second_offset and the rest. */
6030 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
6031 *offset2
= gen_int_mode (second_offset
, Pmode
);
6036 /* Get the mode we should use as the basis of the range. For structure
6037 modes this is the mode of one vector. */
6038 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
6039 machine_mode step_mode
6040 = (vec_flags
& VEC_STRUCT
) != 0 ? SVE_BYTE_MODE
: mode
;
6042 /* Get the "mul vl" multiplier we'd like to use. */
6043 HOST_WIDE_INT factor
= GET_MODE_SIZE (step_mode
).coeffs
[1];
6044 HOST_WIDE_INT vnum
= orig_offset
.coeffs
[1] / factor
;
6045 if (vec_flags
& VEC_SVE_DATA
)
6046 /* LDR supports a 9-bit range, but the move patterns for
6047 structure modes require all vectors to be in range of the
6048 same base. The simplest way of accomodating that while still
6049 promoting reuse of anchor points between different modes is
6050 to use an 8-bit range unconditionally. */
6051 vnum
= ((vnum
+ 128) & 255) - 128;
6053 /* Predicates are only handled singly, so we might as well use
6055 vnum
= ((vnum
+ 256) & 511) - 256;
6059 /* Convert the "mul vl" multiplier into a byte offset. */
6060 poly_int64 second_offset
= GET_MODE_SIZE (step_mode
) * vnum
;
6061 if (known_eq (second_offset
, orig_offset
))
6064 /* Split the offset into second_offset and the rest. */
6065 *offset1
= gen_int_mode (orig_offset
- second_offset
, Pmode
);
6066 *offset2
= gen_int_mode (second_offset
, Pmode
);
6071 /* Return the binary representation of floating point constant VALUE in INTVAL.
6072 If the value cannot be converted, return false without setting INTVAL.
6073 The conversion is done in the given MODE. */
6075 aarch64_reinterpret_float_as_int (rtx value
, unsigned HOST_WIDE_INT
*intval
)
6078 /* We make a general exception for 0. */
6079 if (aarch64_float_const_zero_rtx_p (value
))
6085 scalar_float_mode mode
;
6086 if (GET_CODE (value
) != CONST_DOUBLE
6087 || !is_a
<scalar_float_mode
> (GET_MODE (value
), &mode
)
6088 || GET_MODE_BITSIZE (mode
) > HOST_BITS_PER_WIDE_INT
6089 /* Only support up to DF mode. */
6090 || GET_MODE_BITSIZE (mode
) > GET_MODE_BITSIZE (DFmode
))
6093 unsigned HOST_WIDE_INT ival
= 0;
6096 real_to_target (res
,
6097 CONST_DOUBLE_REAL_VALUE (value
),
6098 REAL_MODE_FORMAT (mode
));
6102 int order
= BYTES_BIG_ENDIAN
? 1 : 0;
6103 ival
= zext_hwi (res
[order
], 32);
6104 ival
|= (zext_hwi (res
[1 - order
], 32) << 32);
6107 ival
= zext_hwi (res
[0], 32);
6113 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6114 single MOV(+MOVK) followed by an FMOV. */
6116 aarch64_float_const_rtx_p (rtx x
)
6118 machine_mode mode
= GET_MODE (x
);
6119 if (mode
== VOIDmode
)
6122 /* Determine whether it's cheaper to write float constants as
6123 mov/movk pairs over ldr/adrp pairs. */
6124 unsigned HOST_WIDE_INT ival
;
6126 if (GET_CODE (x
) == CONST_DOUBLE
6127 && SCALAR_FLOAT_MODE_P (mode
)
6128 && aarch64_reinterpret_float_as_int (x
, &ival
))
6130 scalar_int_mode imode
= (mode
== HFmode
6132 : int_mode_for_mode (mode
).require ());
6133 int num_instr
= aarch64_internal_mov_immediate
6134 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
6135 return num_instr
< 3;
6141 /* Return TRUE if rtx X is immediate constant 0.0 */
6143 aarch64_float_const_zero_rtx_p (rtx x
)
6145 if (GET_MODE (x
) == VOIDmode
)
6148 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x
)))
6149 return !HONOR_SIGNED_ZEROS (GET_MODE (x
));
6150 return real_equal (CONST_DOUBLE_REAL_VALUE (x
), &dconst0
);
6153 /* Return TRUE if rtx X is immediate constant that fits in a single
6154 MOVI immediate operation. */
6156 aarch64_can_const_movi_rtx_p (rtx x
, machine_mode mode
)
6162 scalar_int_mode imode
;
6163 unsigned HOST_WIDE_INT ival
;
6165 if (GET_CODE (x
) == CONST_DOUBLE
6166 && SCALAR_FLOAT_MODE_P (mode
))
6168 if (!aarch64_reinterpret_float_as_int (x
, &ival
))
6171 /* We make a general exception for 0. */
6172 if (aarch64_float_const_zero_rtx_p (x
))
6175 imode
= int_mode_for_mode (mode
).require ();
6177 else if (GET_CODE (x
) == CONST_INT
6178 && is_a
<scalar_int_mode
> (mode
, &imode
))
6183 /* use a 64 bit mode for everything except for DI/DF mode, where we use
6184 a 128 bit vector mode. */
6185 int width
= GET_MODE_BITSIZE (imode
) == 64 ? 128 : 64;
6187 vmode
= aarch64_simd_container_mode (imode
, width
);
6188 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, ival
);
6190 return aarch64_simd_valid_immediate (v_op
, NULL
);
6194 /* Return the fixed registers used for condition codes. */
6197 aarch64_fixed_condition_code_regs (unsigned int *p1
, unsigned int *p2
)
6200 *p2
= INVALID_REGNUM
;
6204 /* This function is used by the call expanders of the machine description.
6205 RESULT is the register in which the result is returned. It's NULL for
6206 "call" and "sibcall".
6207 MEM is the location of the function call.
6208 SIBCALL indicates whether this function call is normal call or sibling call.
6209 It will generate different pattern accordingly. */
6212 aarch64_expand_call (rtx result
, rtx mem
, bool sibcall
)
6214 rtx call
, callee
, tmp
;
6218 gcc_assert (MEM_P (mem
));
6219 callee
= XEXP (mem
, 0);
6220 mode
= GET_MODE (callee
);
6221 gcc_assert (mode
== Pmode
);
6223 /* Decide if we should generate indirect calls by loading the
6224 address of the callee into a register before performing
6225 the branch-and-link. */
6226 if (SYMBOL_REF_P (callee
)
6227 ? (aarch64_is_long_call_p (callee
)
6228 || aarch64_is_noplt_call_p (callee
))
6230 XEXP (mem
, 0) = force_reg (mode
, callee
);
6232 call
= gen_rtx_CALL (VOIDmode
, mem
, const0_rtx
);
6234 if (result
!= NULL_RTX
)
6235 call
= gen_rtx_SET (result
, call
);
6240 tmp
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (Pmode
, LR_REGNUM
));
6242 vec
= gen_rtvec (2, call
, tmp
);
6243 call
= gen_rtx_PARALLEL (VOIDmode
, vec
);
6245 aarch64_emit_call_insn (call
);
6248 /* Emit call insn with PAT and do aarch64-specific handling. */
6251 aarch64_emit_call_insn (rtx pat
)
6253 rtx insn
= emit_call_insn (pat
);
6255 rtx
*fusage
= &CALL_INSN_FUNCTION_USAGE (insn
);
6256 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP0_REGNUM
));
6257 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP1_REGNUM
));
6261 aarch64_select_cc_mode (RTX_CODE code
, rtx x
, rtx y
)
6263 /* All floating point compares return CCFP if it is an equality
6264 comparison, and CCFPE otherwise. */
6265 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_FLOAT
)
6292 /* Equality comparisons of short modes against zero can be performed
6293 using the TST instruction with the appropriate bitmask. */
6294 if (y
== const0_rtx
&& REG_P (x
)
6295 && (code
== EQ
|| code
== NE
)
6296 && (GET_MODE (x
) == HImode
|| GET_MODE (x
) == QImode
))
6299 /* Similarly, comparisons of zero_extends from shorter modes can
6300 be performed using an ANDS with an immediate mask. */
6301 if (y
== const0_rtx
&& GET_CODE (x
) == ZERO_EXTEND
6302 && (GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
6303 && (GET_MODE (XEXP (x
, 0)) == HImode
|| GET_MODE (XEXP (x
, 0)) == QImode
)
6304 && (code
== EQ
|| code
== NE
))
6307 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
6309 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
)
6310 && (GET_CODE (x
) == PLUS
|| GET_CODE (x
) == MINUS
|| GET_CODE (x
) == AND
6311 || GET_CODE (x
) == NEG
6312 || (GET_CODE (x
) == ZERO_EXTRACT
&& CONST_INT_P (XEXP (x
, 1))
6313 && CONST_INT_P (XEXP (x
, 2)))))
6316 /* A compare with a shifted operand. Because of canonicalization,
6317 the comparison will have to be swapped when we emit the assembly
6319 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
6320 && (REG_P (y
) || GET_CODE (y
) == SUBREG
|| y
== const0_rtx
)
6321 && (GET_CODE (x
) == ASHIFT
|| GET_CODE (x
) == ASHIFTRT
6322 || GET_CODE (x
) == LSHIFTRT
6323 || GET_CODE (x
) == ZERO_EXTEND
|| GET_CODE (x
) == SIGN_EXTEND
))
6326 /* Similarly for a negated operand, but we can only do this for
6328 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
6329 && (REG_P (y
) || GET_CODE (y
) == SUBREG
)
6330 && (code
== EQ
|| code
== NE
)
6331 && GET_CODE (x
) == NEG
)
6334 /* A test for unsigned overflow. */
6335 if ((GET_MODE (x
) == DImode
|| GET_MODE (x
) == TImode
)
6337 && GET_CODE (x
) == PLUS
6338 && GET_CODE (y
) == ZERO_EXTEND
)
6341 /* A test for signed overflow. */
6342 if ((GET_MODE (x
) == DImode
|| GET_MODE (x
) == TImode
)
6344 && GET_CODE (x
) == PLUS
6345 && GET_CODE (y
) == SIGN_EXTEND
)
6348 /* For everything else, return CCmode. */
6353 aarch64_get_condition_code_1 (machine_mode
, enum rtx_code
);
6356 aarch64_get_condition_code (rtx x
)
6358 machine_mode mode
= GET_MODE (XEXP (x
, 0));
6359 enum rtx_code comp_code
= GET_CODE (x
);
6361 if (GET_MODE_CLASS (mode
) != MODE_CC
)
6362 mode
= SELECT_CC_MODE (comp_code
, XEXP (x
, 0), XEXP (x
, 1));
6363 return aarch64_get_condition_code_1 (mode
, comp_code
);
6367 aarch64_get_condition_code_1 (machine_mode mode
, enum rtx_code comp_code
)
6375 case GE
: return AARCH64_GE
;
6376 case GT
: return AARCH64_GT
;
6377 case LE
: return AARCH64_LS
;
6378 case LT
: return AARCH64_MI
;
6379 case NE
: return AARCH64_NE
;
6380 case EQ
: return AARCH64_EQ
;
6381 case ORDERED
: return AARCH64_VC
;
6382 case UNORDERED
: return AARCH64_VS
;
6383 case UNLT
: return AARCH64_LT
;
6384 case UNLE
: return AARCH64_LE
;
6385 case UNGT
: return AARCH64_HI
;
6386 case UNGE
: return AARCH64_PL
;
6394 case NE
: return AARCH64_NE
;
6395 case EQ
: return AARCH64_EQ
;
6396 case GE
: return AARCH64_GE
;
6397 case GT
: return AARCH64_GT
;
6398 case LE
: return AARCH64_LE
;
6399 case LT
: return AARCH64_LT
;
6400 case GEU
: return AARCH64_CS
;
6401 case GTU
: return AARCH64_HI
;
6402 case LEU
: return AARCH64_LS
;
6403 case LTU
: return AARCH64_CC
;
6411 case NE
: return AARCH64_NE
;
6412 case EQ
: return AARCH64_EQ
;
6413 case GE
: return AARCH64_LE
;
6414 case GT
: return AARCH64_LT
;
6415 case LE
: return AARCH64_GE
;
6416 case LT
: return AARCH64_GT
;
6417 case GEU
: return AARCH64_LS
;
6418 case GTU
: return AARCH64_CC
;
6419 case LEU
: return AARCH64_CS
;
6420 case LTU
: return AARCH64_HI
;
6428 case NE
: return AARCH64_NE
;
6429 case EQ
: return AARCH64_EQ
;
6430 case GE
: return AARCH64_PL
;
6431 case LT
: return AARCH64_MI
;
6439 case NE
: return AARCH64_NE
;
6440 case EQ
: return AARCH64_EQ
;
6448 case NE
: return AARCH64_CS
;
6449 case EQ
: return AARCH64_CC
;
6457 case NE
: return AARCH64_VS
;
6458 case EQ
: return AARCH64_VC
;
6471 aarch64_const_vec_all_same_in_range_p (rtx x
,
6472 HOST_WIDE_INT minval
,
6473 HOST_WIDE_INT maxval
)
6476 return (const_vec_duplicate_p (x
, &elt
)
6477 && CONST_INT_P (elt
)
6478 && IN_RANGE (INTVAL (elt
), minval
, maxval
));
6482 aarch64_const_vec_all_same_int_p (rtx x
, HOST_WIDE_INT val
)
6484 return aarch64_const_vec_all_same_in_range_p (x
, val
, val
);
6487 /* Return true if VEC is a constant in which every element is in the range
6488 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
6491 aarch64_const_vec_all_in_range_p (rtx vec
,
6492 HOST_WIDE_INT minval
,
6493 HOST_WIDE_INT maxval
)
6495 if (GET_CODE (vec
) != CONST_VECTOR
6496 || GET_MODE_CLASS (GET_MODE (vec
)) != MODE_VECTOR_INT
)
6500 if (!CONST_VECTOR_STEPPED_P (vec
))
6501 nunits
= const_vector_encoded_nelts (vec
);
6502 else if (!CONST_VECTOR_NUNITS (vec
).is_constant (&nunits
))
6505 for (int i
= 0; i
< nunits
; i
++)
6507 rtx vec_elem
= CONST_VECTOR_ELT (vec
, i
);
6508 if (!CONST_INT_P (vec_elem
)
6509 || !IN_RANGE (INTVAL (vec_elem
), minval
, maxval
))
/* N Z C V.  */
#define AARCH64_CC_V 1
#define AARCH64_CC_C (1 << 1)
#define AARCH64_CC_Z (1 << 2)
#define AARCH64_CC_N (1 << 3)

/* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
static const int aarch64_nzcv_codes[] =
{
  0,		/* EQ, Z == 1.  */
  AARCH64_CC_Z,	/* NE, Z == 0.  */
  0,		/* CS, C == 1.  */
  AARCH64_CC_C,	/* CC, C == 0.  */
  0,		/* MI, N == 1.  */
  AARCH64_CC_N, /* PL, N == 0.  */
  0,		/* VS, V == 1.  */
  AARCH64_CC_V, /* VC, V == 0.  */
  0,		/* HI, C ==1 && Z == 0.  */
  AARCH64_CC_C,	/* LS, !(C == 1 && Z == 0).  */
  AARCH64_CC_V,	/* GE, N == V.  */
  0,		/* LT, N != V.  */
  AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
  0,		/* LE, !(Z == 0 && N == V).  */
  0,		/* AL, Any.  */
  0		/* NV, Any.  */
};
6542 /* Print floating-point vector immediate operand X to F, negating it
6543 first if NEGATE is true. Return true on success, false if it isn't
6544 a constant we can handle. */
6547 aarch64_print_vector_float_operand (FILE *f
, rtx x
, bool negate
)
6551 if (!const_vec_duplicate_p (x
, &elt
))
6554 REAL_VALUE_TYPE r
= *CONST_DOUBLE_REAL_VALUE (elt
);
6556 r
= real_value_negate (&r
);
6558 /* We only handle the SVE single-bit immediates here. */
6559 if (real_equal (&r
, &dconst0
))
6560 asm_fprintf (f
, "0.0");
6561 else if (real_equal (&r
, &dconst1
))
6562 asm_fprintf (f
, "1.0");
6563 else if (real_equal (&r
, &dconsthalf
))
6564 asm_fprintf (f
, "0.5");
/* Return the equivalent letter for size.  */
static char
sizetochar (int size)
{
  switch (size)
    {
    case 64: return 'd';
    case 32: return 's';
    case 16: return 'h';
    case 8 : return 'b';
    default: gcc_unreachable ();
    }
}
6585 /* Print operand X to file F in a target specific manner according to CODE.
6586 The acceptable formatting commands given by CODE are:
6587 'c': An integer or symbol address without a preceding #
6589 'C': Take the duplicated element in a vector constant
6590 and print it in hex.
6591 'D': Take the duplicated element in a vector constant
6592 and print it as an unsigned integer, in decimal.
6593 'e': Print the sign/zero-extend size as a character 8->b,
6595 'p': Prints N such that 2^N == X (X must be power of 2 and
6597 'P': Print the number of non-zero bits in X (a const_int).
6598 'H': Print the higher numbered register of a pair (TImode)
6600 'm': Print a condition (eq, ne, etc).
6601 'M': Same as 'm', but invert condition.
6602 'N': Take the duplicated element in a vector constant
6603 and print the negative of it in decimal.
6604 'b/h/s/d/q': Print a scalar FP/SIMD register name.
6605 'S/T/U/V': Print a FP/SIMD register name for a register list.
6606 The register printed is the FP/SIMD register name
6607 of X + 0/1/2/3 for S/T/U/V.
6608 'R': Print a scalar FP/SIMD register name + 1.
6609 'X': Print bottom 16 bits of integer constant in hex.
6610 'w/x': Print a general register name or the zero register
6612 '0': Print a normal operand, if it's a general register,
6613 then we assume DImode.
6614 'k': Print NZCV for conditional compare instructions.
6615 'A': Output address constant representing the first
6616 argument of X, specifying a relocation offset
6618 'L': Output constant address specified by X
6619 with a relocation offset if appropriate.
6620 'G': Prints address of X, specifying a PC relative
6621 relocation mode if appropriate.
6622 'y': Output address of LDP or STP - this is used for
6623 some LDP/STPs which don't use a PARALLEL in their
6624 pattern (so the mode needs to be adjusted).
6625 'z': Output address of a typical LDP or STP. */
6628 aarch64_print_operand (FILE *f
, rtx x
, int code
)
6634 switch (GET_CODE (x
))
6637 fprintf (f
, HOST_WIDE_INT_PRINT_DEC
, INTVAL (x
));
6641 output_addr_const (f
, x
);
6645 if (GET_CODE (XEXP (x
, 0)) == PLUS
6646 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
)
6648 output_addr_const (f
, x
);
6654 output_operand_lossage ("unsupported operand for code '%c'", code
);
6662 if (!CONST_INT_P (x
)
6663 || (n
= exact_log2 (INTVAL (x
) & ~7)) <= 0)
6665 output_operand_lossage ("invalid operand for '%%%c'", code
);
6681 output_operand_lossage ("invalid operand for '%%%c'", code
);
6691 if (!CONST_INT_P (x
) || (n
= exact_log2 (INTVAL (x
))) < 0)
6693 output_operand_lossage ("invalid operand for '%%%c'", code
);
6697 asm_fprintf (f
, "%d", n
);
6702 if (!CONST_INT_P (x
))
6704 output_operand_lossage ("invalid operand for '%%%c'", code
);
6708 asm_fprintf (f
, "%u", popcount_hwi (INTVAL (x
)));
6712 if (x
== const0_rtx
)
6714 asm_fprintf (f
, "xzr");
6718 if (!REG_P (x
) || !GP_REGNUM_P (REGNO (x
) + 1))
6720 output_operand_lossage ("invalid operand for '%%%c'", code
);
6724 asm_fprintf (f
, "%s", reg_names
[REGNO (x
) + 1]);
6731 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
6732 if (x
== const_true_rtx
)
6739 if (!COMPARISON_P (x
))
6741 output_operand_lossage ("invalid operand for '%%%c'", code
);
6745 cond_code
= aarch64_get_condition_code (x
);
6746 gcc_assert (cond_code
>= 0);
6748 cond_code
= AARCH64_INVERSE_CONDITION_CODE (cond_code
);
6749 fputs (aarch64_condition_codes
[cond_code
], f
);
6754 if (!const_vec_duplicate_p (x
, &elt
))
6756 output_operand_lossage ("invalid vector constant");
6760 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
6761 asm_fprintf (f
, "%wd", -INTVAL (elt
));
6762 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
6763 && aarch64_print_vector_float_operand (f
, x
, true))
6767 output_operand_lossage ("invalid vector constant");
6777 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
6779 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
6782 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - V0_REGNUM
);
6789 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
6791 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
6794 asm_fprintf (f
, "%c%d",
6795 aarch64_sve_data_mode_p (GET_MODE (x
)) ? 'z' : 'v',
6796 REGNO (x
) - V0_REGNUM
+ (code
- 'S'));
6800 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
6802 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
6805 asm_fprintf (f
, "q%d", REGNO (x
) - V0_REGNUM
+ 1);
6809 if (!CONST_INT_P (x
))
6811 output_operand_lossage ("invalid operand for '%%%c'", code
);
6814 asm_fprintf (f
, "0x%wx", UINTVAL (x
) & 0xffff);
6819 /* Print a replicated constant in hex. */
6820 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
6822 output_operand_lossage ("invalid operand for '%%%c'", code
);
6825 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
6826 asm_fprintf (f
, "0x%wx", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
6832 /* Print a replicated constant in decimal, treating it as
6834 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
6836 output_operand_lossage ("invalid operand for '%%%c'", code
);
6839 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
6840 asm_fprintf (f
, "%wd", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
6847 || (CONST_DOUBLE_P (x
) && aarch64_float_const_zero_rtx_p (x
)))
6849 asm_fprintf (f
, "%czr", code
);
6853 if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
6855 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - R0_REGNUM
);
6859 if (REG_P (x
) && REGNO (x
) == SP_REGNUM
)
6861 asm_fprintf (f
, "%ssp", code
== 'w' ? "w" : "");
6870 output_operand_lossage ("missing operand");
6874 switch (GET_CODE (x
))
6877 if (aarch64_sve_data_mode_p (GET_MODE (x
)))
6879 if (REG_NREGS (x
) == 1)
6880 asm_fprintf (f
, "z%d", REGNO (x
) - V0_REGNUM
);
6884 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x
)));
6885 asm_fprintf (f
, "{z%d.%c - z%d.%c}",
6886 REGNO (x
) - V0_REGNUM
, suffix
,
6887 END_REGNO (x
) - V0_REGNUM
- 1, suffix
);
6891 asm_fprintf (f
, "%s", reg_names
[REGNO (x
)]);
6895 output_address (GET_MODE (x
), XEXP (x
, 0));
6900 output_addr_const (asm_out_file
, x
);
6904 asm_fprintf (f
, "%wd", INTVAL (x
));
6908 if (!VECTOR_MODE_P (GET_MODE (x
)))
6910 output_addr_const (asm_out_file
, x
);
6916 if (!const_vec_duplicate_p (x
, &elt
))
6918 output_operand_lossage ("invalid vector constant");
6922 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
6923 asm_fprintf (f
, "%wd", INTVAL (elt
));
6924 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
6925 && aarch64_print_vector_float_operand (f
, x
, false))
6929 output_operand_lossage ("invalid vector constant");
6935 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6936 be getting CONST_DOUBLEs holding integers. */
6937 gcc_assert (GET_MODE (x
) != VOIDmode
);
6938 if (aarch64_float_const_zero_rtx_p (x
))
6943 else if (aarch64_float_const_representable_p (x
))
6946 char float_buf
[buf_size
] = {'\0'};
6947 real_to_decimal_for_mode (float_buf
,
6948 CONST_DOUBLE_REAL_VALUE (x
),
6951 asm_fprintf (asm_out_file
, "%s", float_buf
);
6955 output_operand_lossage ("invalid constant");
6958 output_operand_lossage ("invalid operand");
6964 if (GET_CODE (x
) == HIGH
)
6967 switch (aarch64_classify_symbolic_expression (x
))
6969 case SYMBOL_SMALL_GOT_4G
:
6970 asm_fprintf (asm_out_file
, ":got:");
6973 case SYMBOL_SMALL_TLSGD
:
6974 asm_fprintf (asm_out_file
, ":tlsgd:");
6977 case SYMBOL_SMALL_TLSDESC
:
6978 asm_fprintf (asm_out_file
, ":tlsdesc:");
6981 case SYMBOL_SMALL_TLSIE
:
6982 asm_fprintf (asm_out_file
, ":gottprel:");
6985 case SYMBOL_TLSLE24
:
6986 asm_fprintf (asm_out_file
, ":tprel:");
6989 case SYMBOL_TINY_GOT
:
6996 output_addr_const (asm_out_file
, x
);
7000 switch (aarch64_classify_symbolic_expression (x
))
7002 case SYMBOL_SMALL_GOT_4G
:
7003 asm_fprintf (asm_out_file
, ":lo12:");
7006 case SYMBOL_SMALL_TLSGD
:
7007 asm_fprintf (asm_out_file
, ":tlsgd_lo12:");
7010 case SYMBOL_SMALL_TLSDESC
:
7011 asm_fprintf (asm_out_file
, ":tlsdesc_lo12:");
7014 case SYMBOL_SMALL_TLSIE
:
7015 asm_fprintf (asm_out_file
, ":gottprel_lo12:");
7018 case SYMBOL_TLSLE12
:
7019 asm_fprintf (asm_out_file
, ":tprel_lo12:");
7022 case SYMBOL_TLSLE24
:
7023 asm_fprintf (asm_out_file
, ":tprel_lo12_nc:");
7026 case SYMBOL_TINY_GOT
:
7027 asm_fprintf (asm_out_file
, ":got:");
7030 case SYMBOL_TINY_TLSIE
:
7031 asm_fprintf (asm_out_file
, ":gottprel:");
7037 output_addr_const (asm_out_file
, x
);
7041 switch (aarch64_classify_symbolic_expression (x
))
7043 case SYMBOL_TLSLE24
:
7044 asm_fprintf (asm_out_file
, ":tprel_hi12:");
7049 output_addr_const (asm_out_file
, x
);
7054 HOST_WIDE_INT cond_code
;
7056 if (!CONST_INT_P (x
))
7058 output_operand_lossage ("invalid operand for '%%%c'", code
);
7062 cond_code
= INTVAL (x
);
7063 gcc_assert (cond_code
>= 0 && cond_code
<= AARCH64_NV
);
7064 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
]);
7071 machine_mode mode
= GET_MODE (x
);
7073 if (GET_CODE (x
) != MEM
7074 || (code
== 'y' && maybe_ne (GET_MODE_SIZE (mode
), 16)))
7076 output_operand_lossage ("invalid operand for '%%%c'", code
);
7080 if (!aarch64_print_address_internal (f
, mode
, XEXP (x
, 0),
7082 ? ADDR_QUERY_LDP_STP_N
7083 : ADDR_QUERY_LDP_STP
))
7084 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
7089 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
7094 /* Print address 'x' of a memory access with mode 'mode'.
7095 'op' is the context required by aarch64_classify_address. It can either be
7096 MEM for a normal memory access or PARALLEL for LDP/STP.
   Returns true and prints the address to F on success; emits an
   output_operand_lossage diagnostic for a non-Pmode address.
   NOTE(review): this text is a lossy extraction -- brace/short lines
   (the switch header, breaks, returns) were dropped by the extractor;
   restore from upstream GCC before compiling. */
7098 aarch64_print_address_internal (FILE *f
, machine_mode mode
, rtx x
,
7099 aarch64_addr_query_type type
)
7101 struct aarch64_address_info addr
;
7104 /* Check all addresses are Pmode - including ILP32. */
7105 if (GET_MODE (x
) != Pmode
)
7106 output_operand_lossage ("invalid address mode");
/* Classify the address; the cases below dispatch on addr.type.  */
7108 if (aarch64_classify_address (&addr
, x
, mode
, true, type
))
/* Base register plus immediate offset, e.g. [x0] or [x0, 16].  SVE data
   and predicate modes print the offset in vector-length units ("mul vl").  */
7111 case ADDRESS_REG_IMM
:
7112 if (known_eq (addr
.const_offset
, 0))
7113 asm_fprintf (f
, "[%s]", reg_names
[REGNO (addr
.base
)]);
7114 else if (aarch64_sve_data_mode_p (mode
))
7117 = exact_div (addr
.const_offset
,
7118 BYTES_PER_SVE_VECTOR
).to_constant ();
7119 asm_fprintf (f
, "[%s, #%wd, mul vl]",
7120 reg_names
[REGNO (addr
.base
)], vnum
);
7122 else if (aarch64_sve_pred_mode_p (mode
))
7125 = exact_div (addr
.const_offset
,
7126 BYTES_PER_SVE_PRED
).to_constant ();
7127 asm_fprintf (f
, "[%s, #%wd, mul vl]",
7128 reg_names
[REGNO (addr
.base
)], vnum
);
7131 asm_fprintf (f
, "[%s, %wd]", reg_names
[REGNO (addr
.base
)],
7132 INTVAL (addr
.offset
));
/* Base plus register offset, optionally shifted left ("lsl").  */
7135 case ADDRESS_REG_REG
:
7136 if (addr
.shift
== 0)
7137 asm_fprintf (f
, "[%s, %s]", reg_names
[REGNO (addr
.base
)],
7138 reg_names
[REGNO (addr
.offset
)]);
7140 asm_fprintf (f
, "[%s, %s, lsl %u]", reg_names
[REGNO (addr
.base
)],
7141 reg_names
[REGNO (addr
.offset
)], addr
.shift
);
/* Base plus zero-extended 32-bit register offset ("uxtw").  */
7144 case ADDRESS_REG_UXTW
:
7145 if (addr
.shift
== 0)
7146 asm_fprintf (f
, "[%s, w%d, uxtw]", reg_names
[REGNO (addr
.base
)],
7147 REGNO (addr
.offset
) - R0_REGNUM
);
7149 asm_fprintf (f
, "[%s, w%d, uxtw %u]", reg_names
[REGNO (addr
.base
)],
7150 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
/* Base plus sign-extended 32-bit register offset ("sxtw").  */
7153 case ADDRESS_REG_SXTW
:
7154 if (addr
.shift
== 0)
7155 asm_fprintf (f
, "[%s, w%d, sxtw]", reg_names
[REGNO (addr
.base
)],
7156 REGNO (addr
.offset
) - R0_REGNUM
);
7158 asm_fprintf (f
, "[%s, w%d, sxtw %u]", reg_names
[REGNO (addr
.base
)],
7159 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
/* Pre/post increment/decrement/modify writeback addressing.  */
7162 case ADDRESS_REG_WB
:
7163 /* Writeback is only supported for fixed-width modes. */
7164 size
= GET_MODE_SIZE (mode
).to_constant ();
7165 switch (GET_CODE (x
))
7168 asm_fprintf (f
, "[%s, %d]!", reg_names
[REGNO (addr
.base
)], size
);
7171 asm_fprintf (f
, "[%s], %d", reg_names
[REGNO (addr
.base
)], size
);
7174 asm_fprintf (f
, "[%s, -%d]!", reg_names
[REGNO (addr
.base
)], size
);
7177 asm_fprintf (f
, "[%s], -%d", reg_names
[REGNO (addr
.base
)], size
);
7180 asm_fprintf (f
, "[%s, %wd]!", reg_names
[REGNO (addr
.base
)],
7181 INTVAL (addr
.offset
));
7184 asm_fprintf (f
, "[%s], %wd", reg_names
[REGNO (addr
.base
)],
7185 INTVAL (addr
.offset
));
/* LO_SUM: base register plus low 12 bits of a symbol relocation.  */
7192 case ADDRESS_LO_SUM
:
7193 asm_fprintf (f
, "[%s, #:lo12:", reg_names
[REGNO (addr
.base
)]);
7194 output_addr_const (f
, addr
.offset
);
7195 asm_fprintf (f
, "]");
/* PC-relative symbolic address: print the constant expression itself.  */
7198 case ADDRESS_SYMBOLIC
:
7199 output_addr_const (f
, x
);
7206 /* Print address 'x' of a memory access with mode 'mode'.
   ADDR_QUERY_ANY accepts any addressing form; when X still fails to
   classify, fall back to printing it as a bare constant expression.  */
7208 aarch64_print_operand_address (FILE *f
, machine_mode mode
, rtx x
)
7210 if (!aarch64_print_address_internal (f
, mode
, x
, ADDR_QUERY_ANY
))
7211 output_addr_const (f
, x
);
/* Return nonzero if X (an rtx) mentions a CODE_LABEL, walking the
   expression recursively via its rtx format string.  UNSPEC_TLS
   references are deliberately excluded (see comment below).  */
7215 aarch64_label_mentioned_p (rtx x
)
7220 if (GET_CODE (x
) == LABEL_REF
)
7223 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7224 referencing instruction, but they are constant offsets, not
7226 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
/* Walk sub-rtxes: 'E' elements are vectors, 'e' elements are plain
   sub-expressions; recurse into each.  */
7229 fmt
= GET_RTX_FORMAT (GET_CODE (x
));
7230 for (i
= GET_RTX_LENGTH (GET_CODE (x
)) - 1; i
>= 0; i
--)
7236 for (j
= XVECLEN (x
, i
) - 1; j
>= 0; j
--)
7237 if (aarch64_label_mentioned_p (XVECEXP (x
, i
, j
)))
7240 else if (fmt
[i
] == 'e' && aarch64_label_mentioned_p (XEXP (x
, i
)))
7247 /* Implement REGNO_REG_CLASS.  Map a hard register number to the most
   specific register class containing it.
   NOTE(review): the extractor dropped short lines here (e.g. the
   return for the SP_REGNUM test and the final fallback return) --
   restore from upstream GCC before compiling.  */
7250 aarch64_regno_regclass (unsigned regno
)
7252 if (GP_REGNUM_P (regno
))
7253 return GENERAL_REGS
;
7255 if (regno
== SP_REGNUM
)
7258 if (regno
== FRAME_POINTER_REGNUM
7259 || regno
== ARG_POINTER_REGNUM
)
7260 return POINTER_REGS
;
/* Vector registers: the low half (V0-V15) forms its own class.  */
7262 if (FP_REGNUM_P (regno
))
7263 return FP_LO_REGNUM_P (regno
) ? FP_LO_REGS
: FP_REGS
;
/* SVE predicate registers, split into low/high classes.  */
7265 if (PR_REGNUM_P (regno
))
7266 return PR_LO_REGNUM_P (regno
) ? PR_LO_REGS
: PR_HI_REGS
;
7271 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
7272 If OFFSET is out of range, return an offset of an anchor point
7273 that is in range. Return 0 otherwise.
   The masks below round OFFSET to an anchor whose remaining delta fits
   the relevant load/store immediate encoding.  */
7275 static HOST_WIDE_INT
7276 aarch64_anchor_offset (HOST_WIDE_INT offset
, HOST_WIDE_INT size
,
7279 /* Does it look like we'll need a 16-byte load/store-pair operation? */
7281 return (offset
+ 0x400) & ~0x7f0;
7283 /* For offsets that aren't a multiple of the access size, the limit is
7285 if (offset
& (size
- 1))
7287 /* BLKmode typically uses LDP of X-registers. */
7288 if (mode
== BLKmode
)
7289 return (offset
+ 512) & ~0x3ff;
7290 return (offset
+ 0x100) & ~0x1ff;
7293 /* Small negative offsets are supported. */
7294 if (IN_RANGE (offset
, -256, 0))
/* TImode/TFmode use LDP/STP, which have a smaller immediate range.  */
7297 if (mode
== TImode
|| mode
== TFmode
)
7298 return (offset
+ 0x100) & ~0x1ff;
7300 /* Use 12-bit offset by access size. */
7301 return offset
& (~0xfff * size
);
/* Implement TARGET_LEGITIMIZE_ADDRESS: try to rewrite X into a form
   that is a legitimate AArch64 address for MODE.  */
7305 aarch64_legitimize_address (rtx x
, rtx
/* orig_x */, machine_mode mode
)
7307 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7308 where mask is selected by alignment and size of the offset.
7309 We try to pick as large a range for the offset as possible to
7310 maximize the chance of a CSE. However, for aligned addresses
7311 we limit the range to 4k so that structures with different sized
7312 elements are likely to use the same base. We need to be careful
7313 not to split a CONST for some forms of address expression, otherwise
7314 it will generate sub-optimal code. */
7316 if (GET_CODE (x
) == PLUS
&& CONST_INT_P (XEXP (x
, 1)))
7318 rtx base
= XEXP (x
, 0);
7319 rtx offset_rtx
= XEXP (x
, 1);
7320 HOST_WIDE_INT offset
= INTVAL (offset_rtx
);
/* Nested PLUS: (reg + reg) + const.  Materialize the inner sum first.  */
7322 if (GET_CODE (base
) == PLUS
)
7324 rtx op0
= XEXP (base
, 0);
7325 rtx op1
= XEXP (base
, 1);
7327 /* Force any scaling into a temp for CSE. */
7328 op0
= force_reg (Pmode
, op0
);
7329 op1
= force_reg (Pmode
, op1
);
7331 /* Let the pointer register be in op0. */
7332 if (REG_POINTER (op1
))
7333 std::swap (op0
, op1
);
7335 /* If the pointer is virtual or frame related, then we know that
7336 virtual register instantiation or register elimination is going
7337 to apply a second constant. We want the two constants folded
7338 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
7339 if (virt_or_elim_regno_p (REGNO (op0
)))
7341 base
= expand_binop (Pmode
, add_optab
, op0
, offset_rtx
,
7342 NULL_RTX
, true, OPTAB_DIRECT
);
7343 return gen_rtx_PLUS (Pmode
, base
, op1
);
7346 /* Otherwise, in order to encourage CSE (and thence loop strength
7347 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
7348 base
= expand_binop (Pmode
, add_optab
, op0
, op1
,
7349 NULL_RTX
, true, OPTAB_DIRECT
);
7350 x
= gen_rtx_PLUS (Pmode
, base
, offset_rtx
);
/* Split out-of-range constant offsets around an anchor point so the
   residual offset fits the addressing-mode immediate.  */
7354 if (GET_MODE_SIZE (mode
).is_constant (&size
))
7356 HOST_WIDE_INT base_offset
= aarch64_anchor_offset (offset
, size
,
7358 if (base_offset
!= 0)
7360 base
= plus_constant (Pmode
, base
, base_offset
);
7361 base
= force_operand (base
, NULL_RTX
);
7362 return plus_constant (Pmode
, base
, offset
- base_offset
);
/* Implement TARGET_SECONDARY_RELOAD: return an intermediate register
   class (or set SRI->icode to a reload pattern) for moves that cannot
   be done directly between X and class RCLASS in MODE.  */
7371 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED
, rtx x
,
7374 secondary_reload_info
*sri
)
7376 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7377 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
7378 comment at the head of aarch64-sve.md for more details about the
7379 big-endian handling. */
7380 if (BYTES_BIG_ENDIAN
7381 && reg_class_subset_p (rclass
, FP_REGS
)
7382 && !((REG_P (x
) && HARD_REGISTER_P (x
))
7383 || aarch64_simd_valid_immediate (x
, NULL
))
7384 && aarch64_sve_data_mode_p (mode
))
7386 sri
->icode
= CODE_FOR_aarch64_sve_reload_be
;
7390 /* If we have to disable direct literal pool loads and stores because the
7391 function is too big, then we need a scratch register. */
7392 if (MEM_P (x
) && GET_CODE (x
) == SYMBOL_REF
&& CONSTANT_POOL_ADDRESS_P (x
)
7393 && (SCALAR_FLOAT_MODE_P (GET_MODE (x
))
7394 || targetm
.vector_mode_supported_p (GET_MODE (x
)))
7395 && !aarch64_pcrelative_literal_loads
)
7397 sri
->icode
= code_for_aarch64_reload_movcp (mode
, DImode
);
7401 /* Without the TARGET_SIMD instructions we cannot move a Q register
7402 to a Q register directly. We need a scratch. */
7403 if (REG_P (x
) && (mode
== TFmode
|| mode
== TImode
) && mode
== GET_MODE (x
)
7404 && FP_REGNUM_P (REGNO (x
)) && !TARGET_SIMD
7405 && reg_class_subset_p (rclass
, FP_REGS
))
7407 sri
->icode
= code_for_aarch64_reload_mov (mode
);
7411 /* A TFmode or TImode memory access should be handled via an FP_REGS
7412 because AArch64 has richer addressing modes for LDR/STR instructions
7413 than LDP/STP instructions. */
7414 if (TARGET_FLOAT
&& rclass
== GENERAL_REGS
7415 && known_eq (GET_MODE_SIZE (mode
), 16) && MEM_P (x
))
/* 128-bit constants for FP_REGS go via GENERAL_REGS.  */
7418 if (rclass
== FP_REGS
&& (mode
== TImode
|| mode
== TFmode
) && CONSTANT_P(x
))
7419 return GENERAL_REGS
;
/* Implement TARGET_CAN_ELIMINATE: decide whether register FROM may be
   eliminated in favour of register TO for the current function.  */
7425 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED
, const int to
)
7427 gcc_assert (from
== ARG_POINTER_REGNUM
|| from
== FRAME_POINTER_REGNUM
);
7429 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7430 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
7431 if (frame_pointer_needed
)
7432 return to
== HARD_FRAME_POINTER_REGNUM
;
/* Implement INITIAL_ELIMINATION_OFFSET: distance between eliminable
   register FROM and its replacement TO, read from the laid-out frame.  */
7437 aarch64_initial_elimination_offset (unsigned from
, unsigned to
)
/* Make sure cfun->machine->frame is up to date before reading it.  */
7439 aarch64_layout_frame ();
7441 if (to
== HARD_FRAME_POINTER_REGNUM
)
7443 if (from
== ARG_POINTER_REGNUM
)
7444 return cfun
->machine
->frame
.hard_fp_offset
;
7446 if (from
== FRAME_POINTER_REGNUM
)
7447 return cfun
->machine
->frame
.hard_fp_offset
7448 - cfun
->machine
->frame
.locals_offset
;
7451 if (to
== STACK_POINTER_REGNUM
)
7453 if (from
== FRAME_POINTER_REGNUM
)
7454 return cfun
->machine
->frame
.frame_size
7455 - cfun
->machine
->frame
.locals_offset
;
/* ARG_POINTER -> STACK_POINTER: the whole frame.  */
7458 return cfun
->machine
->frame
.frame_size
;
7461 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
   previous frame.
   NOTE(review): the extractor dropped the COUNT != 0 guard lines here
   (original lines 7466-7468) -- restore from upstream GCC.  */
7465 aarch64_return_addr (int count
, rtx frame ATTRIBUTE_UNUSED
)
7469 return get_hard_reg_initial_val (Pmode
, LR_REGNUM
);
/* Implement TARGET_ASM_TRAMPOLINE_TEMPLATE: emit the fixed code part of
   a nested-function trampoline (PC-relative loads of the target address
   and static chain, then an indirect branch), followed by zeroed slots
   that aarch64_trampoline_init fills in at run time.  The first pair of
   asm_fprintf calls is the ILP32 variant (w-registers), the second the
   LP64 variant (x-registers).  */
7474 aarch64_asm_trampoline_template (FILE *f
)
7478 asm_fprintf (f
, "\tldr\tw%d, .+16\n", IP1_REGNUM
- R0_REGNUM
);
7479 asm_fprintf (f
, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM
- R0_REGNUM
);
7483 asm_fprintf (f
, "\tldr\t%s, .+16\n", reg_names
[IP1_REGNUM
]);
7484 asm_fprintf (f
, "\tldr\t%s, .+20\n", reg_names
[STATIC_CHAIN_REGNUM
]);
7486 asm_fprintf (f
, "\tbr\t%s\n", reg_names
[IP1_REGNUM
]);
/* Placeholder data slots: padding plus the two pointer-sized operands.  */
7487 assemble_aligned_integer (4, const0_rtx
);
7488 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
7489 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
/* Implement TARGET_TRAMPOLINE_INIT: copy the trampoline template into
   M_TRAMP, then store FNDECL's address and CHAIN_VALUE into the two
   data slots after the 16 bytes of code, and flush the icache.  */
7493 aarch64_trampoline_init (rtx m_tramp
, tree fndecl
, rtx chain_value
)
7495 rtx fnaddr
, mem
, a_tramp
;
7496 const int tramp_code_sz
= 16;
7498 /* Don't need to copy the trailing D-words, we fill those in below. */
7499 emit_block_move (m_tramp
, assemble_trampoline_template (),
7500 GEN_INT (tramp_code_sz
), BLOCK_OP_NORMAL
);
/* First slot after the code: the target function's address.  */
7501 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
);
7502 fnaddr
= XEXP (DECL_RTL (fndecl
), 0);
7503 if (GET_MODE (fnaddr
) != ptr_mode
)
7504 fnaddr
= convert_memory_address (ptr_mode
, fnaddr
);
7505 emit_move_insn (mem
, fnaddr
);
/* Second slot: the static chain value.  */
7507 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
+ POINTER_BYTES
);
7508 emit_move_insn (mem
, chain_value
);
7510 /* XXX We should really define a "clear_cache" pattern and use
7511 gen_clear_cache(). */
7512 a_tramp
= XEXP (m_tramp
, 0);
7513 emit_library_call (gen_rtx_SYMBOL_REF (Pmode
, "__clear_cache"),
7514 LCT_NORMAL
, VOIDmode
, a_tramp
, ptr_mode
,
7515 plus_constant (ptr_mode
, a_tramp
, TRAMPOLINE_SIZE
),
/* Implement TARGET_CLASS_MAX_NREGS: the maximum number of registers of
   class REGCLASS needed to hold a value of mode MODE.  */
7519 static unsigned char
7520 aarch64_class_max_nregs (reg_class_t regclass
, machine_mode mode
)
7522 /* ??? Logically we should only need to provide a value when
7523 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7524 can hold MODE, but at the moment we need to handle all modes.
7525 Just ignore any runtime parts for registers that can't store them. */
7526 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
7530 case TAILCALL_ADDR_REGS
:
7534 case POINTER_AND_FP_REGS
:
/* SVE data modes occupy a whole number of Z registers.  */
7537 if (aarch64_sve_data_mode_p (mode
)
7538 && constant_multiple_p (GET_MODE_SIZE (mode
),
7539 BYTES_PER_SVE_VECTOR
, &nregs
))
7541 return (aarch64_vector_data_mode_p (mode
)
7542 ? CEIL (lowest_size
, UNITS_PER_VREG
)
7543 : CEIL (lowest_size
, UNITS_PER_WORD
));
/* Implement TARGET_PREFERRED_RELOAD_CLASS: narrow REGCLASS to a class
   that can actually be used to reload X.  */
7560 aarch64_preferred_reload_class (rtx x
, reg_class_t regclass
)
7562 if (regclass
== POINTER_REGS
)
7563 return GENERAL_REGS
;
7565 if (regclass
== STACK_REG
)
7568 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x
)), POINTER_REGS
))
7574 /* Register elimination can result in a request for
7575 SP+constant->FP_REGS. We cannot support such operations which
7576 use SP as source and an FP_REG as destination, so reject out
7578 if (! reg_class_subset_p (regclass
, GENERAL_REGS
) && GET_CODE (x
) == PLUS
)
7580 rtx lhs
= XEXP (x
, 0);
7582 /* Look through a possible SUBREG introduced by ILP32. */
7583 if (GET_CODE (lhs
) == SUBREG
)
7584 lhs
= SUBREG_REG (lhs
);
7586 gcc_assert (REG_P (lhs
));
7587 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs
)),
/* Implement ASM_OUTPUT_LABELREF: print NAME with the user-label prefix.  */
7596 aarch64_asm_output_labelref (FILE* f
, const char *name
)
7598 asm_fprintf (f
, "%U%s", name
);
/* Emit SYMBOL as a static constructor.  Default-priority constructors go
   through the generic .init_array path; prioritized ones get their own
   ".init_array.NNNNN" section so the linker sorts them.  */
7602 aarch64_elf_asm_constructor (rtx symbol
, int priority
)
7604 if (priority
== DEFAULT_INIT_PRIORITY
)
7605 default_ctor_section_asm_out_constructor (symbol
, priority
);
7609 /* While priority is known to be in range [0, 65535], so 18 bytes
7610 would be enough, the compiler might not know that. To avoid
7611 -Wformat-truncation false positive, use a larger size. */
7613 snprintf (buf
, sizeof (buf
), ".init_array.%.5u", priority
);
7614 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
7615 switch_to_section (s
);
7616 assemble_align (POINTER_SIZE
);
7617 assemble_aligned_integer (POINTER_BYTES
, symbol
);
/* Emit SYMBOL as a static destructor; mirror of the constructor case,
   using ".fini_array.NNNNN" sections for prioritized destructors.  */
7622 aarch64_elf_asm_destructor (rtx symbol
, int priority
)
7624 if (priority
== DEFAULT_INIT_PRIORITY
)
7625 default_dtor_section_asm_out_destructor (symbol
, priority
);
7629 /* While priority is known to be in range [0, 65535], so 18 bytes
7630 would be enough, the compiler might not know that. To avoid
7631 -Wformat-truncation false positive, use a larger size. */
7633 snprintf (buf
, sizeof (buf
), ".fini_array.%.5u", priority
);
7634 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
7635 switch_to_section (s
);
7636 assemble_align (POINTER_SIZE
);
7637 assemble_aligned_integer (POINTER_BYTES
, symbol
);
/* Emit the assembly for a casesi jump-table dispatch.  OPERANDS[2] is
   the jump-table label; the entry size of the following ADDR_DIFF_VEC
   selects one of the load/add pattern pairs below.  */
7642 aarch64_output_casesi (rtx
*operands
)
7646 rtx diff_vec
= PATTERN (NEXT_INSN (as_a
<rtx_insn
*> (operands
[2])));
/* Indexed by log2 of the table entry size: {load entry, add to base}.  */
7648 static const char *const patterns
[4][2] =
7651 "ldrb\t%w3, [%0,%w1,uxtw]",
7652 "add\t%3, %4, %w3, sxtb #2"
7655 "ldrh\t%w3, [%0,%w1,uxtw #1]",
7656 "add\t%3, %4, %w3, sxth #2"
7659 "ldr\t%w3, [%0,%w1,uxtw #2]",
7660 "add\t%3, %4, %w3, sxtw #2"
7662 /* We assume that DImode is only generated when not optimizing and
7663 that we don't really need 64-bit address offsets. That would
7664 imply an object file with 8GB of code in a single function! */
7666 "ldr\t%w3, [%0,%w1,uxtw #2]",
7667 "add\t%3, %4, %w3, sxtw #2"
7671 gcc_assert (GET_CODE (diff_vec
) == ADDR_DIFF_VEC
);
7673 scalar_int_mode mode
= as_a
<scalar_int_mode
> (GET_MODE (diff_vec
));
7674 index
= exact_log2 (GET_MODE_SIZE (mode
));
7676 gcc_assert (index
>= 0 && index
<= 3);
7678 /* Need to implement table size reduction, by changing the code below. */
7679 output_asm_insn (patterns
[index
][0], operands
);
7680 ASM_GENERATE_INTERNAL_LABEL (label
, "Lrtx", CODE_LABEL_NUMBER (operands
[2]));
7681 snprintf (buf
, sizeof (buf
),
7682 "adr\t%%4, %s", targetm
.strip_name_encoding (label
));
7683 output_asm_insn (buf
, operands
);
7684 output_asm_insn (patterns
[index
][1], operands
);
7685 output_asm_insn ("br\t%3", operands
);
7686 assemble_label (asm_out_file
, label
);
7691 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7692 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
   operand (i.e. MASK == (2**size - 1) << SHIFT for size in {8,16,32}).  */
7696 aarch64_uxt_size (int shift
, HOST_WIDE_INT mask
)
7698 if (shift
>= 0 && shift
<= 3)
7701 for (size
= 8; size
<= 32; size
*= 2)
7703 HOST_WIDE_INT bits
= ((HOST_WIDE_INT
)1U << size
) - 1;
7704 if (mask
== bits
<< shift
)
7711 /* Constant pools are per function only when PC relative
7712 literal loads are true or we are in the large memory
   model.  */
7716 aarch64_can_use_per_function_literal_pools_p (void)
7718 return (aarch64_pcrelative_literal_loads
7719 || aarch64_cmodel
== AARCH64_CMODEL_LARGE
);
/* Implement TARGET_USE_BLOCKS_FOR_CONSTANT_P.  */
7723 aarch64_use_blocks_for_constant_p (machine_mode
, const_rtx
)
7725 /* We can't use blocks for constants when we're using a per-function
   literal pool.  */
7727 return !aarch64_can_use_per_function_literal_pools_p ();
7730 /* Select appropriate section for constants depending
7731 on where we place literal pools. */
7734 aarch64_select_rtx_section (machine_mode mode
,
7736 unsigned HOST_WIDE_INT align
)
/* Per-function pools live in the function's own section.  */
7738 if (aarch64_can_use_per_function_literal_pools_p ())
7739 return function_section (current_function_decl
);
7741 return default_elf_select_rtx_section (mode
, x
, align
);
7744 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
7746 aarch64_asm_output_pool_epilogue (FILE *f
, const char *, tree
,
7747 HOST_WIDE_INT offset
)
7749 /* When using per-function literal pools, we must ensure that any code
7750 section is aligned to the minimal instruction length, lest we get
7751 errors from the assembler re "unaligned instructions". */
7752 if ((offset
& 3) && aarch64_can_use_per_function_literal_pools_p ())
7753 ASM_OUTPUT_ALIGN (f
, 2);
7758 /* Helper function for rtx cost calculation. Strip a shift expression
7759 from X. Returns the inner operand if successful, or the original
7760 expression on failure. */
7762 aarch64_strip_shift (rtx x
)
7766 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7767 we can convert both to ROR during final output. */
7768 if ((GET_CODE (op
) == ASHIFT
7769 || GET_CODE (op
) == ASHIFTRT
7770 || GET_CODE (op
) == LSHIFTRT
7771 || GET_CODE (op
) == ROTATERT
7772 || GET_CODE (op
) == ROTATE
)
7773 && CONST_INT_P (XEXP (op
, 1)))
7774 return XEXP (op
, 0);
/* A multiply by a power of two is a canonicalized left shift.  */
7776 if (GET_CODE (op
) == MULT
7777 && CONST_INT_P (XEXP (op
, 1))
7778 && ((unsigned) exact_log2 (INTVAL (XEXP (op
, 1)))) < 64)
7779 return XEXP (op
, 0);
7784 /* Helper function for rtx cost calculation. Strip an extend
7785 expression from X. Returns the inner operand if successful, or the
7786 original expression on failure. We deal with a number of possible
7787 canonicalization variations here. If STRIP_SHIFT is true, then
7788 we can strip off a shift also. */
7790 aarch64_strip_extend (rtx x
, bool strip_shift
)
7792 scalar_int_mode mode
;
7795 if (!is_a
<scalar_int_mode
> (GET_MODE (op
), &mode
))
7798 /* Zero and sign extraction of a widened value. */
7799 if ((GET_CODE (op
) == ZERO_EXTRACT
|| GET_CODE (op
) == SIGN_EXTRACT
)
7800 && XEXP (op
, 2) == const0_rtx
7801 && GET_CODE (XEXP (op
, 0)) == MULT
7802 && aarch64_is_extend_from_extract (mode
, XEXP (XEXP (op
, 0), 1),
7804 return XEXP (XEXP (op
, 0), 0);
7806 /* It can also be represented (for zero-extend) as an AND with an
   immediate mask.  */
7808 if (GET_CODE (op
) == AND
7809 && GET_CODE (XEXP (op
, 0)) == MULT
7810 && CONST_INT_P (XEXP (XEXP (op
, 0), 1))
7811 && CONST_INT_P (XEXP (op
, 1))
7812 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op
, 0), 1))),
7813 INTVAL (XEXP (op
, 1))) != 0)
7814 return XEXP (XEXP (op
, 0), 0);
7816 /* Now handle extended register, as this may also have an optional
7817 left shift by 1..4. */
7819 && GET_CODE (op
) == ASHIFT
7820 && CONST_INT_P (XEXP (op
, 1))
7821 && ((unsigned HOST_WIDE_INT
) INTVAL (XEXP (op
, 1))) <= 4)
/* Finally, a plain zero/sign extend of a register.  */
7824 if (GET_CODE (op
) == ZERO_EXTEND
7825 || GET_CODE (op
) == SIGN_EXTEND
)
7834 /* Return true iff CODE is a shift supported in combination
7835 with arithmetic instructions. */
7838 aarch64_shift_p (enum rtx_code code
)
7840 return code
== ASHIFT
|| code
== ASHIFTRT
|| code
== LSHIFTRT
;
7844 /* Return true iff X is a cheap shift without a sign extend.
   "Cheap" here is controlled by the CHEAP_SHIFT_EXTEND tuning flag and
   limited to shift amounts of at most 4.  */
7847 aarch64_cheap_mult_shift_p (rtx x
)
7854 if (!(aarch64_tune_params
.extra_tuning_flags
7855 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
))
7858 if (GET_CODE (op0
) == SIGN_EXTEND
)
7861 if (GET_CODE (x
) == ASHIFT
&& CONST_INT_P (op1
)
7862 && UINTVAL (op1
) <= 4)
/* A MULT by a power of two is a canonicalized shift; check its log2.  */
7865 if (GET_CODE (x
) != MULT
|| !CONST_INT_P (op1
))
7868 HOST_WIDE_INT l2
= exact_log2 (INTVAL (op1
));
7870 if (l2
> 0 && l2
<= 4)
7876 /* Helper function for rtx cost calculation. Calculate the cost of
7877 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7878 Return the calculated cost of the expression, recursing manually in to
7879 operands where needed. */
7882 aarch64_rtx_mult_cost (rtx x
, enum rtx_code code
, int outer
, bool speed
)
7885 const struct cpu_cost_table
*extra_cost
7886 = aarch64_tune_params
.insn_extra_cost
;
/* OUTER being PLUS/MINUS means X is one operand of a fused
   multiply-add/sub (MADD/MSUB/FMADD/...).  */
7888 bool compound_p
= (outer
== PLUS
|| outer
== MINUS
);
7889 machine_mode mode
= GET_MODE (x
);
7891 gcc_checking_assert (code
== MULT
);
/* Cost vector multiplies per element.  */
7896 if (VECTOR_MODE_P (mode
))
7897 mode
= GET_MODE_INNER (mode
);
7899 /* Integer multiply/fma. */
7900 if (GET_MODE_CLASS (mode
) == MODE_INT
)
7902 /* The multiply will be canonicalized as a shift, cost it as such. */
7903 if (aarch64_shift_p (GET_CODE (x
))
7904 || (CONST_INT_P (op1
)
7905 && exact_log2 (INTVAL (op1
)) > 0))
7907 bool is_extend
= GET_CODE (op0
) == ZERO_EXTEND
7908 || GET_CODE (op0
) == SIGN_EXTEND
;
7913 /* If the shift is considered cheap,
7914 then don't add any cost. */
7915 if (aarch64_cheap_mult_shift_p (x
))
7917 else if (REG_P (op1
))
7918 /* ARITH + shift-by-register. */
7919 cost
+= extra_cost
->alu
.arith_shift_reg
;
7921 /* ARITH + extended register. We don't have a cost field
7922 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
7923 cost
+= extra_cost
->alu
.extend_arith
;
7925 /* ARITH + shift-by-immediate. */
7926 cost
+= extra_cost
->alu
.arith_shift
;
7929 /* LSL (immediate). */
7930 cost
+= extra_cost
->alu
.shift
;
7933 /* Strip extends as we will have costed them in the case above. */
7935 op0
= aarch64_strip_extend (op0
, true);
7937 cost
+= rtx_cost (op0
, VOIDmode
, code
, 0, speed
);
7942 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
7943 compound and let the below cases handle it. After all, MNEG is a
7944 special-case alias of MSUB. */
7945 if (GET_CODE (op0
) == NEG
)
7947 op0
= XEXP (op0
, 0);
7951 /* Integer multiplies or FMAs have zero/sign extending variants. */
7952 if ((GET_CODE (op0
) == ZERO_EXTEND
7953 && GET_CODE (op1
) == ZERO_EXTEND
)
7954 || (GET_CODE (op0
) == SIGN_EXTEND
7955 && GET_CODE (op1
) == SIGN_EXTEND
))
7957 cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, MULT
, 0, speed
);
7958 cost
+= rtx_cost (XEXP (op1
, 0), VOIDmode
, MULT
, 1, speed
);
7963 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
7964 cost
+= extra_cost
->mult
[0].extend_add
;
7966 /* MUL/SMULL/UMULL. */
7967 cost
+= extra_cost
->mult
[0].extend
;
7973 /* This is either an integer multiply or a MADD. In both cases
7974 we want to recurse and cost the operands. */
7975 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
7976 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
/* MADD/MSUB vs plain MUL; the cost table is indexed by DImode-ness.  */
7982 cost
+= extra_cost
->mult
[mode
== DImode
].add
;
7985 cost
+= extra_cost
->mult
[mode
== DImode
].simple
;
7994 /* Floating-point FMA/FMUL can also support negations of the
7995 operands, unless the rounding mode is upward or downward in
7996 which case FNMUL is different than FMUL with operand negation. */
7997 bool neg0
= GET_CODE (op0
) == NEG
;
7998 bool neg1
= GET_CODE (op1
) == NEG
;
7999 if (compound_p
|| !flag_rounding_math
|| (neg0
&& neg1
))
8002 op0
= XEXP (op0
, 0);
8004 op1
= XEXP (op1
, 0);
8008 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8009 cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
8012 cost
+= extra_cost
->fp
[mode
== DFmode
].mult
;
/* Recurse into the FP operands.  */
8015 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
8016 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
/* Implement TARGET_ADDRESS_COST: cost of address X for an access of
   mode MODE, built from the tuning target's address cost table.  */
8022 aarch64_address_cost (rtx x
,
8024 addr_space_t as ATTRIBUTE_UNUSED
,
8027 enum rtx_code c
= GET_CODE (x
);
8028 const struct cpu_addrcost_table
*addr_cost
= aarch64_tune_params
.addr_cost
;
8029 struct aarch64_address_info info
;
/* Addresses that don't classify: symbolic refs are costed through the
   generic rtx-cost machinery; anything else is treated as a register
   offset (most likely a jump table).  */
8033 if (!aarch64_classify_address (&info
, x
, mode
, false))
8035 if (GET_CODE (x
) == CONST
|| GET_CODE (x
) == SYMBOL_REF
)
8037 /* This is a CONST or SYMBOL ref which will be split
8038 in a different way depending on the code model in use.
8039 Cost it through the generic infrastructure. */
8040 int cost_symbol_ref
= rtx_cost (x
, Pmode
, MEM
, 1, speed
);
8041 /* Divide through by the cost of one instruction to
8042 bring it to the same units as the address costs. */
8043 cost_symbol_ref
/= COSTS_N_INSNS (1);
8044 /* The cost is then the cost of preparing the address,
8045 followed by an immediate (possibly 0) offset. */
8046 return cost_symbol_ref
+ addr_cost
->imm_offset
;
8050 /* This is most likely a jump table from a case
   statement.  */
8052 return addr_cost
->register_offset
;
/* Classified addresses: add the per-form base cost.  */
8058 case ADDRESS_LO_SUM
:
8059 case ADDRESS_SYMBOLIC
:
8060 case ADDRESS_REG_IMM
:
8061 cost
+= addr_cost
->imm_offset
;
8064 case ADDRESS_REG_WB
:
8065 if (c
== PRE_INC
|| c
== PRE_DEC
|| c
== PRE_MODIFY
)
8066 cost
+= addr_cost
->pre_modify
;
8067 else if (c
== POST_INC
|| c
== POST_DEC
|| c
== POST_MODIFY
)
8068 cost
+= addr_cost
->post_modify
;
8074 case ADDRESS_REG_REG
:
8075 cost
+= addr_cost
->register_offset
;
8078 case ADDRESS_REG_SXTW
:
8079 cost
+= addr_cost
->register_sextend
;
8082 case ADDRESS_REG_UXTW
:
8083 cost
+= addr_cost
->register_zextend
;
8093 /* For the sake of calculating the cost of the shifted register
8094 component, we can treat same sized modes in the same way. */
8095 if (known_eq (GET_MODE_BITSIZE (mode
), 16))
8096 cost
+= addr_cost
->addr_scale_costs
.hi
;
8097 else if (known_eq (GET_MODE_BITSIZE (mode
), 32))
8098 cost
+= addr_cost
->addr_scale_costs
.si
;
8099 else if (known_eq (GET_MODE_BITSIZE (mode
), 64))
8100 cost
+= addr_cost
->addr_scale_costs
.di
;
8102 /* We can't tell, or this is a 128-bit vector. */
8103 cost
+= addr_cost
->addr_scale_costs
.ti
;
8109 /* Return the cost of a branch. If SPEED_P is true then the compiler is
8110 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
   to be taken the usual way.  */
8114 aarch64_branch_cost (bool speed_p
, bool predictable_p
)
8116 /* When optimizing for speed, use the cost of unpredictable branches. */
8117 const struct cpu_branch_cost
*branch_costs
=
8118 aarch64_tune_params
.branch_costs
;
8120 if (!speed_p
|| predictable_p
)
8121 return branch_costs
->predictable
;
8123 return branch_costs
->unpredictable
;
8126 /* Return true if the RTX X in mode MODE is a zero or sign extract
8127 usable in an ADD or SUB (extended register) instruction. */
8129 aarch64_rtx_arith_op_extract_p (rtx x
, scalar_int_mode mode
)
8131 /* Catch add with a sign extract.
8132 This is add_<optab><mode>_multp2. */
8133 if (GET_CODE (x
) == SIGN_EXTRACT
8134 || GET_CODE (x
) == ZERO_EXTRACT
)
8136 rtx op0
= XEXP (x
, 0);
8137 rtx op1
= XEXP (x
, 1);
8138 rtx op2
= XEXP (x
, 2);
/* (extract (mult r c) width 0) is how combine canonicalizes a
   shifted extend; validate it against the extract helper.  */
8140 if (GET_CODE (op0
) == MULT
8141 && CONST_INT_P (op1
)
8142 && op2
== const0_rtx
8143 && CONST_INT_P (XEXP (op0
, 1))
8144 && aarch64_is_extend_from_extract (mode
,
8151 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
   A plain extend of a register.  */
8153 else if (GET_CODE (x
) == SIGN_EXTEND
8154 || GET_CODE (x
) == ZERO_EXTEND
)
8155 return REG_P (XEXP (x
, 0));
/* NOTE(review): the extractor dropped this function's entire body
   (original lines 8162-8178); only the signature survives.  Restore
   from upstream GCC -- it tests whether U is one of the FRINT* unspec
   codes, judging by the name; confirm against the original source.  */
8161 aarch64_frint_unspec_p (unsigned int u
)
8179 /* Return true iff X is an rtx that will match an extr instruction
8180 i.e. as described in the *extr<mode>5_insn family of patterns.
8181 OP0 and OP1 will be set to the operands of the shifts involved
8182 on success and will be NULL_RTX otherwise. */
8185 aarch64_extr_rtx_p (rtx x
, rtx
*res_op0
, rtx
*res_op1
)
8188 scalar_int_mode mode
;
8189 if (!is_a
<scalar_int_mode
> (GET_MODE (x
), &mode
))
8192 *res_op0
= NULL_RTX
;
8193 *res_op1
= NULL_RTX
;
/* EXTR is an IOR of complementary left/right shifts of two values.  */
8195 if (GET_CODE (x
) != IOR
)
8201 if ((GET_CODE (op0
) == ASHIFT
&& GET_CODE (op1
) == LSHIFTRT
)
8202 || (GET_CODE (op1
) == ASHIFT
&& GET_CODE (op0
) == LSHIFTRT
))
8204 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
8205 if (GET_CODE (op1
) == ASHIFT
)
8206 std::swap (op0
, op1
);
8208 if (!CONST_INT_P (XEXP (op0
, 1)) || !CONST_INT_P (XEXP (op1
, 1)))
8211 unsigned HOST_WIDE_INT shft_amnt_0
= UINTVAL (XEXP (op0
, 1));
8212 unsigned HOST_WIDE_INT shft_amnt_1
= UINTVAL (XEXP (op1
, 1));
/* The two shift amounts must sum to exactly the mode width.  */
8214 if (shft_amnt_0
< GET_MODE_BITSIZE (mode
)
8215 && shft_amnt_0
+ shft_amnt_1
== GET_MODE_BITSIZE (mode
))
8217 *res_op0
= XEXP (op0
, 0);
8218 *res_op1
= XEXP (op1
, 0);
8226 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8227 storing it in *COST. Result is true if the total cost of the operation
8228 has now been calculated. */
8230 aarch64_if_then_else_costs (rtx op0
, rtx op1
, rtx op2
, int *cost
, bool speed
)
8234 enum rtx_code cmpcode
;
8236 if (COMPARISON_P (op0
))
8238 inner
= XEXP (op0
, 0);
8239 comparator
= XEXP (op0
, 1);
8240 cmpcode
= GET_CODE (op0
);
8245 comparator
= const0_rtx
;
8249 if (GET_CODE (op1
) == PC
|| GET_CODE (op2
) == PC
)
8251 /* Conditional branch. */
8252 if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
8256 if (cmpcode
== NE
|| cmpcode
== EQ
)
8258 if (comparator
== const0_rtx
)
8260 /* TBZ/TBNZ/CBZ/CBNZ. */
8261 if (GET_CODE (inner
) == ZERO_EXTRACT
)
8263 *cost
+= rtx_cost (XEXP (inner
, 0), VOIDmode
,
8264 ZERO_EXTRACT
, 0, speed
);
8267 *cost
+= rtx_cost (inner
, VOIDmode
, cmpcode
, 0, speed
);
8272 else if (cmpcode
== LT
|| cmpcode
== GE
)
8275 if (comparator
== const0_rtx
)
8280 else if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
8283 if (GET_CODE (op1
) == COMPARE
)
8285 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
8286 if (XEXP (op1
, 1) == const0_rtx
)
8290 machine_mode mode
= GET_MODE (XEXP (op1
, 0));
8291 const struct cpu_cost_table
*extra_cost
8292 = aarch64_tune_params
.insn_extra_cost
;
8294 if (GET_MODE_CLASS (mode
) == MODE_INT
)
8295 *cost
+= extra_cost
->alu
.arith
;
8297 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
8302 /* It's a conditional operation based on the status flags,
8303 so it must be some flavor of CSEL. */
8305 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
8306 if (GET_CODE (op1
) == NEG
8307 || GET_CODE (op1
) == NOT
8308 || (GET_CODE (op1
) == PLUS
&& XEXP (op1
, 1) == const1_rtx
))
8309 op1
= XEXP (op1
, 0);
8310 else if (GET_CODE (op1
) == ZERO_EXTEND
&& GET_CODE (op2
) == ZERO_EXTEND
)
8312 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
8313 op1
= XEXP (op1
, 0);
8314 op2
= XEXP (op2
, 0);
8317 *cost
+= rtx_cost (op1
, VOIDmode
, IF_THEN_ELSE
, 1, speed
);
8318 *cost
+= rtx_cost (op2
, VOIDmode
, IF_THEN_ELSE
, 2, speed
);
8322 /* We don't know what this is, cost all operands. */
8326 /* Check whether X is a bitfield operation of the form shift + extend that
8327 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
8328 operand to which the bitfield operation is applied. Otherwise return
8332 aarch64_extend_bitfield_pattern_p (rtx x
)
8334 rtx_code outer_code
= GET_CODE (x
);
8335 machine_mode outer_mode
= GET_MODE (x
);
8337 if (outer_code
!= ZERO_EXTEND
&& outer_code
!= SIGN_EXTEND
8338 && outer_mode
!= SImode
&& outer_mode
!= DImode
)
8341 rtx inner
= XEXP (x
, 0);
8342 rtx_code inner_code
= GET_CODE (inner
);
8343 machine_mode inner_mode
= GET_MODE (inner
);
8349 if (CONST_INT_P (XEXP (inner
, 1))
8350 && (inner_mode
== QImode
|| inner_mode
== HImode
))
8351 op
= XEXP (inner
, 0);
8354 if (outer_code
== ZERO_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
8355 && (inner_mode
== QImode
|| inner_mode
== HImode
))
8356 op
= XEXP (inner
, 0);
8359 if (outer_code
== SIGN_EXTEND
&& CONST_INT_P (XEXP (inner
, 1))
8360 && (inner_mode
== QImode
|| inner_mode
== HImode
))
8361 op
= XEXP (inner
, 0);
8370 /* Return true if the mask and a shift amount from an RTX of the form
8371 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8372 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
8375 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode
, rtx mask
,
8378 return CONST_INT_P (mask
) && CONST_INT_P (shft_amnt
)
8379 && INTVAL (shft_amnt
) < GET_MODE_BITSIZE (mode
)
8380 && exact_log2 ((INTVAL (mask
) >> INTVAL (shft_amnt
)) + 1) >= 0
8381 && (INTVAL (mask
) & ((1 << INTVAL (shft_amnt
)) - 1)) == 0;
8384 /* Calculate the cost of calculating X, storing it in *COST. Result
8385 is true if the total cost of the operation has now been calculated. */
8387 aarch64_rtx_costs (rtx x
, machine_mode mode
, int outer ATTRIBUTE_UNUSED
,
8388 int param ATTRIBUTE_UNUSED
, int *cost
, bool speed
)
8391 const struct cpu_cost_table
*extra_cost
8392 = aarch64_tune_params
.insn_extra_cost
;
8393 int code
= GET_CODE (x
);
8394 scalar_int_mode int_mode
;
8396 /* By default, assume that everything has equivalent cost to the
8397 cheapest instruction. Any additional costs are applied as a delta
8398 above this default. */
8399 *cost
= COSTS_N_INSNS (1);
8404 /* The cost depends entirely on the operands to SET. */
8409 switch (GET_CODE (op0
))
8414 rtx address
= XEXP (op0
, 0);
8415 if (VECTOR_MODE_P (mode
))
8416 *cost
+= extra_cost
->ldst
.storev
;
8417 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
8418 *cost
+= extra_cost
->ldst
.store
;
8419 else if (mode
== SFmode
)
8420 *cost
+= extra_cost
->ldst
.storef
;
8421 else if (mode
== DFmode
)
8422 *cost
+= extra_cost
->ldst
.stored
;
8425 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
8429 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
8433 if (! REG_P (SUBREG_REG (op0
)))
8434 *cost
+= rtx_cost (SUBREG_REG (op0
), VOIDmode
, SET
, 0, speed
);
8438 /* The cost is one per vector-register copied. */
8439 if (VECTOR_MODE_P (GET_MODE (op0
)) && REG_P (op1
))
8441 int nregs
= aarch64_hard_regno_nregs (V0_REGNUM
, GET_MODE (op0
));
8442 *cost
= COSTS_N_INSNS (nregs
);
8444 /* const0_rtx is in general free, but we will use an
8445 instruction to set a register to 0. */
8446 else if (REG_P (op1
) || op1
== const0_rtx
)
8448 /* The cost is 1 per register copied. */
8449 int nregs
= aarch64_hard_regno_nregs (R0_REGNUM
, GET_MODE (op0
));
8450 *cost
= COSTS_N_INSNS (nregs
);
8453 /* Cost is just the cost of the RHS of the set. */
8454 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
8459 /* Bit-field insertion. Strip any redundant widening of
8460 the RHS to meet the width of the target. */
8461 if (GET_CODE (op1
) == SUBREG
)
8462 op1
= SUBREG_REG (op1
);
8463 if ((GET_CODE (op1
) == ZERO_EXTEND
8464 || GET_CODE (op1
) == SIGN_EXTEND
)
8465 && CONST_INT_P (XEXP (op0
, 1))
8466 && is_a
<scalar_int_mode
> (GET_MODE (XEXP (op1
, 0)), &int_mode
)
8467 && GET_MODE_BITSIZE (int_mode
) >= INTVAL (XEXP (op0
, 1)))
8468 op1
= XEXP (op1
, 0);
8470 if (CONST_INT_P (op1
))
8472 /* MOV immediate is assumed to always be cheap. */
8473 *cost
= COSTS_N_INSNS (1);
8479 *cost
+= extra_cost
->alu
.bfi
;
8480 *cost
+= rtx_cost (op1
, VOIDmode
, (enum rtx_code
) code
, 1, speed
);
8486 /* We can't make sense of this, assume default cost. */
8487 *cost
= COSTS_N_INSNS (1);
8493 /* If an instruction can incorporate a constant within the
8494 instruction, the instruction's expression avoids calling
8495 rtx_cost() on the constant. If rtx_cost() is called on a
8496 constant, then it is usually because the constant must be
8497 moved into a register by one or more instructions.
8499 The exception is constant 0, which can be expressed
8500 as XZR/WZR and is therefore free. The exception to this is
8501 if we have (set (reg) (const0_rtx)) in which case we must cost
8502 the move. However, we can catch that when we cost the SET, so
8503 we don't need to consider that here. */
8504 if (x
== const0_rtx
)
8508 /* To an approximation, building any other constant is
8509 proportionally expensive to the number of instructions
8510 required to build that constant. This is true whether we
8511 are compiling for SPEED or otherwise. */
8512 if (!is_a
<scalar_int_mode
> (mode
, &int_mode
))
8513 int_mode
= word_mode
;
8514 *cost
= COSTS_N_INSNS (aarch64_internal_mov_immediate
8515 (NULL_RTX
, x
, false, int_mode
));
8521 /* First determine number of instructions to do the move
8522 as an integer constant. */
8523 if (!aarch64_float_const_representable_p (x
)
8524 && !aarch64_can_const_movi_rtx_p (x
, mode
)
8525 && aarch64_float_const_rtx_p (x
))
8527 unsigned HOST_WIDE_INT ival
;
8528 bool succeed
= aarch64_reinterpret_float_as_int (x
, &ival
);
8529 gcc_assert (succeed
);
8531 scalar_int_mode imode
= (mode
== HFmode
8533 : int_mode_for_mode (mode
).require ());
8534 int ncost
= aarch64_internal_mov_immediate
8535 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
8536 *cost
+= COSTS_N_INSNS (ncost
);
8542 /* mov[df,sf]_aarch64. */
8543 if (aarch64_float_const_representable_p (x
))
8544 /* FMOV (scalar immediate). */
8545 *cost
+= extra_cost
->fp
[mode
== DFmode
].fpconst
;
8546 else if (!aarch64_float_const_zero_rtx_p (x
))
8548 /* This will be a load from memory. */
8550 *cost
+= extra_cost
->ldst
.loadd
;
8552 *cost
+= extra_cost
->ldst
.loadf
;
8555 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8556 or MOV v0.s[0], wzr - neither of which are modeled by the
8557 cost tables. Just use the default cost. */
8567 /* For loads we want the base cost of a load, plus an
8568 approximation for the additional cost of the addressing
8570 rtx address
= XEXP (x
, 0);
8571 if (VECTOR_MODE_P (mode
))
8572 *cost
+= extra_cost
->ldst
.loadv
;
8573 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
8574 *cost
+= extra_cost
->ldst
.load
;
8575 else if (mode
== SFmode
)
8576 *cost
+= extra_cost
->ldst
.loadf
;
8577 else if (mode
== DFmode
)
8578 *cost
+= extra_cost
->ldst
.loadd
;
8581 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
8590 if (VECTOR_MODE_P (mode
))
8595 *cost
+= extra_cost
->vect
.alu
;
8600 if (GET_MODE_CLASS (mode
) == MODE_INT
)
8602 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
8603 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
8606 *cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, NEG
, 0, speed
);
8610 /* Cost this as SUB wzr, X. */
8611 op0
= CONST0_RTX (mode
);
8616 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
8618 /* Support (neg(fma...)) as a single instruction only if
8619 sign of zeros is unimportant. This matches the decision
8620 making in aarch64.md. */
8621 if (GET_CODE (op0
) == FMA
&& !HONOR_SIGNED_ZEROS (GET_MODE (op0
)))
8624 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
8627 if (GET_CODE (op0
) == MULT
)
8630 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
8635 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
8645 if (VECTOR_MODE_P (mode
))
8646 *cost
+= extra_cost
->vect
.alu
;
8648 *cost
+= extra_cost
->alu
.clz
;
8657 if (op1
== const0_rtx
8658 && GET_CODE (op0
) == AND
)
8661 mode
= GET_MODE (op0
);
8665 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
)
8667 /* TODO: A write to the CC flags possibly costs extra, this
8668 needs encoding in the cost tables. */
8670 mode
= GET_MODE (op0
);
8672 if (GET_CODE (op0
) == AND
)
8678 if (GET_CODE (op0
) == PLUS
)
8680 /* ADDS (and CMN alias). */
8685 if (GET_CODE (op0
) == MINUS
)
8692 if (GET_CODE (op0
) == ZERO_EXTRACT
&& op1
== const0_rtx
8693 && GET_MODE (x
) == CC_NZmode
&& CONST_INT_P (XEXP (op0
, 1))
8694 && CONST_INT_P (XEXP (op0
, 2)))
8696 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8697 Handle it here directly rather than going to cost_logic
8698 since we know the immediate generated for the TST is valid
8699 so we can avoid creating an intermediate rtx for it only
8700 for costing purposes. */
8702 *cost
+= extra_cost
->alu
.logical
;
8704 *cost
+= rtx_cost (XEXP (op0
, 0), GET_MODE (op0
),
8705 ZERO_EXTRACT
, 0, speed
);
8709 if (GET_CODE (op1
) == NEG
)
8713 *cost
+= extra_cost
->alu
.arith
;
8715 *cost
+= rtx_cost (op0
, mode
, COMPARE
, 0, speed
);
8716 *cost
+= rtx_cost (XEXP (op1
, 0), mode
, NEG
, 1, speed
);
8722 Compare can freely swap the order of operands, and
8723 canonicalization puts the more complex operation first.
8724 But the integer MINUS logic expects the shift/extend
8725 operation in op1. */
8727 || (GET_CODE (op0
) == SUBREG
&& REG_P (SUBREG_REG (op0
)))))
8735 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_FLOAT
)
8739 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
8741 if (CONST_DOUBLE_P (op1
) && aarch64_float_const_zero_rtx_p (op1
))
8743 *cost
+= rtx_cost (op0
, VOIDmode
, COMPARE
, 0, speed
);
8744 /* FCMP supports constant 0.0 for no extra cost. */
8750 if (VECTOR_MODE_P (mode
))
8752 /* Vector compare. */
8754 *cost
+= extra_cost
->vect
.alu
;
8756 if (aarch64_float_const_zero_rtx_p (op1
))
8758 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8772 *cost
+= rtx_cost (op0
, mode
, MINUS
, 0, speed
);
8774 /* Detect valid immediates. */
8775 if ((GET_MODE_CLASS (mode
) == MODE_INT
8776 || (GET_MODE_CLASS (mode
) == MODE_CC
8777 && GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
))
8778 && CONST_INT_P (op1
)
8779 && aarch64_uimm12_shift (INTVAL (op1
)))
8782 /* SUB(S) (immediate). */
8783 *cost
+= extra_cost
->alu
.arith
;
8787 /* Look for SUB (extended register). */
8788 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
8789 && aarch64_rtx_arith_op_extract_p (op1
, int_mode
))
8792 *cost
+= extra_cost
->alu
.extend_arith
;
8794 op1
= aarch64_strip_extend (op1
, true);
8795 *cost
+= rtx_cost (op1
, VOIDmode
,
8796 (enum rtx_code
) GET_CODE (op1
), 0, speed
);
8800 rtx new_op1
= aarch64_strip_extend (op1
, false);
8802 /* Cost this as an FMA-alike operation. */
8803 if ((GET_CODE (new_op1
) == MULT
8804 || aarch64_shift_p (GET_CODE (new_op1
)))
8807 *cost
+= aarch64_rtx_mult_cost (new_op1
, MULT
,
8808 (enum rtx_code
) code
,
8813 *cost
+= rtx_cost (new_op1
, VOIDmode
, MINUS
, 1, speed
);
8817 if (VECTOR_MODE_P (mode
))
8820 *cost
+= extra_cost
->vect
.alu
;
8822 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
8825 *cost
+= extra_cost
->alu
.arith
;
8827 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
8830 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
8844 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
8845 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
8848 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, PLUS
, 0, speed
);
8849 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
8853 if (GET_MODE_CLASS (mode
) == MODE_INT
8854 && ((CONST_INT_P (op1
) && aarch64_uimm12_shift (INTVAL (op1
)))
8855 || aarch64_sve_addvl_addpl_immediate (op1
, mode
)))
8857 *cost
+= rtx_cost (op0
, mode
, PLUS
, 0, speed
);
8860 /* ADD (immediate). */
8861 *cost
+= extra_cost
->alu
.arith
;
8865 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
8867 /* Look for ADD (extended register). */
8868 if (is_a
<scalar_int_mode
> (mode
, &int_mode
)
8869 && aarch64_rtx_arith_op_extract_p (op0
, int_mode
))
8872 *cost
+= extra_cost
->alu
.extend_arith
;
8874 op0
= aarch64_strip_extend (op0
, true);
8875 *cost
+= rtx_cost (op0
, VOIDmode
,
8876 (enum rtx_code
) GET_CODE (op0
), 0, speed
);
8880 /* Strip any extend, leave shifts behind as we will
8881 cost them through mult_cost. */
8882 new_op0
= aarch64_strip_extend (op0
, false);
8884 if (GET_CODE (new_op0
) == MULT
8885 || aarch64_shift_p (GET_CODE (new_op0
)))
8887 *cost
+= aarch64_rtx_mult_cost (new_op0
, MULT
, PLUS
,
8892 *cost
+= rtx_cost (new_op0
, VOIDmode
, PLUS
, 0, speed
);
8896 if (VECTOR_MODE_P (mode
))
8899 *cost
+= extra_cost
->vect
.alu
;
8901 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
8904 *cost
+= extra_cost
->alu
.arith
;
8906 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
8909 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
8916 *cost
= COSTS_N_INSNS (1);
8920 if (VECTOR_MODE_P (mode
))
8921 *cost
+= extra_cost
->vect
.alu
;
8923 *cost
+= extra_cost
->alu
.rev
;
8928 if (aarch_rev16_p (x
))
8930 *cost
= COSTS_N_INSNS (1);
8934 if (VECTOR_MODE_P (mode
))
8935 *cost
+= extra_cost
->vect
.alu
;
8937 *cost
+= extra_cost
->alu
.rev
;
8942 if (aarch64_extr_rtx_p (x
, &op0
, &op1
))
8944 *cost
+= rtx_cost (op0
, mode
, IOR
, 0, speed
);
8945 *cost
+= rtx_cost (op1
, mode
, IOR
, 1, speed
);
8947 *cost
+= extra_cost
->alu
.shift
;
8958 if (VECTOR_MODE_P (mode
))
8961 *cost
+= extra_cost
->vect
.alu
;
8966 && GET_CODE (op0
) == MULT
8967 && CONST_INT_P (XEXP (op0
, 1))
8968 && CONST_INT_P (op1
)
8969 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0
, 1))),
8972 /* This is a UBFM/SBFM. */
8973 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, ZERO_EXTRACT
, 0, speed
);
8975 *cost
+= extra_cost
->alu
.bfx
;
8979 if (is_int_mode (mode
, &int_mode
))
8981 if (CONST_INT_P (op1
))
8983 /* We have a mask + shift version of a UBFIZ
8984 i.e. the *andim_ashift<mode>_bfiz pattern. */
8985 if (GET_CODE (op0
) == ASHIFT
8986 && aarch64_mask_and_shift_for_ubfiz_p (int_mode
, op1
,
8989 *cost
+= rtx_cost (XEXP (op0
, 0), int_mode
,
8990 (enum rtx_code
) code
, 0, speed
);
8992 *cost
+= extra_cost
->alu
.bfx
;
8996 else if (aarch64_bitmask_imm (INTVAL (op1
), int_mode
))
8998 /* We possibly get the immediate for free, this is not
9000 *cost
+= rtx_cost (op0
, int_mode
,
9001 (enum rtx_code
) code
, 0, speed
);
9003 *cost
+= extra_cost
->alu
.logical
;
9012 /* Handle ORN, EON, or BIC. */
9013 if (GET_CODE (op0
) == NOT
)
9014 op0
= XEXP (op0
, 0);
9016 new_op0
= aarch64_strip_shift (op0
);
9018 /* If we had a shift on op0 then this is a logical-shift-
9019 by-register/immediate operation. Otherwise, this is just
9020 a logical operation. */
9025 /* Shift by immediate. */
9026 if (CONST_INT_P (XEXP (op0
, 1)))
9027 *cost
+= extra_cost
->alu
.log_shift
;
9029 *cost
+= extra_cost
->alu
.log_shift_reg
;
9032 *cost
+= extra_cost
->alu
.logical
;
9035 /* In both cases we want to cost both operands. */
9036 *cost
+= rtx_cost (new_op0
, int_mode
, (enum rtx_code
) code
,
9038 *cost
+= rtx_cost (op1
, int_mode
, (enum rtx_code
) code
,
9048 op0
= aarch64_strip_shift (x
);
9050 if (VECTOR_MODE_P (mode
))
9053 *cost
+= extra_cost
->vect
.alu
;
9057 /* MVN-shifted-reg. */
9060 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
9063 *cost
+= extra_cost
->alu
.log_shift
;
9067 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9068 Handle the second form here taking care that 'a' in the above can
9070 else if (GET_CODE (op0
) == XOR
)
9072 rtx newop0
= XEXP (op0
, 0);
9073 rtx newop1
= XEXP (op0
, 1);
9074 rtx op0_stripped
= aarch64_strip_shift (newop0
);
9076 *cost
+= rtx_cost (newop1
, mode
, (enum rtx_code
) code
, 1, speed
);
9077 *cost
+= rtx_cost (op0_stripped
, mode
, XOR
, 0, speed
);
9081 if (op0_stripped
!= newop0
)
9082 *cost
+= extra_cost
->alu
.log_shift
;
9084 *cost
+= extra_cost
->alu
.logical
;
9091 *cost
+= extra_cost
->alu
.logical
;
9098 /* If a value is written in SI mode, then zero extended to DI
9099 mode, the operation will in general be free as a write to
9100 a 'w' register implicitly zeroes the upper bits of an 'x'
9101 register. However, if this is
9103 (set (reg) (zero_extend (reg)))
9105 we must cost the explicit register move. */
9107 && GET_MODE (op0
) == SImode
9110 int op_cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, 0, speed
);
9112 /* If OP_COST is non-zero, then the cost of the zero extend
9113 is effectively the cost of the inner operation. Otherwise
9114 we have a MOV instruction and we take the cost from the MOV
9115 itself. This is true independently of whether we are
9116 optimizing for space or time. */
9122 else if (MEM_P (op0
))
9124 /* All loads can zero extend to any size for free. */
9125 *cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, param
, speed
);
9129 op0
= aarch64_extend_bitfield_pattern_p (x
);
9132 *cost
+= rtx_cost (op0
, mode
, ZERO_EXTEND
, 0, speed
);
9134 *cost
+= extra_cost
->alu
.bfx
;
9140 if (VECTOR_MODE_P (mode
))
9143 *cost
+= extra_cost
->vect
.alu
;
9147 /* We generate an AND instead of UXTB/UXTH. */
9148 *cost
+= extra_cost
->alu
.logical
;
9154 if (MEM_P (XEXP (x
, 0)))
9159 rtx address
= XEXP (XEXP (x
, 0), 0);
9160 *cost
+= extra_cost
->ldst
.load_sign_extend
;
9163 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
9169 op0
= aarch64_extend_bitfield_pattern_p (x
);
9172 *cost
+= rtx_cost (op0
, mode
, SIGN_EXTEND
, 0, speed
);
9174 *cost
+= extra_cost
->alu
.bfx
;
9180 if (VECTOR_MODE_P (mode
))
9181 *cost
+= extra_cost
->vect
.alu
;
9183 *cost
+= extra_cost
->alu
.extend
;
9191 if (CONST_INT_P (op1
))
9195 if (VECTOR_MODE_P (mode
))
9197 /* Vector shift (immediate). */
9198 *cost
+= extra_cost
->vect
.alu
;
9202 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
9204 *cost
+= extra_cost
->alu
.shift
;
9208 /* We can incorporate zero/sign extend for free. */
9209 if (GET_CODE (op0
) == ZERO_EXTEND
9210 || GET_CODE (op0
) == SIGN_EXTEND
)
9211 op0
= XEXP (op0
, 0);
9213 *cost
+= rtx_cost (op0
, VOIDmode
, ASHIFT
, 0, speed
);
9218 if (VECTOR_MODE_P (mode
))
9221 /* Vector shift (register). */
9222 *cost
+= extra_cost
->vect
.alu
;
9228 *cost
+= extra_cost
->alu
.shift_reg
;
9230 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
9231 && CONST_INT_P (XEXP (op1
, 1))
9232 && known_eq (INTVAL (XEXP (op1
, 1)),
9233 GET_MODE_BITSIZE (mode
) - 1))
9235 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
9236 /* We already demanded XEXP (op1, 0) to be REG_P, so
9237 don't recurse into it. */
9241 return false; /* All arguments need to be in registers. */
9251 if (CONST_INT_P (op1
))
9253 /* ASR (immediate) and friends. */
9256 if (VECTOR_MODE_P (mode
))
9257 *cost
+= extra_cost
->vect
.alu
;
9259 *cost
+= extra_cost
->alu
.shift
;
9262 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
9267 if (VECTOR_MODE_P (mode
))
9270 /* Vector shift (register). */
9271 *cost
+= extra_cost
->vect
.alu
;
9276 /* ASR (register) and friends. */
9277 *cost
+= extra_cost
->alu
.shift_reg
;
9279 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
9280 && CONST_INT_P (XEXP (op1
, 1))
9281 && known_eq (INTVAL (XEXP (op1
, 1)),
9282 GET_MODE_BITSIZE (mode
) - 1))
9284 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
9285 /* We already demanded XEXP (op1, 0) to be REG_P, so
9286 don't recurse into it. */
9290 return false; /* All arguments need to be in registers. */
9295 if (aarch64_cmodel
== AARCH64_CMODEL_LARGE
9296 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
)
9300 *cost
+= extra_cost
->ldst
.load
;
9302 else if (aarch64_cmodel
== AARCH64_CMODEL_SMALL
9303 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_PIC
)
9305 /* ADRP, followed by ADD. */
9306 *cost
+= COSTS_N_INSNS (1);
9308 *cost
+= 2 * extra_cost
->alu
.arith
;
9310 else if (aarch64_cmodel
== AARCH64_CMODEL_TINY
9311 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
9315 *cost
+= extra_cost
->alu
.arith
;
9320 /* One extra load instruction, after accessing the GOT. */
9321 *cost
+= COSTS_N_INSNS (1);
9323 *cost
+= extra_cost
->ldst
.load
;
9329 /* ADRP/ADD (immediate). */
9331 *cost
+= extra_cost
->alu
.arith
;
9339 if (VECTOR_MODE_P (mode
))
9340 *cost
+= extra_cost
->vect
.alu
;
9342 *cost
+= extra_cost
->alu
.bfx
;
9345 /* We can trust that the immediates used will be correct (there
9346 are no by-register forms), so we need only cost op0. */
9347 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (enum rtx_code
) code
, 0, speed
);
9351 *cost
+= aarch64_rtx_mult_cost (x
, MULT
, 0, speed
);
9352 /* aarch64_rtx_mult_cost always handles recursion to its
9357 /* We can expand signed mod by power of 2 using a NEGS, two parallel
9358 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
9359 an unconditional negate. This case should only ever be reached through
9360 the set_smod_pow2_cheap check in expmed.c. */
9361 if (CONST_INT_P (XEXP (x
, 1))
9362 && exact_log2 (INTVAL (XEXP (x
, 1))) > 0
9363 && (mode
== SImode
|| mode
== DImode
))
9365 /* We expand to 4 instructions. Reset the baseline. */
9366 *cost
= COSTS_N_INSNS (4);
9369 *cost
+= 2 * extra_cost
->alu
.logical
9370 + 2 * extra_cost
->alu
.arith
;
9379 /* Slighly prefer UMOD over SMOD. */
9380 if (VECTOR_MODE_P (mode
))
9381 *cost
+= extra_cost
->vect
.alu
;
9382 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
9383 *cost
+= (extra_cost
->mult
[mode
== DImode
].add
9384 + extra_cost
->mult
[mode
== DImode
].idiv
9385 + (code
== MOD
? 1 : 0));
9387 return false; /* All arguments need to be in registers. */
9394 if (VECTOR_MODE_P (mode
))
9395 *cost
+= extra_cost
->vect
.alu
;
9396 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
9397 /* There is no integer SQRT, so only DIV and UDIV can get
9399 *cost
+= (extra_cost
->mult
[mode
== DImode
].idiv
9400 /* Slighly prefer UDIV over SDIV. */
9401 + (code
== DIV
? 1 : 0));
9403 *cost
+= extra_cost
->fp
[mode
== DFmode
].div
;
9405 return false; /* All arguments need to be in registers. */
9408 return aarch64_if_then_else_costs (XEXP (x
, 0), XEXP (x
, 1),
9409 XEXP (x
, 2), cost
, speed
);
9422 return false; /* All arguments must be in registers. */
9431 if (VECTOR_MODE_P (mode
))
9432 *cost
+= extra_cost
->vect
.alu
;
9434 *cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
9437 /* FMSUB, FNMADD, and FNMSUB are free. */
9438 if (GET_CODE (op0
) == NEG
)
9439 op0
= XEXP (op0
, 0);
9441 if (GET_CODE (op2
) == NEG
)
9442 op2
= XEXP (op2
, 0);
9444 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9445 and the by-element operand as operand 0. */
9446 if (GET_CODE (op1
) == NEG
)
9447 op1
= XEXP (op1
, 0);
9449 /* Catch vector-by-element operations. The by-element operand can
9450 either be (vec_duplicate (vec_select (x))) or just
9451 (vec_select (x)), depending on whether we are multiplying by
9452 a vector or a scalar.
9454 Canonicalization is not very good in these cases, FMA4 will put the
9455 by-element operand as operand 0, FNMA4 will have it as operand 1. */
9456 if (GET_CODE (op0
) == VEC_DUPLICATE
)
9457 op0
= XEXP (op0
, 0);
9458 else if (GET_CODE (op1
) == VEC_DUPLICATE
)
9459 op1
= XEXP (op1
, 0);
9461 if (GET_CODE (op0
) == VEC_SELECT
)
9462 op0
= XEXP (op0
, 0);
9463 else if (GET_CODE (op1
) == VEC_SELECT
)
9464 op1
= XEXP (op1
, 0);
9466 /* If the remaining parameters are not registers,
9467 get the cost to put them into registers. */
9468 *cost
+= rtx_cost (op0
, mode
, FMA
, 0, speed
);
9469 *cost
+= rtx_cost (op1
, mode
, FMA
, 1, speed
);
9470 *cost
+= rtx_cost (op2
, mode
, FMA
, 2, speed
);
9474 case UNSIGNED_FLOAT
:
9476 *cost
+= extra_cost
->fp
[mode
== DFmode
].fromint
;
9482 if (VECTOR_MODE_P (mode
))
9484 /*Vector truncate. */
9485 *cost
+= extra_cost
->vect
.alu
;
9488 *cost
+= extra_cost
->fp
[mode
== DFmode
].widen
;
9492 case FLOAT_TRUNCATE
:
9495 if (VECTOR_MODE_P (mode
))
9497 /*Vector conversion. */
9498 *cost
+= extra_cost
->vect
.alu
;
9501 *cost
+= extra_cost
->fp
[mode
== DFmode
].narrow
;
9508 /* Strip the rounding part. They will all be implemented
9509 by the fcvt* family of instructions anyway. */
9510 if (GET_CODE (x
) == UNSPEC
)
9512 unsigned int uns_code
= XINT (x
, 1);
9514 if (uns_code
== UNSPEC_FRINTA
9515 || uns_code
== UNSPEC_FRINTM
9516 || uns_code
== UNSPEC_FRINTN
9517 || uns_code
== UNSPEC_FRINTP
9518 || uns_code
== UNSPEC_FRINTZ
)
9519 x
= XVECEXP (x
, 0, 0);
9524 if (VECTOR_MODE_P (mode
))
9525 *cost
+= extra_cost
->vect
.alu
;
9527 *cost
+= extra_cost
->fp
[GET_MODE (x
) == DFmode
].toint
;
9530 /* We can combine fmul by a power of 2 followed by a fcvt into a single
9531 fixed-point fcvt. */
9532 if (GET_CODE (x
) == MULT
9533 && ((VECTOR_MODE_P (mode
)
9534 && aarch64_vec_fpconst_pow_of_2 (XEXP (x
, 1)) > 0)
9535 || aarch64_fpconst_pow_of_2 (XEXP (x
, 1)) > 0))
9537 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (rtx_code
) code
,
9542 *cost
+= rtx_cost (x
, VOIDmode
, (enum rtx_code
) code
, 0, speed
);
9546 if (VECTOR_MODE_P (mode
))
9550 *cost
+= extra_cost
->vect
.alu
;
9552 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
9556 /* FABD, which is analogous to FADD. */
9557 if (GET_CODE (op0
) == MINUS
)
9559 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, MINUS
, 0, speed
);
9560 *cost
+= rtx_cost (XEXP (op0
, 1), mode
, MINUS
, 1, speed
);
9562 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
9566 /* Simple FABS is analogous to FNEG. */
9568 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
9572 /* Integer ABS will either be split to
9573 two arithmetic instructions, or will be an ABS
9574 (scalar), which we don't model. */
9575 *cost
= COSTS_N_INSNS (2);
9577 *cost
+= 2 * extra_cost
->alu
.arith
;
9585 if (VECTOR_MODE_P (mode
))
9586 *cost
+= extra_cost
->vect
.alu
;
9589 /* FMAXNM/FMINNM/FMAX/FMIN.
9590 TODO: This may not be accurate for all implementations, but
9591 we do not model this in the cost tables. */
9592 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
9598 /* The floating point round to integer frint* instructions. */
9599 if (aarch64_frint_unspec_p (XINT (x
, 1)))
9602 *cost
+= extra_cost
->fp
[mode
== DFmode
].roundint
;
9607 if (XINT (x
, 1) == UNSPEC_RBIT
)
9610 *cost
+= extra_cost
->alu
.rev
;
9618 /* Decompose <su>muldi3_highpart. */
9619 if (/* (truncate:DI */
9622 && GET_MODE (XEXP (x
, 0)) == TImode
9623 && GET_CODE (XEXP (x
, 0)) == LSHIFTRT
9625 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == MULT
9626 /* (ANY_EXTEND:TI (reg:DI))
9627 (ANY_EXTEND:TI (reg:DI))) */
9628 && ((GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == ZERO_EXTEND
9629 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == ZERO_EXTEND
)
9630 || (GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == SIGN_EXTEND
9631 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == SIGN_EXTEND
))
9632 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0)) == DImode
9633 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0)) == DImode
9634 /* (const_int 64) */
9635 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
9636 && UINTVAL (XEXP (XEXP (x
, 0), 1)) == 64)
9640 *cost
+= extra_cost
->mult
[mode
== DImode
].extend
;
9641 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0),
9642 mode
, MULT
, 0, speed
);
9643 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0),
9644 mode
, MULT
, 1, speed
);
9654 && flag_aarch64_verbose_cost
)
9656 "\nFailed to cost RTX. Assuming default cost.\n");
9661 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
9662 calculated for X. This cost is stored in *COST. Returns true
9663 if the total cost of X was calculated. */
9665 aarch64_rtx_costs_wrapper (rtx x
, machine_mode mode
, int outer
,
9666 int param
, int *cost
, bool speed
)
9668 bool result
= aarch64_rtx_costs (x
, mode
, outer
, param
, cost
, speed
);
9671 && flag_aarch64_verbose_cost
)
9673 print_rtl_single (dump_file
, x
);
9674 fprintf (dump_file
, "\n%s cost: %d (%s)\n",
9675 speed
? "Hot" : "Cold",
9676 *cost
, result
? "final" : "partial");
9683 aarch64_register_move_cost (machine_mode mode
,
9684 reg_class_t from_i
, reg_class_t to_i
)
9686 enum reg_class from
= (enum reg_class
) from_i
;
9687 enum reg_class to
= (enum reg_class
) to_i
;
9688 const struct cpu_regmove_cost
*regmove_cost
9689 = aarch64_tune_params
.regmove_cost
;
9691 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
9692 if (to
== TAILCALL_ADDR_REGS
|| to
== POINTER_REGS
)
9695 if (from
== TAILCALL_ADDR_REGS
|| from
== POINTER_REGS
)
9696 from
= GENERAL_REGS
;
9698 /* Moving between GPR and stack cost is the same as GP2GP. */
9699 if ((from
== GENERAL_REGS
&& to
== STACK_REG
)
9700 || (to
== GENERAL_REGS
&& from
== STACK_REG
))
9701 return regmove_cost
->GP2GP
;
9703 /* To/From the stack register, we move via the gprs. */
9704 if (to
== STACK_REG
|| from
== STACK_REG
)
9705 return aarch64_register_move_cost (mode
, from
, GENERAL_REGS
)
9706 + aarch64_register_move_cost (mode
, GENERAL_REGS
, to
);
9708 if (known_eq (GET_MODE_SIZE (mode
), 16))
9710 /* 128-bit operations on general registers require 2 instructions. */
9711 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
9712 return regmove_cost
->GP2GP
* 2;
9713 else if (from
== GENERAL_REGS
)
9714 return regmove_cost
->GP2FP
* 2;
9715 else if (to
== GENERAL_REGS
)
9716 return regmove_cost
->FP2GP
* 2;
9718 /* When AdvSIMD instructions are disabled it is not possible to move
9719 a 128-bit value directly between Q registers. This is handled in
9720 secondary reload. A general register is used as a scratch to move
9721 the upper DI value and the lower DI value is moved directly,
9722 hence the cost is the sum of three moves. */
9724 return regmove_cost
->GP2FP
+ regmove_cost
->FP2GP
+ regmove_cost
->FP2FP
;
9726 return regmove_cost
->FP2FP
;
9729 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
9730 return regmove_cost
->GP2GP
;
9731 else if (from
== GENERAL_REGS
)
9732 return regmove_cost
->GP2FP
;
9733 else if (to
== GENERAL_REGS
)
9734 return regmove_cost
->FP2GP
;
9736 return regmove_cost
->FP2FP
;
9740 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED
,
9741 reg_class_t rclass ATTRIBUTE_UNUSED
,
9742 bool in ATTRIBUTE_UNUSED
)
9744 return aarch64_tune_params
.memmov_cost
;
9747 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9748 to optimize 1.0/sqrt. */
9751 use_rsqrt_p (machine_mode mode
)
9753 return (!flag_trapping_math
9754 && flag_unsafe_math_optimizations
9755 && ((aarch64_tune_params
.approx_modes
->recip_sqrt
9756 & AARCH64_APPROX_MODE (mode
))
9757 || flag_mrecip_low_precision_sqrt
));
9760 /* Function to decide when to use the approximate reciprocal square root
9764 aarch64_builtin_reciprocal (tree fndecl
)
9766 machine_mode mode
= TYPE_MODE (TREE_TYPE (fndecl
));
9768 if (!use_rsqrt_p (mode
))
9770 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl
));
9773 /* Emit instruction sequence to compute either the approximate square root
9774 or its approximate reciprocal, depending on the flag RECP, and return
9775 whether the sequence was emitted or not. */
9778 aarch64_emit_approx_sqrt (rtx dst
, rtx src
, bool recp
)
9780 machine_mode mode
= GET_MODE (dst
);
9782 if (GET_MODE_INNER (mode
) == HFmode
)
9790 if (!(flag_mlow_precision_sqrt
9791 || (aarch64_tune_params
.approx_modes
->sqrt
9792 & AARCH64_APPROX_MODE (mode
))))
9795 if (flag_finite_math_only
9796 || flag_trapping_math
9797 || !flag_unsafe_math_optimizations
9798 || optimize_function_for_size_p (cfun
))
9802 /* Caller assumes we cannot fail. */
9803 gcc_assert (use_rsqrt_p (mode
));
9805 machine_mode mmsk
= mode_for_int_vector (mode
).require ();
9806 rtx xmsk
= gen_reg_rtx (mmsk
);
9808 /* When calculating the approximate square root, compare the
9809 argument with 0.0 and create a mask. */
9810 emit_insn (gen_rtx_SET (xmsk
,
9812 gen_rtx_EQ (mmsk
, src
,
9813 CONST0_RTX (mode
)))));
9815 /* Estimate the approximate reciprocal square root. */
9816 rtx xdst
= gen_reg_rtx (mode
);
9817 emit_insn (gen_aarch64_rsqrte (mode
, xdst
, src
));
9819 /* Iterate over the series twice for SF and thrice for DF. */
9820 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
9822 /* Optionally iterate over the series once less for faster performance
9823 while sacrificing the accuracy. */
9824 if ((recp
&& flag_mrecip_low_precision_sqrt
)
9825 || (!recp
&& flag_mlow_precision_sqrt
))
9828 /* Iterate over the series to calculate the approximate reciprocal square
9830 rtx x1
= gen_reg_rtx (mode
);
9831 while (iterations
--)
9833 rtx x2
= gen_reg_rtx (mode
);
9834 emit_set_insn (x2
, gen_rtx_MULT (mode
, xdst
, xdst
));
9836 emit_insn (gen_aarch64_rsqrts (mode
, x1
, src
, x2
));
9839 emit_set_insn (xdst
, gen_rtx_MULT (mode
, xdst
, x1
));
9844 /* Qualify the approximate reciprocal square root when the argument is
9845 0.0 by squashing the intermediary result to 0.0. */
9846 rtx xtmp
= gen_reg_rtx (mmsk
);
9847 emit_set_insn (xtmp
, gen_rtx_AND (mmsk
, gen_rtx_NOT (mmsk
, xmsk
),
9848 gen_rtx_SUBREG (mmsk
, xdst
, 0)));
9849 emit_move_insn (xdst
, gen_rtx_SUBREG (mode
, xtmp
, 0));
9851 /* Calculate the approximate square root. */
9852 emit_set_insn (xdst
, gen_rtx_MULT (mode
, xdst
, src
));
9855 /* Finalize the approximation. */
9856 emit_set_insn (dst
, gen_rtx_MULT (mode
, xdst
, x1
));
9861 /* Emit the instruction sequence to compute the approximation for the division
9862 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
9865 aarch64_emit_approx_div (rtx quo
, rtx num
, rtx den
)
9867 machine_mode mode
= GET_MODE (quo
);
9869 if (GET_MODE_INNER (mode
) == HFmode
)
9872 bool use_approx_division_p
= (flag_mlow_precision_div
9873 || (aarch64_tune_params
.approx_modes
->division
9874 & AARCH64_APPROX_MODE (mode
)));
9876 if (!flag_finite_math_only
9877 || flag_trapping_math
9878 || !flag_unsafe_math_optimizations
9879 || optimize_function_for_size_p (cfun
)
9880 || !use_approx_division_p
)
9883 if (!TARGET_SIMD
&& VECTOR_MODE_P (mode
))
9886 /* Estimate the approximate reciprocal. */
9887 rtx xrcp
= gen_reg_rtx (mode
);
9888 emit_insn (gen_aarch64_frecpe (mode
, xrcp
, den
));
9890 /* Iterate over the series twice for SF and thrice for DF. */
9891 int iterations
= (GET_MODE_INNER (mode
) == DFmode
) ? 3 : 2;
9893 /* Optionally iterate over the series once less for faster performance,
9894 while sacrificing the accuracy. */
9895 if (flag_mlow_precision_div
)
9898 /* Iterate over the series to calculate the approximate reciprocal. */
9899 rtx xtmp
= gen_reg_rtx (mode
);
9900 while (iterations
--)
9902 emit_insn (gen_aarch64_frecps (mode
, xtmp
, xrcp
, den
));
9905 emit_set_insn (xrcp
, gen_rtx_MULT (mode
, xrcp
, xtmp
));
9908 if (num
!= CONST1_RTX (mode
))
9910 /* As the approximate reciprocal of DEN is already calculated, only
9911 calculate the approximate division when NUM is not 1.0. */
9912 rtx xnum
= force_reg (mode
, num
);
9913 emit_set_insn (xrcp
, gen_rtx_MULT (mode
, xrcp
, xnum
));
9916 /* Finalize the approximation. */
9917 emit_set_insn (quo
, gen_rtx_MULT (mode
, xrcp
, xtmp
));
9921 /* Return the number of instructions that can be issued per cycle. */
9923 aarch64_sched_issue_rate (void)
9925 return aarch64_tune_params
.issue_rate
;
9929 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
9931 int issue_rate
= aarch64_sched_issue_rate ();
9933 return issue_rate
> 1 && !sched_fusion
? issue_rate
: 0;
9937 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
9938 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
9939 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
9942 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn
*insn
,
9945 return autopref_multipass_dfa_lookahead_guard (insn
, ready_index
);
9949 /* Vectorizer cost model target hooks. */
9951 /* Implement targetm.vectorize.builtin_vectorization_cost. */
9953 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost
,
9955 int misalign ATTRIBUTE_UNUSED
)
9958 const cpu_vector_cost
*costs
= aarch64_tune_params
.vec_costs
;
9961 if (vectype
!= NULL
)
9962 fp
= FLOAT_TYPE_P (vectype
);
9964 switch (type_of_cost
)
9967 return fp
? costs
->scalar_fp_stmt_cost
: costs
->scalar_int_stmt_cost
;
9970 return costs
->scalar_load_cost
;
9973 return costs
->scalar_store_cost
;
9976 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
9979 return costs
->vec_align_load_cost
;
9982 return costs
->vec_store_cost
;
9985 return costs
->vec_to_scalar_cost
;
9988 return costs
->scalar_to_vec_cost
;
9990 case unaligned_load
:
9991 case vector_gather_load
:
9992 return costs
->vec_unalign_load_cost
;
9994 case unaligned_store
:
9995 case vector_scatter_store
:
9996 return costs
->vec_unalign_store_cost
;
9998 case cond_branch_taken
:
9999 return costs
->cond_taken_branch_cost
;
10001 case cond_branch_not_taken
:
10002 return costs
->cond_not_taken_branch_cost
;
10005 return costs
->vec_permute_cost
;
10007 case vec_promote_demote
:
10008 return fp
? costs
->vec_fp_stmt_cost
: costs
->vec_int_stmt_cost
;
10010 case vec_construct
:
10011 elements
= estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype
));
10012 return elements
/ 2 + 1;
10015 gcc_unreachable ();
10019 /* Implement targetm.vectorize.add_stmt_cost. */
10021 aarch64_add_stmt_cost (void *data
, int count
, enum vect_cost_for_stmt kind
,
10022 struct _stmt_vec_info
*stmt_info
, int misalign
,
10023 enum vect_cost_model_location where
)
10025 unsigned *cost
= (unsigned *) data
;
10026 unsigned retval
= 0;
10028 if (flag_vect_cost_model
)
10030 tree vectype
= stmt_info
? stmt_vectype (stmt_info
) : NULL_TREE
;
10032 aarch64_builtin_vectorization_cost (kind
, vectype
, misalign
);
10034 /* Statements in an inner loop relative to the loop being
10035 vectorized are weighted more heavily. The value here is
10036 arbitrary and could potentially be improved with analysis. */
10037 if (where
== vect_body
&& stmt_info
&& stmt_in_inner_loop_p (stmt_info
))
10038 count
*= 50; /* FIXME */
10040 retval
= (unsigned) (count
* stmt_cost
);
10041 cost
[where
] += retval
;
10047 static void initialize_aarch64_code_model (struct gcc_options
*);
10049 /* Parse the TO_PARSE string and put the architecture struct that it
10050 selects into RES and the architectural features into ISA_FLAGS.
10051 Return an aarch64_parse_opt_result describing the parse result.
10052 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
10054 static enum aarch64_parse_opt_result
10055 aarch64_parse_arch (const char *to_parse
, const struct processor
**res
,
10056 unsigned long *isa_flags
)
10059 const struct processor
*arch
;
10060 char *str
= (char *) alloca (strlen (to_parse
) + 1);
10063 strcpy (str
, to_parse
);
10065 ext
= strchr (str
, '+');
10070 len
= strlen (str
);
10073 return AARCH64_PARSE_MISSING_ARG
;
10076 /* Loop through the list of supported ARCHes to find a match. */
10077 for (arch
= all_architectures
; arch
->name
!= NULL
; arch
++)
10079 if (strlen (arch
->name
) == len
&& strncmp (arch
->name
, str
, len
) == 0)
10081 unsigned long isa_temp
= arch
->flags
;
10085 /* TO_PARSE string contains at least one extension. */
10086 enum aarch64_parse_opt_result ext_res
10087 = aarch64_parse_extension (ext
, &isa_temp
);
10089 if (ext_res
!= AARCH64_PARSE_OK
)
10092 /* Extension parsing was successful. Confirm the result
10093 arch and ISA flags. */
10095 *isa_flags
= isa_temp
;
10096 return AARCH64_PARSE_OK
;
10100 /* ARCH name not found in list. */
10101 return AARCH64_PARSE_INVALID_ARG
;
10104 /* Parse the TO_PARSE string and put the result tuning in RES and the
10105 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
10106 describing the parse result. If there is an error parsing, RES and
10107 ISA_FLAGS are left unchanged. */
10109 static enum aarch64_parse_opt_result
10110 aarch64_parse_cpu (const char *to_parse
, const struct processor
**res
,
10111 unsigned long *isa_flags
)
10114 const struct processor
*cpu
;
10115 char *str
= (char *) alloca (strlen (to_parse
) + 1);
10118 strcpy (str
, to_parse
);
10120 ext
= strchr (str
, '+');
10125 len
= strlen (str
);
10128 return AARCH64_PARSE_MISSING_ARG
;
10131 /* Loop through the list of supported CPUs to find a match. */
10132 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
10134 if (strlen (cpu
->name
) == len
&& strncmp (cpu
->name
, str
, len
) == 0)
10136 unsigned long isa_temp
= cpu
->flags
;
10141 /* TO_PARSE string contains at least one extension. */
10142 enum aarch64_parse_opt_result ext_res
10143 = aarch64_parse_extension (ext
, &isa_temp
);
10145 if (ext_res
!= AARCH64_PARSE_OK
)
10148 /* Extension parsing was successfull. Confirm the result
10149 cpu and ISA flags. */
10151 *isa_flags
= isa_temp
;
10152 return AARCH64_PARSE_OK
;
10156 /* CPU name not found in list. */
10157 return AARCH64_PARSE_INVALID_ARG
;
10160 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10161 Return an aarch64_parse_opt_result describing the parse result.
10162 If the parsing fails the RES does not change. */
10164 static enum aarch64_parse_opt_result
10165 aarch64_parse_tune (const char *to_parse
, const struct processor
**res
)
10167 const struct processor
*cpu
;
10168 char *str
= (char *) alloca (strlen (to_parse
) + 1);
10170 strcpy (str
, to_parse
);
10172 /* Loop through the list of supported CPUs to find a match. */
10173 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
10175 if (strcmp (cpu
->name
, str
) == 0)
10178 return AARCH64_PARSE_OK
;
10182 /* CPU name not found in list. */
10183 return AARCH64_PARSE_INVALID_ARG
;
10186 /* Parse TOKEN, which has length LENGTH to see if it is an option
10187 described in FLAG. If it is, return the index bit for that fusion type.
10188 If not, error (printing OPTION_NAME) and return zero. */
10190 static unsigned int
10191 aarch64_parse_one_option_token (const char *token
,
10193 const struct aarch64_flag_desc
*flag
,
10194 const char *option_name
)
10196 for (; flag
->name
!= NULL
; flag
++)
10198 if (length
== strlen (flag
->name
)
10199 && !strncmp (flag
->name
, token
, length
))
10203 error ("unknown flag passed in -moverride=%s (%s)", option_name
, token
);
10207 /* Parse OPTION which is a comma-separated list of flags to enable.
10208 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10209 default state we inherit from the CPU tuning structures. OPTION_NAME
10210 gives the top-level option we are parsing in the -moverride string,
10211 for use in error messages. */
10213 static unsigned int
10214 aarch64_parse_boolean_options (const char *option
,
10215 const struct aarch64_flag_desc
*flags
,
10216 unsigned int initial_state
,
10217 const char *option_name
)
10219 const char separator
= '.';
10220 const char* specs
= option
;
10221 const char* ntoken
= option
;
10222 unsigned int found_flags
= initial_state
;
10224 while ((ntoken
= strchr (specs
, separator
)))
10226 size_t token_length
= ntoken
- specs
;
10227 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
10231 /* If we find "none" (or, for simplicity's sake, an error) anywhere
10232 in the token stream, reset the supported operations. So:
10234 adrp+add.cmp+branch.none.adrp+add
10236 would have the result of turning on only adrp+add fusion. */
10240 found_flags
|= token_ops
;
10244 /* We ended with a comma, print something. */
10247 error ("%s string ill-formed\n", option_name
);
10251 /* We still have one more token to parse. */
10252 size_t token_length
= strlen (specs
);
10253 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
10260 found_flags
|= token_ops
;
10261 return found_flags
;
10264 /* Support for overriding instruction fusion. */
10267 aarch64_parse_fuse_string (const char *fuse_string
,
10268 struct tune_params
*tune
)
10270 tune
->fusible_ops
= aarch64_parse_boolean_options (fuse_string
,
10271 aarch64_fusible_pairs
,
10276 /* Support for overriding other tuning flags. */
10279 aarch64_parse_tune_string (const char *tune_string
,
10280 struct tune_params
*tune
)
10282 tune
->extra_tuning_flags
10283 = aarch64_parse_boolean_options (tune_string
,
10284 aarch64_tuning_flags
,
10285 tune
->extra_tuning_flags
,
10289 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
10290 we understand. If it is, extract the option string and handoff to
10291 the appropriate function. */
10294 aarch64_parse_one_override_token (const char* token
,
10296 struct tune_params
*tune
)
10298 const struct aarch64_tuning_override_function
*fn
10299 = aarch64_tuning_override_functions
;
10301 const char *option_part
= strchr (token
, '=');
10304 error ("tuning string missing in option (%s)", token
);
10308 /* Get the length of the option name. */
10309 length
= option_part
- token
;
10310 /* Skip the '=' to get to the option string. */
10313 for (; fn
->name
!= NULL
; fn
++)
10315 if (!strncmp (fn
->name
, token
, length
))
10317 fn
->parse_override (option_part
, tune
);
10322 error ("unknown tuning option (%s)",token
);
10326 /* A checking mechanism for the implementation of the tls size. */
10329 initialize_aarch64_tls_size (struct gcc_options
*opts
)
10331 if (aarch64_tls_size
== 0)
10332 aarch64_tls_size
= 24;
10334 switch (opts
->x_aarch64_cmodel_var
)
10336 case AARCH64_CMODEL_TINY
:
10337 /* Both the default and maximum TLS size allowed under tiny is 1M which
10338 needs two instructions to address, so we clamp the size to 24. */
10339 if (aarch64_tls_size
> 24)
10340 aarch64_tls_size
= 24;
10342 case AARCH64_CMODEL_SMALL
:
10343 /* The maximum TLS size allowed under small is 4G. */
10344 if (aarch64_tls_size
> 32)
10345 aarch64_tls_size
= 32;
10347 case AARCH64_CMODEL_LARGE
:
10348 /* The maximum TLS size allowed under large is 16E.
10349 FIXME: 16E should be 64bit, we only support 48bit offset now. */
10350 if (aarch64_tls_size
> 48)
10351 aarch64_tls_size
= 48;
10354 gcc_unreachable ();
10360 /* Parse STRING looking for options in the format:
10361 string :: option:string
10362 option :: name=substring
10364 substring :: defined by option. */
10367 aarch64_parse_override_string (const char* input_string
,
10368 struct tune_params
* tune
)
10370 const char separator
= ':';
10371 size_t string_length
= strlen (input_string
) + 1;
10372 char *string_root
= (char *) xmalloc (sizeof (*string_root
) * string_length
);
10373 char *string
= string_root
;
10374 strncpy (string
, input_string
, string_length
);
10375 string
[string_length
- 1] = '\0';
10377 char* ntoken
= string
;
10379 while ((ntoken
= strchr (string
, separator
)))
10381 size_t token_length
= ntoken
- string
;
10382 /* Make this substring look like a string. */
10384 aarch64_parse_one_override_token (string
, token_length
, tune
);
10388 /* One last option to parse. */
10389 aarch64_parse_one_override_token (string
, strlen (string
), tune
);
10390 free (string_root
);
10395 aarch64_override_options_after_change_1 (struct gcc_options
*opts
)
10397 /* PR 70044: We have to be careful about being called multiple times for the
10398 same function. This means all changes should be repeatable. */
10400 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
10401 Disable the frame pointer flag so the mid-end will not use a frame
10402 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
10403 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
10404 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
10405 aarch64_use_frame_pointer
= opts
->x_flag_omit_frame_pointer
!= 1;
10406 if (opts
->x_flag_omit_frame_pointer
== 0)
10407 opts
->x_flag_omit_frame_pointer
= 2;
10409 /* If not optimizing for size, set the default
10410 alignment to what the target wants. */
10411 if (!opts
->x_optimize_size
)
10413 if (opts
->x_flag_align_loops
&& !opts
->x_str_align_loops
)
10414 opts
->x_str_align_loops
= aarch64_tune_params
.loop_align
;
10415 if (opts
->x_flag_align_jumps
&& !opts
->x_str_align_jumps
)
10416 opts
->x_str_align_jumps
= aarch64_tune_params
.jump_align
;
10417 if (opts
->x_flag_align_functions
&& !opts
->x_str_align_functions
)
10418 opts
->x_str_align_functions
= aarch64_tune_params
.function_align
;
10421 /* We default to no pc-relative literal loads. */
10423 aarch64_pcrelative_literal_loads
= false;
10425 /* If -mpc-relative-literal-loads is set on the command line, this
10426 implies that the user asked for PC relative literal loads. */
10427 if (opts
->x_pcrelative_literal_loads
== 1)
10428 aarch64_pcrelative_literal_loads
= true;
10430 /* In the tiny memory model it makes no sense to disallow PC relative
10431 literal pool loads. */
10432 if (aarch64_cmodel
== AARCH64_CMODEL_TINY
10433 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
10434 aarch64_pcrelative_literal_loads
= true;
10436 /* When enabling the lower precision Newton series for the square root, also
10437 enable it for the reciprocal square root, since the latter is an
10438 intermediary step for the former. */
10439 if (flag_mlow_precision_sqrt
)
10440 flag_mrecip_low_precision_sqrt
= true;
10443 /* 'Unpack' up the internal tuning structs and update the options
10444 in OPTS. The caller must have set up selected_tune and selected_arch
10445 as all the other target-specific codegen decisions are
10446 derived from them. */
10449 aarch64_override_options_internal (struct gcc_options
*opts
)
10451 aarch64_tune_flags
= selected_tune
->flags
;
10452 aarch64_tune
= selected_tune
->sched_core
;
10453 /* Make a copy of the tuning parameters attached to the core, which
10454 we may later overwrite. */
10455 aarch64_tune_params
= *(selected_tune
->tune
);
10456 aarch64_architecture_version
= selected_arch
->architecture_version
;
10458 if (opts
->x_aarch64_override_tune_string
)
10459 aarch64_parse_override_string (opts
->x_aarch64_override_tune_string
,
10460 &aarch64_tune_params
);
10462 /* This target defaults to strict volatile bitfields. */
10463 if (opts
->x_flag_strict_volatile_bitfields
< 0 && abi_version_at_least (2))
10464 opts
->x_flag_strict_volatile_bitfields
= 1;
10466 initialize_aarch64_code_model (opts
);
10467 initialize_aarch64_tls_size (opts
);
10469 int queue_depth
= 0;
10470 switch (aarch64_tune_params
.autoprefetcher_model
)
10472 case tune_params::AUTOPREFETCHER_OFF
:
10475 case tune_params::AUTOPREFETCHER_WEAK
:
10478 case tune_params::AUTOPREFETCHER_STRONG
:
10479 queue_depth
= max_insn_queue_index
+ 1;
10482 gcc_unreachable ();
10485 /* We don't mind passing in global_options_set here as we don't use
10486 the *options_set structs anyway. */
10487 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH
,
10489 opts
->x_param_values
,
10490 global_options_set
.x_param_values
);
10492 /* Set up parameters to be used in prefetching algorithm. Do not
10493 override the defaults unless we are tuning for a core we have
10494 researched values for. */
10495 if (aarch64_tune_params
.prefetch
->num_slots
> 0)
10496 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES
,
10497 aarch64_tune_params
.prefetch
->num_slots
,
10498 opts
->x_param_values
,
10499 global_options_set
.x_param_values
);
10500 if (aarch64_tune_params
.prefetch
->l1_cache_size
>= 0)
10501 maybe_set_param_value (PARAM_L1_CACHE_SIZE
,
10502 aarch64_tune_params
.prefetch
->l1_cache_size
,
10503 opts
->x_param_values
,
10504 global_options_set
.x_param_values
);
10505 if (aarch64_tune_params
.prefetch
->l1_cache_line_size
>= 0)
10506 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE
,
10507 aarch64_tune_params
.prefetch
->l1_cache_line_size
,
10508 opts
->x_param_values
,
10509 global_options_set
.x_param_values
);
10510 if (aarch64_tune_params
.prefetch
->l2_cache_size
>= 0)
10511 maybe_set_param_value (PARAM_L2_CACHE_SIZE
,
10512 aarch64_tune_params
.prefetch
->l2_cache_size
,
10513 opts
->x_param_values
,
10514 global_options_set
.x_param_values
);
10515 if (!aarch64_tune_params
.prefetch
->prefetch_dynamic_strides
)
10516 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES
,
10518 opts
->x_param_values
,
10519 global_options_set
.x_param_values
);
10520 if (aarch64_tune_params
.prefetch
->minimum_stride
>= 0)
10521 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE
,
10522 aarch64_tune_params
.prefetch
->minimum_stride
,
10523 opts
->x_param_values
,
10524 global_options_set
.x_param_values
);
10526 /* Use the alternative scheduling-pressure algorithm by default. */
10527 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM
, SCHED_PRESSURE_MODEL
,
10528 opts
->x_param_values
,
10529 global_options_set
.x_param_values
);
10531 /* Enable sw prefetching at specified optimization level for
10532 CPUS that have prefetch. Lower optimization level threshold by 1
10533 when profiling is enabled. */
10534 if (opts
->x_flag_prefetch_loop_arrays
< 0
10535 && !opts
->x_optimize_size
10536 && aarch64_tune_params
.prefetch
->default_opt_level
>= 0
10537 && opts
->x_optimize
>= aarch64_tune_params
.prefetch
->default_opt_level
)
10538 opts
->x_flag_prefetch_loop_arrays
= 1;
10540 aarch64_override_options_after_change_1 (opts
);
10543 /* Print a hint with a suggestion for a core or architecture name that
10544 most closely resembles what the user passed in STR. ARCH is true if
10545 the user is asking for an architecture name. ARCH is false if the user
10546 is asking for a core name. */
10549 aarch64_print_hint_for_core_or_arch (const char *str
, bool arch
)
10551 auto_vec
<const char *> candidates
;
10552 const struct processor
*entry
= arch
? all_architectures
: all_cores
;
10553 for (; entry
->name
!= NULL
; entry
++)
10554 candidates
.safe_push (entry
->name
);
10556 #ifdef HAVE_LOCAL_CPU_DETECT
10557 /* Add also "native" as possible value. */
10559 candidates
.safe_push ("native");
10563 const char *hint
= candidates_list_and_hint (str
, s
, candidates
);
10565 inform (input_location
, "valid arguments are: %s;"
10566 " did you mean %qs?", s
, hint
);
10568 inform (input_location
, "valid arguments are: %s", s
);
10573 /* Print a hint with a suggestion for a core name that most closely resembles
10574 what the user passed in STR. */
10577 aarch64_print_hint_for_core (const char *str
)
10579 aarch64_print_hint_for_core_or_arch (str
, false);
10582 /* Print a hint with a suggestion for an architecture name that most closely
10583 resembles what the user passed in STR. */
10586 aarch64_print_hint_for_arch (const char *str
)
10588 aarch64_print_hint_for_core_or_arch (str
, true);
10591 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
10592 specified in STR and throw errors if appropriate. Put the results if
10593 they are valid in RES and ISA_FLAGS. Return whether the option is
10597 aarch64_validate_mcpu (const char *str
, const struct processor
**res
,
10598 unsigned long *isa_flags
)
10600 enum aarch64_parse_opt_result parse_res
10601 = aarch64_parse_cpu (str
, res
, isa_flags
);
10603 if (parse_res
== AARCH64_PARSE_OK
)
10608 case AARCH64_PARSE_MISSING_ARG
:
10609 error ("missing cpu name in %<-mcpu=%s%>", str
);
10611 case AARCH64_PARSE_INVALID_ARG
:
10612 error ("unknown value %qs for -mcpu", str
);
10613 aarch64_print_hint_for_core (str
);
10615 case AARCH64_PARSE_INVALID_FEATURE
:
10616 error ("invalid feature modifier in %<-mcpu=%s%>", str
);
10619 gcc_unreachable ();
10625 /* Validate a command-line -march option. Parse the arch and extensions
10626 (if any) specified in STR and throw errors if appropriate. Put the
10627 results, if they are valid, in RES and ISA_FLAGS. Return whether the
10628 option is valid. */
10631 aarch64_validate_march (const char *str
, const struct processor
**res
,
10632 unsigned long *isa_flags
)
10634 enum aarch64_parse_opt_result parse_res
10635 = aarch64_parse_arch (str
, res
, isa_flags
);
10637 if (parse_res
== AARCH64_PARSE_OK
)
10642 case AARCH64_PARSE_MISSING_ARG
:
10643 error ("missing arch name in %<-march=%s%>", str
);
10645 case AARCH64_PARSE_INVALID_ARG
:
10646 error ("unknown value %qs for -march", str
);
10647 aarch64_print_hint_for_arch (str
);
10649 case AARCH64_PARSE_INVALID_FEATURE
:
10650 error ("invalid feature modifier in %<-march=%s%>", str
);
10653 gcc_unreachable ();
10659 /* Validate a command-line -mtune option. Parse the cpu
10660 specified in STR and throw errors if appropriate. Put the
10661 result, if it is valid, in RES. Return whether the option is
10665 aarch64_validate_mtune (const char *str
, const struct processor
**res
)
10667 enum aarch64_parse_opt_result parse_res
10668 = aarch64_parse_tune (str
, res
);
10670 if (parse_res
== AARCH64_PARSE_OK
)
10675 case AARCH64_PARSE_MISSING_ARG
:
10676 error ("missing cpu name in %<-mtune=%s%>", str
);
10678 case AARCH64_PARSE_INVALID_ARG
:
10679 error ("unknown value %qs for -mtune", str
);
10680 aarch64_print_hint_for_core (str
);
10683 gcc_unreachable ();
10688 /* Return the CPU corresponding to the enum CPU.
10689 If it doesn't specify a cpu, return the default. */
10691 static const struct processor
*
10692 aarch64_get_tune_cpu (enum aarch64_processor cpu
)
10694 if (cpu
!= aarch64_none
)
10695 return &all_cores
[cpu
];
10697 /* The & 0x3f is to extract the bottom 6 bits that encode the
10698 default cpu as selected by the --with-cpu GCC configure option
10700 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10701 flags mechanism should be reworked to make it more sane. */
10702 return &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
10705 /* Return the architecture corresponding to the enum ARCH.
10706 If it doesn't specify a valid architecture, return the default. */
10708 static const struct processor
*
10709 aarch64_get_arch (enum aarch64_arch arch
)
10711 if (arch
!= aarch64_no_arch
)
10712 return &all_architectures
[arch
];
10714 const struct processor
*cpu
= &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
10716 return &all_architectures
[cpu
->arch
];
10719 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
10722 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value
)
10724 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10725 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10726 deciding which .md file patterns to use and when deciding whether
10727 something is a legitimate address or constant. */
10728 if (value
== SVE_SCALABLE
|| value
== SVE_128
)
10729 return poly_uint16 (2, 2);
10731 return (int) value
/ 64;
10734 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
10735 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
10736 tuning structs. In particular it must set selected_tune and
10737 aarch64_isa_flags that define the available ISA features and tuning
10738 decisions. It must also set selected_arch as this will be used to
10739 output the .arch asm tags for each function. */
10742 aarch64_override_options (void)
10744 unsigned long cpu_isa
= 0;
10745 unsigned long arch_isa
= 0;
10746 aarch64_isa_flags
= 0;
10748 bool valid_cpu
= true;
10749 bool valid_tune
= true;
10750 bool valid_arch
= true;
10752 selected_cpu
= NULL
;
10753 selected_arch
= NULL
;
10754 selected_tune
= NULL
;
10756 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10757 If either of -march or -mtune is given, they override their
10758 respective component of -mcpu. */
10759 if (aarch64_cpu_string
)
10760 valid_cpu
= aarch64_validate_mcpu (aarch64_cpu_string
, &selected_cpu
,
10763 if (aarch64_arch_string
)
10764 valid_arch
= aarch64_validate_march (aarch64_arch_string
, &selected_arch
,
10767 if (aarch64_tune_string
)
10768 valid_tune
= aarch64_validate_mtune (aarch64_tune_string
, &selected_tune
);
10770 /* If the user did not specify a processor, choose the default
10771 one for them. This will be the CPU set during configuration using
10772 --with-cpu, otherwise it is "generic". */
10777 selected_cpu
= &all_cores
[selected_arch
->ident
];
10778 aarch64_isa_flags
= arch_isa
;
10779 explicit_arch
= selected_arch
->arch
;
10783 /* Get default configure-time CPU. */
10784 selected_cpu
= aarch64_get_tune_cpu (aarch64_none
);
10785 aarch64_isa_flags
= TARGET_CPU_DEFAULT
>> 6;
10789 explicit_tune_core
= selected_tune
->ident
;
10791 /* If both -mcpu and -march are specified check that they are architecturally
10792 compatible, warn if they're not and prefer the -march ISA flags. */
10793 else if (selected_arch
)
10795 if (selected_arch
->arch
!= selected_cpu
->arch
)
10797 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10798 all_architectures
[selected_cpu
->arch
].name
,
10799 selected_arch
->name
);
10801 aarch64_isa_flags
= arch_isa
;
10802 explicit_arch
= selected_arch
->arch
;
10803 explicit_tune_core
= selected_tune
? selected_tune
->ident
10804 : selected_cpu
->ident
;
10808 /* -mcpu but no -march. */
10809 aarch64_isa_flags
= cpu_isa
;
10810 explicit_tune_core
= selected_tune
? selected_tune
->ident
10811 : selected_cpu
->ident
;
10812 gcc_assert (selected_cpu
);
10813 selected_arch
= &all_architectures
[selected_cpu
->arch
];
10814 explicit_arch
= selected_arch
->arch
;
10817 /* Set the arch as well as we will need it when outputing
10818 the .arch directive in assembly. */
10819 if (!selected_arch
)
10821 gcc_assert (selected_cpu
);
10822 selected_arch
= &all_architectures
[selected_cpu
->arch
];
10825 if (!selected_tune
)
10826 selected_tune
= selected_cpu
;
10828 #ifndef HAVE_AS_MABI_OPTION
10829 /* The compiler may have been configured with 2.23.* binutils, which does
10830 not have support for ILP32. */
10832 error ("assembler does not support -mabi=ilp32");
10835 /* Convert -msve-vector-bits to a VG count. */
10836 aarch64_sve_vg
= aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits
);
10838 if (aarch64_ra_sign_scope
!= AARCH64_FUNCTION_NONE
&& TARGET_ILP32
)
10839 sorry ("return address signing is only supported for -mabi=lp64");
10841 /* Make sure we properly set up the explicit options. */
10842 if ((aarch64_cpu_string
&& valid_cpu
)
10843 || (aarch64_tune_string
&& valid_tune
))
10844 gcc_assert (explicit_tune_core
!= aarch64_none
);
10846 if ((aarch64_cpu_string
&& valid_cpu
)
10847 || (aarch64_arch_string
&& valid_arch
))
10848 gcc_assert (explicit_arch
!= aarch64_no_arch
);
10850 aarch64_override_options_internal (&global_options
);
10852 /* Save these options as the default ones in case we push and pop them later
10853 while processing functions with potential target attributes. */
10854 target_option_default_node
= target_option_current_node
10855 = build_target_option_node (&global_options
);
10858 /* Implement targetm.override_options_after_change. */
10861 aarch64_override_options_after_change (void)
10863 aarch64_override_options_after_change_1 (&global_options
);
10866 static struct machine_function
*
10867 aarch64_init_machine_status (void)
10869 struct machine_function
*machine
;
10870 machine
= ggc_cleared_alloc
<machine_function
> ();
10875 aarch64_init_expanders (void)
10877 init_machine_status
= aarch64_init_machine_status
;
10880 /* A checking mechanism for the implementation of the various code models. */
10882 initialize_aarch64_code_model (struct gcc_options
*opts
)
10884 if (opts
->x_flag_pic
)
10886 switch (opts
->x_aarch64_cmodel_var
)
10888 case AARCH64_CMODEL_TINY
:
10889 aarch64_cmodel
= AARCH64_CMODEL_TINY_PIC
;
10891 case AARCH64_CMODEL_SMALL
:
10892 #ifdef HAVE_AS_SMALL_PIC_RELOCS
10893 aarch64_cmodel
= (flag_pic
== 2
10894 ? AARCH64_CMODEL_SMALL_PIC
10895 : AARCH64_CMODEL_SMALL_SPIC
);
10897 aarch64_cmodel
= AARCH64_CMODEL_SMALL_PIC
;
10900 case AARCH64_CMODEL_LARGE
:
10901 sorry ("code model %qs with -f%s", "large",
10902 opts
->x_flag_pic
> 1 ? "PIC" : "pic");
10905 gcc_unreachable ();
10909 aarch64_cmodel
= opts
->x_aarch64_cmodel_var
;
10912 /* Implement TARGET_OPTION_SAVE. */
10915 aarch64_option_save (struct cl_target_option
*ptr
, struct gcc_options
*opts
)
10917 ptr
->x_aarch64_override_tune_string
= opts
->x_aarch64_override_tune_string
;
10920 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
10921 using the information saved in PTR. */
10924 aarch64_option_restore (struct gcc_options
*opts
, struct cl_target_option
*ptr
)
10926 opts
->x_explicit_tune_core
= ptr
->x_explicit_tune_core
;
10927 selected_tune
= aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
10928 opts
->x_explicit_arch
= ptr
->x_explicit_arch
;
10929 selected_arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
10930 opts
->x_aarch64_override_tune_string
= ptr
->x_aarch64_override_tune_string
;
10932 aarch64_override_options_internal (opts
);
10935 /* Implement TARGET_OPTION_PRINT. */
10938 aarch64_option_print (FILE *file
, int indent
, struct cl_target_option
*ptr
)
10940 const struct processor
*cpu
10941 = aarch64_get_tune_cpu (ptr
->x_explicit_tune_core
);
10942 unsigned long isa_flags
= ptr
->x_aarch64_isa_flags
;
10943 const struct processor
*arch
= aarch64_get_arch (ptr
->x_explicit_arch
);
10944 std::string extension
10945 = aarch64_get_extension_string_for_isa_flags (isa_flags
, arch
->flags
);
10947 fprintf (file
, "%*sselected tune = %s\n", indent
, "", cpu
->name
);
10948 fprintf (file
, "%*sselected arch = %s%s\n", indent
, "",
10949 arch
->name
, extension
.c_str ());
10952 static GTY(()) tree aarch64_previous_fndecl
;
10955 aarch64_reset_previous_fndecl (void)
10957 aarch64_previous_fndecl
= NULL
;
10960 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
10961 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
10962 make sure optab availability predicates are recomputed when necessary. */
10965 aarch64_save_restore_target_globals (tree new_tree
)
10967 if (TREE_TARGET_GLOBALS (new_tree
))
10968 restore_target_globals (TREE_TARGET_GLOBALS (new_tree
));
10969 else if (new_tree
== target_option_default_node
)
10970 restore_target_globals (&default_target_globals
);
10972 TREE_TARGET_GLOBALS (new_tree
) = save_target_globals_default_opts ();
10975 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
10976 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
10977 of the function, if such exists. This function may be called multiple
10978 times on a single function so use aarch64_previous_fndecl to avoid
10979 setting up identical state. */
10982 aarch64_set_current_function (tree fndecl
)
10984 if (!fndecl
|| fndecl
== aarch64_previous_fndecl
)
10987 tree old_tree
= (aarch64_previous_fndecl
10988 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl
)
10991 tree new_tree
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
10993 /* If current function has no attributes but the previous one did,
10994 use the default node. */
10995 if (!new_tree
&& old_tree
)
10996 new_tree
= target_option_default_node
;
10998 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
10999 the default have been handled by aarch64_save_restore_target_globals from
11000 aarch64_pragma_target_parse. */
11001 if (old_tree
== new_tree
)
11004 aarch64_previous_fndecl
= fndecl
;
11006 /* First set the target options. */
11007 cl_target_option_restore (&global_options
, TREE_TARGET_OPTION (new_tree
));
11009 aarch64_save_restore_target_globals (new_tree
);
/* Enum describing the various ways we can handle attributes.
   In many cases we can reuse the generic option handling machinery.  */

enum aarch64_attr_opt_type
{
  aarch64_attr_mask,	/* Attribute should set a bit in target_flags.  */
  aarch64_attr_bool,	/* Attribute sets or unsets a boolean variable.  */
  aarch64_attr_enum,	/* Attribute sets an enum variable.  */
  aarch64_attr_custom	/* Attribute requires a custom handling function.  */
};
11023 /* All the information needed to handle a target attribute.
11024 NAME is the name of the attribute.
11025 ATTR_TYPE specifies the type of behavior of the attribute as described
11026 in the definition of enum aarch64_attr_opt_type.
11027 ALLOW_NEG is true if the attribute supports a "no-" form.
11028 HANDLER is the function that takes the attribute string as an argument
11029 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
11030 OPT_NUM is the enum specifying the option that the attribute modifies.
11031 This is needed for attributes that mirror the behavior of a command-line
11032 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
11033 aarch64_attr_enum. */
11035 struct aarch64_attribute_info
11038 enum aarch64_attr_opt_type attr_type
;
11040 bool (*handler
) (const char *);
11041 enum opt_code opt_num
;
11044 /* Handle the ARCH_STR argument to the arch= target attribute. */
11047 aarch64_handle_attr_arch (const char *str
)
11049 const struct processor
*tmp_arch
= NULL
;
11050 enum aarch64_parse_opt_result parse_res
11051 = aarch64_parse_arch (str
, &tmp_arch
, &aarch64_isa_flags
);
11053 if (parse_res
== AARCH64_PARSE_OK
)
11055 gcc_assert (tmp_arch
);
11056 selected_arch
= tmp_arch
;
11057 explicit_arch
= selected_arch
->arch
;
11063 case AARCH64_PARSE_MISSING_ARG
:
11064 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
11066 case AARCH64_PARSE_INVALID_ARG
:
11067 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str
);
11068 aarch64_print_hint_for_arch (str
);
11070 case AARCH64_PARSE_INVALID_FEATURE
:
11071 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str
);
11074 gcc_unreachable ();
11080 /* Handle the argument CPU_STR to the cpu= target attribute. */
11083 aarch64_handle_attr_cpu (const char *str
)
11085 const struct processor
*tmp_cpu
= NULL
;
11086 enum aarch64_parse_opt_result parse_res
11087 = aarch64_parse_cpu (str
, &tmp_cpu
, &aarch64_isa_flags
);
11089 if (parse_res
== AARCH64_PARSE_OK
)
11091 gcc_assert (tmp_cpu
);
11092 selected_tune
= tmp_cpu
;
11093 explicit_tune_core
= selected_tune
->ident
;
11095 selected_arch
= &all_architectures
[tmp_cpu
->arch
];
11096 explicit_arch
= selected_arch
->arch
;
11102 case AARCH64_PARSE_MISSING_ARG
:
11103 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
11105 case AARCH64_PARSE_INVALID_ARG
:
11106 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str
);
11107 aarch64_print_hint_for_core (str
);
11109 case AARCH64_PARSE_INVALID_FEATURE
:
11110 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str
);
11113 gcc_unreachable ();
11119 /* Handle the argument STR to the tune= target attribute. */
11122 aarch64_handle_attr_tune (const char *str
)
11124 const struct processor
*tmp_tune
= NULL
;
11125 enum aarch64_parse_opt_result parse_res
11126 = aarch64_parse_tune (str
, &tmp_tune
);
11128 if (parse_res
== AARCH64_PARSE_OK
)
11130 gcc_assert (tmp_tune
);
11131 selected_tune
= tmp_tune
;
11132 explicit_tune_core
= selected_tune
->ident
;
11138 case AARCH64_PARSE_INVALID_ARG
:
11139 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str
);
11140 aarch64_print_hint_for_core (str
);
11143 gcc_unreachable ();
11149 /* Parse an architecture extensions target attribute string specified in STR.
11150 For example "+fp+nosimd". Show any errors if needed. Return TRUE
11151 if successful. Update aarch64_isa_flags to reflect the ISA features
11155 aarch64_handle_attr_isa_flags (char *str
)
11157 enum aarch64_parse_opt_result parse_res
;
11158 unsigned long isa_flags
= aarch64_isa_flags
;
11160 /* We allow "+nothing" in the beginning to clear out all architectural
11161 features if the user wants to handpick specific features. */
11162 if (strncmp ("+nothing", str
, 8) == 0)
11168 parse_res
= aarch64_parse_extension (str
, &isa_flags
);
11170 if (parse_res
== AARCH64_PARSE_OK
)
11172 aarch64_isa_flags
= isa_flags
;
11178 case AARCH64_PARSE_MISSING_ARG
:
11179 error ("missing value in %<target()%> pragma or attribute");
11182 case AARCH64_PARSE_INVALID_FEATURE
:
11183 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str
);
11187 gcc_unreachable ();
11193 /* The target attributes that we support. On top of these we also support just
11194 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
11195 handled explicitly in aarch64_process_one_target_attr. */
11197 static const struct aarch64_attribute_info aarch64_attributes
[] =
11199 { "general-regs-only", aarch64_attr_mask
, false, NULL
,
11200 OPT_mgeneral_regs_only
},
11201 { "fix-cortex-a53-835769", aarch64_attr_bool
, true, NULL
,
11202 OPT_mfix_cortex_a53_835769
},
11203 { "fix-cortex-a53-843419", aarch64_attr_bool
, true, NULL
,
11204 OPT_mfix_cortex_a53_843419
},
11205 { "cmodel", aarch64_attr_enum
, false, NULL
, OPT_mcmodel_
},
11206 { "strict-align", aarch64_attr_mask
, true, NULL
, OPT_mstrict_align
},
11207 { "omit-leaf-frame-pointer", aarch64_attr_bool
, true, NULL
,
11208 OPT_momit_leaf_frame_pointer
},
11209 { "tls-dialect", aarch64_attr_enum
, false, NULL
, OPT_mtls_dialect_
},
11210 { "arch", aarch64_attr_custom
, false, aarch64_handle_attr_arch
,
11212 { "cpu", aarch64_attr_custom
, false, aarch64_handle_attr_cpu
, OPT_mcpu_
},
11213 { "tune", aarch64_attr_custom
, false, aarch64_handle_attr_tune
,
11215 { "sign-return-address", aarch64_attr_enum
, false, NULL
,
11216 OPT_msign_return_address_
},
11217 { NULL
, aarch64_attr_custom
, false, NULL
, OPT____
}
11220 /* Parse ARG_STR which contains the definition of one target attribute.
11221 Show appropriate errors if any or return true if the attribute is valid. */
11224 aarch64_process_one_target_attr (char *arg_str
)
11226 bool invert
= false;
11228 size_t len
= strlen (arg_str
);
11232 error ("malformed %<target()%> pragma or attribute");
11236 char *str_to_check
= (char *) alloca (len
+ 1);
11237 strcpy (str_to_check
, arg_str
);
11239 /* Skip leading whitespace. */
11240 while (*str_to_check
== ' ' || *str_to_check
== '\t')
11243 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11244 It is easier to detect and handle it explicitly here rather than going
11245 through the machinery for the rest of the target attributes in this
11247 if (*str_to_check
== '+')
11248 return aarch64_handle_attr_isa_flags (str_to_check
);
11250 if (len
> 3 && strncmp (str_to_check
, "no-", 3) == 0)
11255 char *arg
= strchr (str_to_check
, '=');
11257 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11258 and point ARG to "foo". */
11264 const struct aarch64_attribute_info
*p_attr
;
11265 bool found
= false;
11266 for (p_attr
= aarch64_attributes
; p_attr
->name
; p_attr
++)
11268 /* If the names don't match up, or the user has given an argument
11269 to an attribute that doesn't accept one, or didn't give an argument
11270 to an attribute that expects one, fail to match. */
11271 if (strcmp (str_to_check
, p_attr
->name
) != 0)
11275 bool attr_need_arg_p
= p_attr
->attr_type
== aarch64_attr_custom
11276 || p_attr
->attr_type
== aarch64_attr_enum
;
11278 if (attr_need_arg_p
^ (arg
!= NULL
))
11280 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check
);
11284 /* If the name matches but the attribute does not allow "no-" versions
11285 then we can't match. */
11286 if (invert
&& !p_attr
->allow_neg
)
11288 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check
);
11292 switch (p_attr
->attr_type
)
11294 /* Has a custom handler registered.
11295 For example, cpu=, arch=, tune=. */
11296 case aarch64_attr_custom
:
11297 gcc_assert (p_attr
->handler
);
11298 if (!p_attr
->handler (arg
))
11302 /* Either set or unset a boolean option. */
11303 case aarch64_attr_bool
:
11305 struct cl_decoded_option decoded
;
11307 generate_option (p_attr
->opt_num
, NULL
, !invert
,
11308 CL_TARGET
, &decoded
);
11309 aarch64_handle_option (&global_options
, &global_options_set
,
11310 &decoded
, input_location
);
11313 /* Set or unset a bit in the target_flags. aarch64_handle_option
11314 should know what mask to apply given the option number. */
11315 case aarch64_attr_mask
:
11317 struct cl_decoded_option decoded
;
11318 /* We only need to specify the option number.
11319 aarch64_handle_option will know which mask to apply. */
11320 decoded
.opt_index
= p_attr
->opt_num
;
11321 decoded
.value
= !invert
;
11322 aarch64_handle_option (&global_options
, &global_options_set
,
11323 &decoded
, input_location
);
11326 /* Use the option setting machinery to set an option to an enum. */
11327 case aarch64_attr_enum
:
11332 valid
= opt_enum_arg_to_value (p_attr
->opt_num
, arg
,
11333 &value
, CL_TARGET
);
11336 set_option (&global_options
, NULL
, p_attr
->opt_num
, value
,
11337 NULL
, DK_UNSPECIFIED
, input_location
,
11342 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check
, arg
);
11347 gcc_unreachable ();
11351 /* If we reached here we either have found an attribute and validated
11352 it or didn't match any. If we matched an attribute but its arguments
11353 were malformed we will have returned false already. */
/* Count how many times the character C appears in
   NULL-terminated string STR.  (Name keeps the historical spelling
   "occurences" since callers reference it.)  */

static unsigned int
num_occurences_in_str (char c, char *str)
{
  unsigned int res = 0;
  while (*str != '\0')
    {
      if (*str == c)
	res++;

      str++;
    }

  return res;
}
11375 /* Parse the tree in ARGS that contains the target attribute information
11376 and update the global target options space. */
11379 aarch64_process_target_attr (tree args
)
11381 if (TREE_CODE (args
) == TREE_LIST
)
11385 tree head
= TREE_VALUE (args
);
11388 if (!aarch64_process_target_attr (head
))
11391 args
= TREE_CHAIN (args
);
11397 if (TREE_CODE (args
) != STRING_CST
)
11399 error ("attribute %<target%> argument not a string");
11403 size_t len
= strlen (TREE_STRING_POINTER (args
));
11404 char *str_to_check
= (char *) alloca (len
+ 1);
11405 strcpy (str_to_check
, TREE_STRING_POINTER (args
));
11409 error ("malformed %<target()%> pragma or attribute");
11413 /* Used to catch empty spaces between commas i.e.
11414 attribute ((target ("attr1,,attr2"))). */
11415 unsigned int num_commas
= num_occurences_in_str (',', str_to_check
);
11417 /* Handle multiple target attributes separated by ','. */
11418 char *token
= strtok (str_to_check
, ",");
11420 unsigned int num_attrs
= 0;
11424 if (!aarch64_process_one_target_attr (token
))
11426 error ("pragma or attribute %<target(\"%s\")%> is not valid", token
);
11430 token
= strtok (NULL
, ",");
11433 if (num_attrs
!= num_commas
+ 1)
11435 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args
));
11442 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
11443 process attribute ((target ("..."))). */
11446 aarch64_option_valid_attribute_p (tree fndecl
, tree
, tree args
, int)
11448 struct cl_target_option cur_target
;
11451 tree new_target
, new_optimize
;
11452 tree existing_target
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
11454 /* If what we're processing is the current pragma string then the
11455 target option node is already stored in target_option_current_node
11456 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
11457 having to re-parse the string. This is especially useful to keep
11458 arm_neon.h compile times down since that header contains a lot
11459 of intrinsics enclosed in pragmas. */
11460 if (!existing_target
&& args
== current_target_pragma
)
11462 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = target_option_current_node
;
11465 tree func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
11467 old_optimize
= build_optimization_node (&global_options
);
11468 func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
11470 /* If the function changed the optimization levels as well as setting
11471 target options, start with the optimizations specified. */
11472 if (func_optimize
&& func_optimize
!= old_optimize
)
11473 cl_optimization_restore (&global_options
,
11474 TREE_OPTIMIZATION (func_optimize
));
11476 /* Save the current target options to restore at the end. */
11477 cl_target_option_save (&cur_target
, &global_options
);
11479 /* If fndecl already has some target attributes applied to it, unpack
11480 them so that we add this attribute on top of them, rather than
11481 overwriting them. */
11482 if (existing_target
)
11484 struct cl_target_option
*existing_options
11485 = TREE_TARGET_OPTION (existing_target
);
11487 if (existing_options
)
11488 cl_target_option_restore (&global_options
, existing_options
);
11491 cl_target_option_restore (&global_options
,
11492 TREE_TARGET_OPTION (target_option_current_node
));
11494 ret
= aarch64_process_target_attr (args
);
11496 /* Set up any additional state. */
11499 aarch64_override_options_internal (&global_options
);
11500 /* Initialize SIMD builtins if we haven't already.
11501 Set current_target_pragma to NULL for the duration so that
11502 the builtin initialization code doesn't try to tag the functions
11503 being built with the attributes specified by any current pragma, thus
11504 going into an infinite recursion. */
11507 tree saved_current_target_pragma
= current_target_pragma
;
11508 current_target_pragma
= NULL
;
11509 aarch64_init_simd_builtins ();
11510 current_target_pragma
= saved_current_target_pragma
;
11512 new_target
= build_target_option_node (&global_options
);
11517 new_optimize
= build_optimization_node (&global_options
);
11521 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = new_target
;
11523 if (old_optimize
!= new_optimize
)
11524 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
) = new_optimize
;
11527 cl_target_option_restore (&global_options
, &cur_target
);
11529 if (old_optimize
!= new_optimize
)
11530 cl_optimization_restore (&global_options
,
11531 TREE_OPTIMIZATION (old_optimize
));
/* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
   tri-bool options (yes, no, don't care) and the default value is
   DEF, determine whether to reject inlining.  Returns true when inlining
   is acceptable.  */

static bool
aarch64_tribools_ok_for_inlining_p (int caller, int callee,
				    int dont_care, int def)
{
  /* If the callee doesn't care, always allow inlining.  */
  if (callee == dont_care)
    return true;

  /* If the caller doesn't care, always allow inlining.  */
  if (caller == dont_care)
    return true;

  /* Otherwise, allow inlining if either the callee and caller values
     agree, or if the callee is using the default value.  */
  return (callee == caller || callee == def);
}
11556 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
11557 to inline CALLEE into CALLER based on target-specific info.
11558 Make sure that the caller and callee have compatible architectural
11559 features. Then go through the other possible target attributes
11560 and see if they can block inlining. Try not to reject always_inline
11561 callees unless they are incompatible architecturally. */
11564 aarch64_can_inline_p (tree caller
, tree callee
)
11566 tree caller_tree
= DECL_FUNCTION_SPECIFIC_TARGET (caller
);
11567 tree callee_tree
= DECL_FUNCTION_SPECIFIC_TARGET (callee
);
11569 struct cl_target_option
*caller_opts
11570 = TREE_TARGET_OPTION (caller_tree
? caller_tree
11571 : target_option_default_node
);
11573 struct cl_target_option
*callee_opts
11574 = TREE_TARGET_OPTION (callee_tree
? callee_tree
11575 : target_option_default_node
);
11577 /* Callee's ISA flags should be a subset of the caller's. */
11578 if ((caller_opts
->x_aarch64_isa_flags
& callee_opts
->x_aarch64_isa_flags
)
11579 != callee_opts
->x_aarch64_isa_flags
)
11582 /* Allow non-strict aligned functions inlining into strict
11584 if ((TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)
11585 != TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
))
11586 && !(!TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
)
11587 && TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)))
11590 bool always_inline
= lookup_attribute ("always_inline",
11591 DECL_ATTRIBUTES (callee
));
11593 /* If the architectural features match up and the callee is always_inline
11594 then the other attributes don't matter. */
11598 if (caller_opts
->x_aarch64_cmodel_var
11599 != callee_opts
->x_aarch64_cmodel_var
)
11602 if (caller_opts
->x_aarch64_tls_dialect
11603 != callee_opts
->x_aarch64_tls_dialect
)
11606 /* Honour explicit requests to workaround errata. */
11607 if (!aarch64_tribools_ok_for_inlining_p (
11608 caller_opts
->x_aarch64_fix_a53_err835769
,
11609 callee_opts
->x_aarch64_fix_a53_err835769
,
11610 2, TARGET_FIX_ERR_A53_835769_DEFAULT
))
11613 if (!aarch64_tribools_ok_for_inlining_p (
11614 caller_opts
->x_aarch64_fix_a53_err843419
,
11615 callee_opts
->x_aarch64_fix_a53_err843419
,
11616 2, TARGET_FIX_ERR_A53_843419
))
11619 /* If the user explicitly specified -momit-leaf-frame-pointer for the
11620 caller and calle and they don't match up, reject inlining. */
11621 if (!aarch64_tribools_ok_for_inlining_p (
11622 caller_opts
->x_flag_omit_leaf_frame_pointer
,
11623 callee_opts
->x_flag_omit_leaf_frame_pointer
,
11627 /* If the callee has specific tuning overrides, respect them. */
11628 if (callee_opts
->x_aarch64_override_tune_string
!= NULL
11629 && caller_opts
->x_aarch64_override_tune_string
== NULL
)
11632 /* If the user specified tuning override strings for the
11633 caller and callee and they don't match up, reject inlining.
11634 We just do a string compare here, we don't analyze the meaning
11635 of the string, as it would be too costly for little gain. */
11636 if (callee_opts
->x_aarch64_override_tune_string
11637 && caller_opts
->x_aarch64_override_tune_string
11638 && (strcmp (callee_opts
->x_aarch64_override_tune_string
,
11639 caller_opts
->x_aarch64_override_tune_string
) != 0))
11645 /* Return true if SYMBOL_REF X binds locally. */
11648 aarch64_symbol_binds_local_p (const_rtx x
)
11650 return (SYMBOL_REF_DECL (x
)
11651 ? targetm
.binds_local_p (SYMBOL_REF_DECL (x
))
11652 : SYMBOL_REF_LOCAL_P (x
));
11655 /* Return true if SYMBOL_REF X is thread local */
11657 aarch64_tls_symbol_p (rtx x
)
11659 if (! TARGET_HAVE_TLS
)
11662 if (GET_CODE (x
) != SYMBOL_REF
)
11665 return SYMBOL_REF_TLS_MODEL (x
) != 0;
11668 /* Classify a TLS symbol into one of the TLS kinds. */
11669 enum aarch64_symbol_type
11670 aarch64_classify_tls_symbol (rtx x
)
11672 enum tls_model tls_kind
= tls_symbolic_operand_type (x
);
11676 case TLS_MODEL_GLOBAL_DYNAMIC
:
11677 case TLS_MODEL_LOCAL_DYNAMIC
:
11678 return TARGET_TLS_DESC
? SYMBOL_SMALL_TLSDESC
: SYMBOL_SMALL_TLSGD
;
11680 case TLS_MODEL_INITIAL_EXEC
:
11681 switch (aarch64_cmodel
)
11683 case AARCH64_CMODEL_TINY
:
11684 case AARCH64_CMODEL_TINY_PIC
:
11685 return SYMBOL_TINY_TLSIE
;
11687 return SYMBOL_SMALL_TLSIE
;
11690 case TLS_MODEL_LOCAL_EXEC
:
11691 if (aarch64_tls_size
== 12)
11692 return SYMBOL_TLSLE12
;
11693 else if (aarch64_tls_size
== 24)
11694 return SYMBOL_TLSLE24
;
11695 else if (aarch64_tls_size
== 32)
11696 return SYMBOL_TLSLE32
;
11697 else if (aarch64_tls_size
== 48)
11698 return SYMBOL_TLSLE48
;
11700 gcc_unreachable ();
11702 case TLS_MODEL_EMULATED
:
11703 case TLS_MODEL_NONE
:
11704 return SYMBOL_FORCE_TO_MEM
;
11707 gcc_unreachable ();
11711 /* Return the correct method for accessing X + OFFSET, where X is either
11712 a SYMBOL_REF or LABEL_REF. */
11714 enum aarch64_symbol_type
11715 aarch64_classify_symbol (rtx x
, HOST_WIDE_INT offset
)
11717 if (GET_CODE (x
) == LABEL_REF
)
11719 switch (aarch64_cmodel
)
11721 case AARCH64_CMODEL_LARGE
:
11722 return SYMBOL_FORCE_TO_MEM
;
11724 case AARCH64_CMODEL_TINY_PIC
:
11725 case AARCH64_CMODEL_TINY
:
11726 return SYMBOL_TINY_ABSOLUTE
;
11728 case AARCH64_CMODEL_SMALL_SPIC
:
11729 case AARCH64_CMODEL_SMALL_PIC
:
11730 case AARCH64_CMODEL_SMALL
:
11731 return SYMBOL_SMALL_ABSOLUTE
;
11734 gcc_unreachable ();
11738 if (GET_CODE (x
) == SYMBOL_REF
)
11740 if (aarch64_tls_symbol_p (x
))
11741 return aarch64_classify_tls_symbol (x
);
11743 switch (aarch64_cmodel
)
11745 case AARCH64_CMODEL_TINY
:
11746 /* When we retrieve symbol + offset address, we have to make sure
11747 the offset does not cause overflow of the final address. But
11748 we have no way of knowing the address of symbol at compile time
11749 so we can't accurately say if the distance between the PC and
11750 symbol + offset is outside the addressible range of +/-1M in the
11751 TINY code model. So we rely on images not being greater than
11752 1M and cap the offset at 1M and anything beyond 1M will have to
11753 be loaded using an alternative mechanism. Furthermore if the
11754 symbol is a weak reference to something that isn't known to
11755 resolve to a symbol in this module, then force to memory. */
11756 if ((SYMBOL_REF_WEAK (x
)
11757 && !aarch64_symbol_binds_local_p (x
))
11758 || !IN_RANGE (offset
, -1048575, 1048575))
11759 return SYMBOL_FORCE_TO_MEM
;
11760 return SYMBOL_TINY_ABSOLUTE
;
11762 case AARCH64_CMODEL_SMALL
:
11763 /* Same reasoning as the tiny code model, but the offset cap here is
11765 if ((SYMBOL_REF_WEAK (x
)
11766 && !aarch64_symbol_binds_local_p (x
))
11767 || !IN_RANGE (offset
, HOST_WIDE_INT_C (-4294967263),
11768 HOST_WIDE_INT_C (4294967264)))
11769 return SYMBOL_FORCE_TO_MEM
;
11770 return SYMBOL_SMALL_ABSOLUTE
;
11772 case AARCH64_CMODEL_TINY_PIC
:
11773 if (!aarch64_symbol_binds_local_p (x
))
11774 return SYMBOL_TINY_GOT
;
11775 return SYMBOL_TINY_ABSOLUTE
;
11777 case AARCH64_CMODEL_SMALL_SPIC
:
11778 case AARCH64_CMODEL_SMALL_PIC
:
11779 if (!aarch64_symbol_binds_local_p (x
))
11780 return (aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
11781 ? SYMBOL_SMALL_GOT_28K
: SYMBOL_SMALL_GOT_4G
);
11782 return SYMBOL_SMALL_ABSOLUTE
;
11784 case AARCH64_CMODEL_LARGE
:
11785 /* This is alright even in PIC code as the constant
11786 pool reference is always PC relative and within
11787 the same translation unit. */
11788 if (!aarch64_pcrelative_literal_loads
&& CONSTANT_POOL_ADDRESS_P (x
))
11789 return SYMBOL_SMALL_ABSOLUTE
;
11791 return SYMBOL_FORCE_TO_MEM
;
11794 gcc_unreachable ();
11798 /* By default push everything into the constant pool. */
11799 return SYMBOL_FORCE_TO_MEM
;
11803 aarch64_constant_address_p (rtx x
)
11805 return (CONSTANT_P (x
) && memory_address_p (DImode
, x
));
11809 aarch64_legitimate_pic_operand_p (rtx x
)
11811 if (GET_CODE (x
) == SYMBOL_REF
11812 || (GET_CODE (x
) == CONST
11813 && GET_CODE (XEXP (x
, 0)) == PLUS
11814 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
))
11820 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
11821 that should be rematerialized rather than spilled. */
11824 aarch64_legitimate_constant_p (machine_mode mode
, rtx x
)
11826 /* Support CSE and rematerialization of common constants. */
11827 if (CONST_INT_P (x
)
11828 || (CONST_DOUBLE_P (x
) && GET_MODE_CLASS (mode
) == MODE_FLOAT
)
11829 || GET_CODE (x
) == CONST_VECTOR
)
11832 /* Do not allow vector struct mode constants for Advanced SIMD.
11833 We could support 0 and -1 easily, but they need support in
11834 aarch64-simd.md. */
11835 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
11836 if (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
11839 /* Only accept variable-length vector constants if they can be
11842 ??? It would be possible to handle rematerialization of other
11843 constants via secondary reloads. */
11844 if (vec_flags
& VEC_ANY_SVE
)
11845 return aarch64_simd_valid_immediate (x
, NULL
);
11847 if (GET_CODE (x
) == HIGH
)
11850 /* Accept polynomial constants that can be calculated by using the
11851 destination of a move as the sole temporary. Constants that
11852 require a second temporary cannot be rematerialized (they can't be
11853 forced to memory and also aren't legitimate constants). */
11855 if (poly_int_rtx_p (x
, &offset
))
11856 return aarch64_offset_temporaries (false, offset
) <= 1;
11858 /* If an offset is being added to something else, we need to allow the
11859 base to be moved into the destination register, meaning that there
11860 are no free temporaries for the offset. */
11861 x
= strip_offset (x
, &offset
);
11862 if (!offset
.is_constant () && aarch64_offset_temporaries (true, offset
) > 0)
11865 /* Do not allow const (plus (anchor_symbol, const_int)). */
11866 if (maybe_ne (offset
, 0) && SYMBOL_REF_P (x
) && SYMBOL_REF_ANCHOR_P (x
))
11869 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
11870 so spilling them is better than rematerialization. */
11871 if (SYMBOL_REF_P (x
) && !SYMBOL_REF_TLS_MODEL (x
))
11874 /* Label references are always constant. */
11875 if (GET_CODE (x
) == LABEL_REF
)
11882 aarch64_load_tp (rtx target
)
11885 || GET_MODE (target
) != Pmode
11886 || !register_operand (target
, Pmode
))
11887 target
= gen_reg_rtx (Pmode
);
11889 /* Can return in any reg. */
11890 emit_insn (gen_aarch64_load_tp_hard (target
));
11894 /* On AAPCS systems, this is the "struct __va_list". */
11895 static GTY(()) tree va_list_type
;
/* Build the AAPCS64 va_list record with five fields — __stack, __gr_top,
   __vr_top (pointers) and __gr_offs, __vr_offs (integers) — names, chain
   order and layout mandated by AAPCS64 \S 7.1.4.  Returns the RECORD_TYPE.
   NOTE(review): extraction-garbled fragment — statements are split across
   lines and some original lines are missing.  Code left byte-identical.  */
11897 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
11898 Return the type to use as __builtin_va_list.
11900 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
11912 aarch64_build_builtin_va_list (void)
11915 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
11917 /* Create the type. */
11918 va_list_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
11919 /* Give it the required name. */
11920 va_list_name
= build_decl (BUILTINS_LOCATION
,
11922 get_identifier ("__va_list"),
11924 DECL_ARTIFICIAL (va_list_name
) = 1;
11925 TYPE_NAME (va_list_type
) = va_list_name
;
11926 TYPE_STUB_DECL (va_list_type
) = va_list_name
;
11928 /* Create the fields. */
11929 f_stack
= build_decl (BUILTINS_LOCATION
,
11930 FIELD_DECL
, get_identifier ("__stack"),
11932 f_grtop
= build_decl (BUILTINS_LOCATION
,
11933 FIELD_DECL
, get_identifier ("__gr_top"),
11935 f_vrtop
= build_decl (BUILTINS_LOCATION
,
11936 FIELD_DECL
, get_identifier ("__vr_top"),
11938 f_groff
= build_decl (BUILTINS_LOCATION
,
11939 FIELD_DECL
, get_identifier ("__gr_offs"),
11940 integer_type_node
);
11941 f_vroff
= build_decl (BUILTINS_LOCATION
,
11942 FIELD_DECL
, get_identifier ("__vr_offs"),
11943 integer_type_node
);
11945 /* Tell tree-stdarg pass about our internal offset fields.
11946 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
11947 purpose to identify whether the code is updating va_list internal
11948 offset fields through irregular way. */
11949 va_list_gpr_counter_field
= f_groff
;
11950 va_list_fpr_counter_field
= f_vroff
;
11952 DECL_ARTIFICIAL (f_stack
) = 1;
11953 DECL_ARTIFICIAL (f_grtop
) = 1;
11954 DECL_ARTIFICIAL (f_vrtop
) = 1;
11955 DECL_ARTIFICIAL (f_groff
) = 1;
11956 DECL_ARTIFICIAL (f_vroff
) = 1;
11958 DECL_FIELD_CONTEXT (f_stack
) = va_list_type
;
11959 DECL_FIELD_CONTEXT (f_grtop
) = va_list_type
;
11960 DECL_FIELD_CONTEXT (f_vrtop
) = va_list_type
;
11961 DECL_FIELD_CONTEXT (f_groff
) = va_list_type
;
11962 DECL_FIELD_CONTEXT (f_vroff
) = va_list_type
;
/* Chain the fields in ABI order: __stack -> __gr_top -> __vr_top ->
   __gr_offs -> __vr_offs.  */
11964 TYPE_FIELDS (va_list_type
) = f_stack
;
11965 DECL_CHAIN (f_stack
) = f_grtop
;
11966 DECL_CHAIN (f_grtop
) = f_vrtop
;
11967 DECL_CHAIN (f_vrtop
) = f_groff
;
11968 DECL_CHAIN (f_groff
) = f_vroff
;
11970 /* Compute its layout. */
11971 layout_type (va_list_type
);
11973 return va_list_type
;
/* Expand va_start: initialize the five __va_list fields of VALIST from the
   incoming-argument state in crtl->args.info.  __stack points past the
   named stack args; __gr_top/__vr_top are the tops of the GP/FP register
   save areas; __gr_offs/__vr_offs are the (negative) byte offsets of the
   next unconsumed saved register.  Save-area sizes are clamped by the
   tree-stdarg estimates in cfun->va_list_{gpr,fpr}_size.
   NOTE(review): extraction-garbled fragment — statements are split across
   lines and some original lines are missing.  Code left byte-identical.  */
11976 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
11978 aarch64_expand_builtin_va_start (tree valist
, rtx nextarg ATTRIBUTE_UNUSED
)
11980 const CUMULATIVE_ARGS
*cum
;
11981 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
11982 tree stack
, grtop
, vrtop
, groff
, vroff
;
11984 int gr_save_area_size
= cfun
->va_list_gpr_size
;
11985 int vr_save_area_size
= cfun
->va_list_fpr_size
;
11988 cum
= &crtl
->args
.info
;
11989 if (cfun
->va_list_gpr_size
)
11990 gr_save_area_size
= MIN ((NUM_ARG_REGS
- cum
->aapcs_ncrn
) * UNITS_PER_WORD
,
11991 cfun
->va_list_gpr_size
);
11992 if (cfun
->va_list_fpr_size
)
11993 vr_save_area_size
= MIN ((NUM_FP_ARG_REGS
- cum
->aapcs_nvrn
)
11994 * UNITS_PER_VREG
, cfun
->va_list_fpr_size
);
11998 gcc_assert (cum
->aapcs_nvrn
== 0);
11999 vr_save_area_size
= 0;
/* Walk the __va_list fields in declaration order.  */
12002 f_stack
= TYPE_FIELDS (va_list_type_node
);
12003 f_grtop
= DECL_CHAIN (f_stack
);
12004 f_vrtop
= DECL_CHAIN (f_grtop
);
12005 f_groff
= DECL_CHAIN (f_vrtop
);
12006 f_vroff
= DECL_CHAIN (f_groff
);
12008 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), valist
, f_stack
,
12010 grtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
), valist
, f_grtop
,
12012 vrtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
), valist
, f_vrtop
,
12014 groff
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
), valist
, f_groff
,
12016 vroff
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
), valist
, f_vroff
,
12019 /* Emit code to initialize STACK, which points to the next varargs stack
12020 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
12021 by named arguments. STACK is 8-byte aligned. */
12022 t
= make_tree (TREE_TYPE (stack
), virtual_incoming_args_rtx
);
12023 if (cum
->aapcs_stack_size
> 0)
12024 t
= fold_build_pointer_plus_hwi (t
, cum
->aapcs_stack_size
* UNITS_PER_WORD
);
12025 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), stack
, t
);
12026 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
12028 /* Emit code to initialize GRTOP, the top of the GR save area.
12029 virtual_incoming_args_rtx should have been 16 byte aligned. */
12030 t
= make_tree (TREE_TYPE (grtop
), virtual_incoming_args_rtx
);
12031 t
= build2 (MODIFY_EXPR
, TREE_TYPE (grtop
), grtop
, t
);
12032 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
12034 /* Emit code to initialize VRTOP, the top of the VR save area.
12035 This address is gr_save_area_bytes below GRTOP, rounded
12036 down to the next 16-byte boundary. */
12037 t
= make_tree (TREE_TYPE (vrtop
), virtual_incoming_args_rtx
);
12038 vr_offset
= ROUND_UP (gr_save_area_size
,
12039 STACK_BOUNDARY
/ BITS_PER_UNIT
);
12042 t
= fold_build_pointer_plus_hwi (t
, -vr_offset
);
12043 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vrtop
), vrtop
, t
);
12044 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
12046 /* Emit code to initialize GROFF, the offset from GRTOP of the
12047 next GPR argument. */
12048 t
= build2 (MODIFY_EXPR
, TREE_TYPE (groff
), groff
,
12049 build_int_cst (TREE_TYPE (groff
), -gr_save_area_size
));
12050 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
12052 /* Likewise emit code to initialize VROFF, the offset from FTOP
12053 of the next VR argument. */
12054 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vroff
), vroff
,
12055 build_int_cst (TREE_TYPE (vroff
), -vr_save_area_size
));
12056 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
/* Gimplify a va_arg expression for AAPCS64.  Builds a two-level COND_EXPR:
   cond1 tests __[g|v]r_offs >= 0 (register area already exhausted ->
   take the argument from the __stack overflow area); otherwise the offset
   is bumped and cond2 re-tests whether it went positive (this argument
   itself overflowed -> stack again), else the argument is loaded from the
   relevant register save area.  HFA/HVA arguments are reassembled into a
   local temporary "ha" element by element, since their fields sit one
   vector register (UNITS_PER_VREG) apart in the save area.  Big-endian
   sub-register-size values need an extra address adjustment.
   NOTE(review): extraction-garbled fragment — statements are split across
   lines and many original lines (declarations, braces, several branches)
   are missing; left byte-identical rather than risk a behavior change.  */
12059 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
12062 aarch64_gimplify_va_arg_expr (tree valist
, tree type
, gimple_seq
*pre_p
,
12063 gimple_seq
*post_p ATTRIBUTE_UNUSED
)
12067 bool is_ha
; /* is HFA or HVA. */
12068 bool dw_align
; /* double-word align. */
12069 machine_mode ag_mode
= VOIDmode
;
12073 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
12074 tree stack
, f_top
, f_off
, off
, arg
, roundup
, on_stack
;
12075 HOST_WIDE_INT size
, rsize
, adjust
, align
;
12076 tree t
, u
, cond1
, cond2
;
/* Arguments passed by reference are accessed through a pointer.  */
12078 indirect_p
= pass_by_reference (NULL
, TYPE_MODE (type
), type
, false);
12080 type
= build_pointer_type (type
);
12082 mode
= TYPE_MODE (type
);
12084 f_stack
= TYPE_FIELDS (va_list_type_node
);
12085 f_grtop
= DECL_CHAIN (f_stack
);
12086 f_vrtop
= DECL_CHAIN (f_grtop
);
12087 f_groff
= DECL_CHAIN (f_vrtop
);
12088 f_vroff
= DECL_CHAIN (f_groff
);
12090 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), unshare_expr (valist
),
12091 f_stack
, NULL_TREE
);
12092 size
= int_size_in_bytes (type
);
12093 align
= aarch64_function_arg_alignment (mode
, type
) / BITS_PER_UNIT
;
12097 if (aarch64_vfp_is_call_or_return_candidate (mode
,
12103 /* No frontends can create types with variable-sized modes, so we
12104 shouldn't be asked to pass or return them. */
12105 unsigned int ag_size
= GET_MODE_SIZE (ag_mode
).to_constant ();
12107 /* TYPE passed in fp/simd registers. */
12109 aarch64_err_no_fpadvsimd (mode
);
12111 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
),
12112 unshare_expr (valist
), f_vrtop
, NULL_TREE
);
12113 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
),
12114 unshare_expr (valist
), f_vroff
, NULL_TREE
);
12116 rsize
= nregs
* UNITS_PER_VREG
;
12120 if (BYTES_BIG_ENDIAN
&& ag_size
< UNITS_PER_VREG
)
12121 adjust
= UNITS_PER_VREG
- ag_size
;
12123 else if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
12124 && size
< UNITS_PER_VREG
)
12126 adjust
= UNITS_PER_VREG
- size
;
12131 /* TYPE passed in general registers. */
12132 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
),
12133 unshare_expr (valist
), f_grtop
, NULL_TREE
);
12134 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
),
12135 unshare_expr (valist
), f_groff
, NULL_TREE
);
12136 rsize
= ROUND_UP (size
, UNITS_PER_WORD
);
12137 nregs
= rsize
/ UNITS_PER_WORD
;
12142 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
12143 && size
< UNITS_PER_WORD
)
12145 adjust
= UNITS_PER_WORD
- size
;
12149 /* Get a local temporary for the field value. */
12150 off
= get_initialized_tmp_var (f_off
, pre_p
, NULL
);
12152 /* Emit code to branch if off >= 0. */
12153 t
= build2 (GE_EXPR
, boolean_type_node
, off
,
12154 build_int_cst (TREE_TYPE (off
), 0));
12155 cond1
= build3 (COND_EXPR
, ptr_type_node
, t
, NULL_TREE
, NULL_TREE
);
12159 /* Emit: offs = (offs + 15) & -16. */
12160 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
12161 build_int_cst (TREE_TYPE (off
), 15));
12162 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (off
), t
,
12163 build_int_cst (TREE_TYPE (off
), -16));
12164 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (off
), off
, t
);
12169 /* Update ap.__[g|v]r_offs */
12170 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
12171 build_int_cst (TREE_TYPE (off
), rsize
));
12172 t
= build2 (MODIFY_EXPR
, TREE_TYPE (f_off
), unshare_expr (f_off
), t
);
12176 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
12178 /* [cond2] if (ap.__[g|v]r_offs > 0) */
12179 u
= build2 (GT_EXPR
, boolean_type_node
, unshare_expr (f_off
),
12180 build_int_cst (TREE_TYPE (f_off
), 0));
12181 cond2
= build3 (COND_EXPR
, ptr_type_node
, u
, NULL_TREE
, NULL_TREE
);
12183 /* String up: make sure the assignment happens before the use. */
12184 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (cond2
), t
, cond2
);
12185 COND_EXPR_ELSE (cond1
) = t
;
12187 /* Prepare the trees handling the argument that is passed on the stack;
12188 the top level node will store in ON_STACK. */
12189 arg
= get_initialized_tmp_var (stack
, pre_p
, NULL
);
12192 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
12193 t
= fold_build_pointer_plus_hwi (arg
, 15);
12194 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
12195 build_int_cst (TREE_TYPE (t
), -16));
12196 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (arg
), arg
, t
);
12200 /* Advance ap.__stack */
12201 t
= fold_build_pointer_plus_hwi (arg
, size
+ 7);
12202 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
12203 build_int_cst (TREE_TYPE (t
), -8));
12204 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), unshare_expr (stack
), t
);
12205 /* String up roundup and advance. */
12207 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
12208 /* String up with arg */
12209 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), t
, arg
);
12210 /* Big-endianness related address adjustment. */
12211 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
12212 && size
< UNITS_PER_WORD
)
12214 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (arg
), arg
,
12215 size_int (UNITS_PER_WORD
- size
));
12216 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), on_stack
, t
);
/* Both "register area exhausted" paths fall back to the stack code.  */
12219 COND_EXPR_THEN (cond1
) = unshare_expr (on_stack
);
12220 COND_EXPR_THEN (cond2
) = unshare_expr (on_stack
);
12222 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
12225 t
= build2 (PREINCREMENT_EXPR
, TREE_TYPE (off
), off
,
12226 build_int_cst (TREE_TYPE (off
), adjust
));
12228 t
= fold_convert (sizetype
, t
);
12229 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (f_top
), f_top
, t
);
12233 /* type ha; // treat as "struct {ftype field[n];}"
12234 ... [computing offs]
12235 for (i = 0; i <nregs; ++i, offs += 16)
12236 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12239 tree tmp_ha
, field_t
, field_ptr_t
;
12241 /* Declare a local variable. */
12242 tmp_ha
= create_tmp_var_raw (type
, "ha");
12243 gimple_add_tmp_var (tmp_ha
);
12245 /* Establish the base type. */
12249 field_t
= float_type_node
;
12250 field_ptr_t
= float_ptr_type_node
;
12253 field_t
= double_type_node
;
12254 field_ptr_t
= double_ptr_type_node
;
12257 field_t
= long_double_type_node
;
12258 field_ptr_t
= long_double_ptr_type_node
;
12261 field_t
= aarch64_fp16_type_node
;
12262 field_ptr_t
= aarch64_fp16_ptr_type_node
;
12267 tree innertype
= make_signed_type (GET_MODE_PRECISION (SImode
));
12268 field_t
= build_vector_type_for_mode (innertype
, ag_mode
);
12269 field_ptr_t
= build_pointer_type (field_t
);
12276 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
12277 tmp_ha
= build1 (ADDR_EXPR
, field_ptr_t
, tmp_ha
);
12279 t
= fold_convert (field_ptr_t
, addr
);
12280 t
= build2 (MODIFY_EXPR
, field_t
,
12281 build1 (INDIRECT_REF
, field_t
, tmp_ha
),
12282 build1 (INDIRECT_REF
, field_t
, t
));
12284 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
12285 for (i
= 1; i
< nregs
; ++i
)
12287 addr
= fold_build_pointer_plus_hwi (addr
, UNITS_PER_VREG
);
12288 u
= fold_convert (field_ptr_t
, addr
);
12289 u
= build2 (MODIFY_EXPR
, field_t
,
12290 build2 (MEM_REF
, field_t
, tmp_ha
,
12291 build_int_cst (field_ptr_t
,
12293 int_size_in_bytes (field_t
)))),
12294 build1 (INDIRECT_REF
, field_t
, u
));
12295 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), t
, u
);
12298 u
= fold_convert (TREE_TYPE (f_top
), tmp_ha
);
12299 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (f_top
), t
, u
);
12302 COND_EXPR_ELSE (cond2
) = t
;
12303 addr
= fold_convert (build_pointer_type (type
), cond1
);
12304 addr
= build_va_arg_indirect_ref (addr
);
/* Pass-by-reference needs one more dereference of the fetched pointer.  */
12307 addr
= build_va_arg_indirect_ref (addr
);
/* Save the anonymous (unnamed) argument registers of a varargs function to
   the stack: GP regs as one block just below virtual_incoming_args_rtx,
   vector regs (stored TImode-wide, one at a time) in a 16-byte-aligned
   area below that.  Records the total save-area size in
   cfun->machine->frame.saved_varargs_size for the prologue.
   NOTE(review): extraction-garbled fragment — statements are split across
   lines and some original lines are missing.  Code left byte-identical.  */
12312 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
12315 aarch64_setup_incoming_varargs (cumulative_args_t cum_v
, machine_mode mode
,
12316 tree type
, int *pretend_size ATTRIBUTE_UNUSED
,
12319 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
12320 CUMULATIVE_ARGS local_cum
;
12321 int gr_saved
= cfun
->va_list_gpr_size
;
12322 int vr_saved
= cfun
->va_list_fpr_size
;
12324 /* The caller has advanced CUM up to, but not beyond, the last named
12325 argument. Advance a local copy of CUM past the last "real" named
12326 argument, to find out how many registers are left over. */
12328 aarch64_function_arg_advance (pack_cumulative_args(&local_cum
), mode
, type
, true);
12330 /* Find out how many registers we need to save.
12331 Honor tree-stdvar analysis results. */
12332 if (cfun
->va_list_gpr_size
)
12333 gr_saved
= MIN (NUM_ARG_REGS
- local_cum
.aapcs_ncrn
,
12334 cfun
->va_list_gpr_size
/ UNITS_PER_WORD
);
12335 if (cfun
->va_list_fpr_size
)
12336 vr_saved
= MIN (NUM_FP_ARG_REGS
- local_cum
.aapcs_nvrn
,
12337 cfun
->va_list_fpr_size
/ UNITS_PER_VREG
);
12341 gcc_assert (local_cum
.aapcs_nvrn
== 0);
12351 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
12352 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
,
12353 - gr_saved
* UNITS_PER_WORD
);
12354 mem
= gen_frame_mem (BLKmode
, ptr
);
12355 set_mem_alias_set (mem
, get_varargs_alias_set ());
12357 move_block_from_reg (local_cum
.aapcs_ncrn
+ R0_REGNUM
,
12362 /* We can't use move_block_from_reg, because it will use
12363 the wrong mode, storing D regs only. */
12364 machine_mode mode
= TImode
;
12365 int off
, i
, vr_start
;
12367 /* Set OFF to the offset from virtual_incoming_args_rtx of
12368 the first vector register. The VR save area lies below
12369 the GR one, and is aligned to 16 bytes. */
12370 off
= -ROUND_UP (gr_saved
* UNITS_PER_WORD
,
12371 STACK_BOUNDARY
/ BITS_PER_UNIT
);
12372 off
-= vr_saved
* UNITS_PER_VREG
;
12374 vr_start
= V0_REGNUM
+ local_cum
.aapcs_nvrn
;
12375 for (i
= 0; i
< vr_saved
; ++i
)
12379 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
, off
);
12380 mem
= gen_frame_mem (mode
, ptr
);
12381 set_mem_alias_set (mem
, get_varargs_alias_set ());
12382 aarch64_emit_move (mem
, gen_rtx_REG (mode
, vr_start
+ i
));
12383 off
+= UNITS_PER_VREG
;
12388 /* We don't save the size into *PRETEND_SIZE because we want to avoid
12389 any complication of having crtl->args.pretend_args_size changed. */
12390 cfun
->machine
->frame
.saved_varargs_size
12391 = (ROUND_UP (gr_saved
* UNITS_PER_WORD
,
12392 STACK_BOUNDARY
/ BITS_PER_UNIT
)
12393 + vr_saved
* UNITS_PER_VREG
);
/* Adjust fixed/call-used register sets depending on the selected options:
   mark all V (and SVE P) registers call-used in the relevant configuration,
   and reserve the speculation-tracking registers when
   -mtrack-speculation is enabled.
   NOTE(review): extraction-garbled fragment — the conditions guarding the
   two loops are among the missing lines.  Code left byte-identical.  */
12397 aarch64_conditional_register_usage (void)
12402 for (i
= V0_REGNUM
; i
<= V31_REGNUM
; i
++)
12405 call_used_regs
[i
] = 1;
12409 for (i
= P0_REGNUM
; i
<= P15_REGNUM
; i
++)
12412 call_used_regs
[i
] = 1;
12415 /* When tracking speculation, we need a couple of call-clobbered registers
12416 to track the speculation state. It would be nice to just use
12417 IP0 and IP1, but currently there are numerous places that just
12418 assume these registers are free for other uses (eg pointer
12419 authentication). */
12420 if (aarch64_track_speculation
)
12422 fixed_regs
[SPECULATION_TRACKER_REGNUM
] = 1;
12423 call_used_regs
[SPECULATION_TRACKER_REGNUM
] = 1;
12424 fixed_regs
[SPECULATION_SCRATCH_REGNUM
] = 1;
12425 call_used_regs
[SPECULATION_SCRATCH_REGNUM
] = 1;
/* Recursive helper for homogeneous-aggregate (HFA/HVA) detection: counts
   consecutive base elements of TYPE, where the base mode is latched into
   *MODEP on first sight and every further element must match it.  Arrays
   multiply the element count by the index range; records sum field counts;
   unions take the max.  Any mismatch, incomplete type, non-constant size
   or trailing padding yields -1.
   NOTE(review): extraction-garbled fragment — several case labels, the
   declarations of count/sub_count/field and the return statements are
   among the missing lines.  Code left byte-identical.  */
12429 /* Walk down the type tree of TYPE counting consecutive base elements.
12430 If *MODEP is VOIDmode, then set it to the first valid floating point
12431 type. If a non-floating point type is found, or if a floating point
12432 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12433 otherwise return the count in the sub-tree. */
12435 aapcs_vfp_sub_candidate (const_tree type
, machine_mode
*modep
)
12438 HOST_WIDE_INT size
;
12440 switch (TREE_CODE (type
))
12443 mode
= TYPE_MODE (type
);
12444 if (mode
!= DFmode
&& mode
!= SFmode
12445 && mode
!= TFmode
&& mode
!= HFmode
)
12448 if (*modep
== VOIDmode
)
12451 if (*modep
== mode
)
12457 mode
= TYPE_MODE (TREE_TYPE (type
));
12458 if (mode
!= DFmode
&& mode
!= SFmode
12459 && mode
!= TFmode
&& mode
!= HFmode
)
12462 if (*modep
== VOIDmode
)
12465 if (*modep
== mode
)
12471 /* Use V2SImode and V4SImode as representatives of all 64-bit
12472 and 128-bit vector types. */
12473 size
= int_size_in_bytes (type
);
12486 if (*modep
== VOIDmode
)
12489 /* Vector modes are considered to be opaque: two vectors are
12490 equivalent for the purposes of being homogeneous aggregates
12491 if they are the same size. */
12492 if (*modep
== mode
)
12500 tree index
= TYPE_DOMAIN (type
);
12502 /* Can't handle incomplete types nor sizes that are not
12504 if (!COMPLETE_TYPE_P (type
)
12505 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
12508 count
= aapcs_vfp_sub_candidate (TREE_TYPE (type
), modep
);
12511 || !TYPE_MAX_VALUE (index
)
12512 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index
))
12513 || !TYPE_MIN_VALUE (index
)
12514 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index
))
12518 count
*= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index
))
12519 - tree_to_uhwi (TYPE_MIN_VALUE (index
)));
12521 /* There must be no padding. */
12522 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
12523 count
* GET_MODE_BITSIZE (*modep
)))
12535 /* Can't handle incomplete types nor sizes that are not
12537 if (!COMPLETE_TYPE_P (type
)
12538 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
12541 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
12543 if (TREE_CODE (field
) != FIELD_DECL
)
12546 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
12549 count
+= sub_count
;
12552 /* There must be no padding. */
12553 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
12554 count
* GET_MODE_BITSIZE (*modep
)))
12561 case QUAL_UNION_TYPE
:
12563 /* These aren't very interesting except in a degenerate case. */
12568 /* Can't handle incomplete types nor sizes that are not
12570 if (!COMPLETE_TYPE_P (type
)
12571 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
12574 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
12576 if (TREE_CODE (field
) != FIELD_DECL
)
12579 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
12582 count
= count
> sub_count
? count
: sub_count
;
12585 /* There must be no padding. */
12586 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
12587 count
* GET_MODE_BITSIZE (*modep
)))
/* Predicate: TYPE/MODE describes an AAPCS64 "short vector" — an 8- or
   16-byte vector.  The size comes from TYPE when it is a VECTOR_TYPE,
   otherwise from MODE when MODE is a vector mode.
   NOTE(review): extraction-garbled fragment; left byte-identical.  */
12600 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12601 type as described in AAPCS64 \S 4.1.2.
12603 See the comment above aarch64_composite_type_p for the notes on MODE. */
12606 aarch64_short_vector_p (const_tree type
,
12609 poly_int64 size
= -1;
12611 if (type
&& TREE_CODE (type
) == VECTOR_TYPE
)
12612 size
= int_size_in_bytes (type
);
12613 else if (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
12614 || GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
)
12615 size
= GET_MODE_SIZE (mode
);
12617 return known_eq (size
, 8) || known_eq (size
, 16);
/* Predicate: TYPE/MODE is an AAPCS64 composite type (aggregate, union,
   array, or complex), excluding short vectors.  Falls back to MODE checks
   (BLKmode or complex classes) when no TYPE is available.
   NOTE(review): extraction-garbled fragment — the return statements are
   among the missing lines.  Code left byte-identical.  */
12620 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12621 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
12622 array types. The C99 floating-point complex types are also considered
12623 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
12624 types, which are GCC extensions and out of the scope of AAPCS64, are
12625 treated as composite types here as well.
12627 Note that MODE itself is not sufficient in determining whether a type
12628 is such a composite type or not. This is because
12629 stor-layout.c:compute_record_mode may have already changed the MODE
12630 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
12631 structure with only one field may have its MODE set to the mode of the
12632 field. Also an integer mode whose size matches the size of the
12633 RECORD_TYPE type may be used to substitute the original mode
12634 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
12635 solely relied on. */
12638 aarch64_composite_type_p (const_tree type
,
12641 if (aarch64_short_vector_p (type
, mode
))
12644 if (type
&& (AGGREGATE_TYPE_P (type
) || TREE_CODE (type
) == COMPLEX_TYPE
))
12647 if (mode
== BLKmode
12648 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
12649 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_INT
)
/* Decide whether an argument/return value described by MODE/TYPE goes in
   SIMD/FP registers.  Scalar float and short-vector types qualify directly;
   complex float counts as a 2-element HA; composites are analyzed by
   aapcs_vfp_sub_candidate and qualify when the element count is in
   1..HA_MAX_NUM_FLDS.  On success the per-register mode is stored through
   *BASE_MODE (and *IS_HA set when the value is a homogeneous aggregate).
   NOTE(review): extraction-garbled fragment — the COUNT out-parameter
   assignments and returns are among the missing lines; left byte-identical.  */
12655 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12656 shall be passed or returned in simd/fp register(s) (providing these
12657 parameter passing registers are available).
12659 Upon successful return, *COUNT returns the number of needed registers,
12660 *BASE_MODE returns the mode of the individual register and when IS_HAF
12661 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
12662 floating-point aggregate or a homogeneous short-vector aggregate. */
12665 aarch64_vfp_is_call_or_return_candidate (machine_mode mode
,
12667 machine_mode
*base_mode
,
12671 machine_mode new_mode
= VOIDmode
;
12672 bool composite_p
= aarch64_composite_type_p (type
, mode
);
12674 if (is_ha
!= NULL
) *is_ha
= false;
12676 if ((!composite_p
&& GET_MODE_CLASS (mode
) == MODE_FLOAT
)
12677 || aarch64_short_vector_p (type
, mode
))
12682 else if (GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
)
12684 if (is_ha
!= NULL
) *is_ha
= true;
12686 new_mode
= GET_MODE_INNER (mode
);
12688 else if (type
&& composite_p
)
12690 int ag_count
= aapcs_vfp_sub_candidate (type
, &new_mode
);
12692 if (ag_count
> 0 && ag_count
<= HA_MAX_NUM_FLDS
)
12694 if (is_ha
!= NULL
) *is_ha
= true;
12703 *base_mode
= new_mode
;
/* Return the register used to pass the address of a returned aggregate
   (the AAPCS64 indirect-result register).  FNDECL/INCOMING are unused.  */
12707 /* Implement TARGET_STRUCT_VALUE_RTX. */
12710 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED
,
12711 int incoming ATTRIBUTE_UNUSED
)
12713 return gen_rtx_REG (Pmode
, AARCH64_STRUCT_VALUE_REGNUM
)
;
/* True for vector modes the target supports directly: any classified
   vector mode that is not a structure-of-vectors (VEC_STRUCT) mode.  */
12716 /* Implements target hook vector_mode_supported_p. */
12718 aarch64_vector_mode_supported_p (machine_mode mode
)
12720 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
12721 return vec_flags
!= 0 && (vec_flags
& VEC_STRUCT
) == 0;
/* Map a scalar MODE to the vector mode of WIDTH bits that contains it:
   SVE modes when TARGET_SVE and WIDTH is the SVE vector width, otherwise
   a 64- or 128-bit Advanced SIMD mode.
   NOTE(review): extraction-garbled fragment — the per-mode switch bodies
   (original lines 12730-12792) are almost entirely missing; left
   byte-identical.  */
12724 /* Return appropriate SIMD container
12725 for MODE within a vector of WIDTH bits. */
12726 static machine_mode
12727 aarch64_simd_container_mode (scalar_mode mode
, poly_int64 width
)
12729 if (TARGET_SVE
&& known_eq (width
, BITS_PER_SVE_VECTOR
))
12745 return VNx16QImode
;
12750 gcc_assert (known_eq (width
, 64) || known_eq (width
, 128));
12753 if (known_eq (width
, 128))
/* Preferred autovectorization mode for scalar MODE: the SVE vector width
   when SVE is enabled, otherwise a 128-bit Advanced SIMD container.  */
12793 /* Return 128-bit container as the preferred SIMD mode for MODE. */
12794 static machine_mode
12795 aarch64_preferred_simd_mode (scalar_mode mode
)
12797 poly_int64 bits
= TARGET_SVE
? BITS_PER_SVE_VECTOR
: 128;
12798 return aarch64_simd_container_mode (mode
, bits
);
/* Populate SIZES with the vector byte-sizes the vectorizer should try,
   in preference order: the SVE size (guard condition is among the missing
   lines — presumably TARGET_SVE; TODO confirm), then 16, then 8 bytes.  */
12801 /* Return a list of possible vector sizes for the vectorizer
12802 to iterate over. */
12804 aarch64_autovectorize_vector_sizes (vector_sizes
*sizes
)
12807 sizes
->safe_push (BYTES_PER_SVE_VECTOR
);
12808 sizes
->safe_push (16);
12809 sizes
->safe_push (8);
/* C++ name mangling for AArch64-specific types: __va_list mangles as if
   in namespace std ("St9__va_list"); __fp16 and the named builtin SIMD
   types get their own manglings; anything else returns NULL for the
   default mangling.  */
12812 /* Implement TARGET_MANGLE_TYPE. */
12814 static const char *
12815 aarch64_mangle_type (const_tree type
)
12817 /* The AArch64 ABI documents say that "__va_list" has to be
12818 mangled as if it is in the "std" namespace. */
12819 if (lang_hooks
.types_compatible_p (CONST_CAST_TREE (type
), va_list_type
))
12820 return "St9__va_list";
12822 /* Half-precision float. */
12823 if (TREE_CODE (type
) == REAL_TYPE
&& TYPE_PRECISION (type
) == 16)
12826 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
12828 if (TYPE_NAME (type
) != NULL
)
12829 return aarch64_mangle_builtin_type (type
);
12831 /* Use the default mangling. */
/* Step backwards from INSN to the previous insn that actually emits
   assembly, skipping insns recog cannot match (recog_memoized < 0).
   NOTE(review): garbled fragment — the NULL guard and return are among
   the missing lines.  Code left byte-identical.  */
12835 /* Find the first rtx_insn before insn that will generate an assembly
12839 aarch64_prev_real_insn (rtx_insn
*insn
)
12846 insn
= prev_real_insn (insn
);
12848 while (insn
&& recog_memoized (insn
) < 0);
/* True if T1 is one of the multiply-accumulate insn attribute types
   (linear scan of a small static table; some entries are AArch32-only).
   Used by the Cortex-A53 erratum 835769 workaround below.  */
12854 is_madd_op (enum attr_type t1
)
12857 /* A number of these may be AArch32 only. */
12858 enum attr_type mlatypes
[] = {
12859 TYPE_MLA
, TYPE_MLAS
, TYPE_SMLAD
, TYPE_SMLADX
, TYPE_SMLAL
, TYPE_SMLALD
,
12860 TYPE_SMLALS
, TYPE_SMLALXY
, TYPE_SMLAWX
, TYPE_SMLAWY
, TYPE_SMLAXY
,
12861 TYPE_SMMLA
, TYPE_UMLAL
, TYPE_UMLALS
,TYPE_SMLSD
, TYPE_SMLSDX
, TYPE_SMLSLD
12864 for (i
= 0; i
< sizeof (mlatypes
) / sizeof (enum attr_type
); i
++)
12866 if (t1
== mlatypes
[i
])
/* True if the destination register of the load MEMOP (a SET) overlaps any
   input operand (operands 1..n-1) of the insn currently held in
   recog_data — i.e. the current insn depends on the load's result.  */
12873 /* Check if there is a register dependency between a load and the insn
12874 for which we hold recog_data. */
12877 dep_between_memop_and_curr (rtx memop
)
12882 gcc_assert (GET_CODE (memop
) == SET
);
12884 if (!REG_P (SET_DEST (memop
)))
12887 load_reg
= SET_DEST (memop
);
12888 for (opno
= 1; opno
< recog_data
.n_operands
; opno
++)
12890 rtx operand
= recog_data
.operand
[opno
];
12891 if (REG_P (operand
)
12892 && reg_overlap_mentioned_p (load_reg
, operand
))
/* Cortex-A53 erratum 835769 workaround: return true when INSN is a 64-bit
   multiply-accumulate whose preceding real insn is a memory operation with
   no register dependency on it, so a NOP must be inserted between the two.
   Bails out early unless -mfix-cortex-a53-835769 is active and INSN is a
   recognized madd-class insn.  */
12900 /* When working around the Cortex-A53 erratum 835769,
12901 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
12902 instruction and has a preceding memory instruction such that a NOP
12903 should be inserted between them. */
12906 aarch64_madd_needs_nop (rtx_insn
* insn
)
12908 enum attr_type attr_type
;
12912 if (!TARGET_FIX_ERR_A53_835769
)
12915 if (!INSN_P (insn
) || recog_memoized (insn
) < 0)
12918 attr_type
= get_attr_type (insn
);
12919 if (!is_madd_op (attr_type
))
12922 prev
= aarch64_prev_real_insn (insn
);
12923 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
12924 Restore recog state to INSN to avoid state corruption. */
12925 extract_constrain_insn_cached (insn
);
12927 if (!prev
|| !contains_mem_rtx_p (PATTERN (prev
)))
12930 body
= single_set (prev
);
12932 /* If the previous insn is a memory op and there is no dependency between
12933 it and the DImode madd, emit a NOP between them. If body is NULL then we
12934 have a complex memory operation, probably a load/store pair.
12935 Be conservative for now and emit a NOP. */
12936 if (GET_MODE (recog_data
.operand
[0]) == DImode
12937 && (!body
|| !dep_between_memop_and_curr (body
)))
/* At final-assembly time, emit the erratum-835769 NOP directly into the
   asm stream just before INSN when aarch64_madd_needs_nop says so.  */
12945 /* Implement FINAL_PRESCAN_INSN. */
12948 aarch64_final_prescan_insn (rtx_insn
*insn
)
12950 if (aarch64_madd_needs_nop (insn
))
12951 fprintf (asm_out_file
, "\tnop // between mem op and mult-accumulate\n");
/* SVE INDEX immediates: any CONST_INT in [-16, 15] is valid for either
   the base or the step operand.  */
12955 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
12959 aarch64_sve_index_immediate_p (rtx base_or_step
)
12961 return (CONST_INT_P (base_or_step
)
12962 && IN_RANGE (INTVAL (base_or_step
), -16, 15));
/* SVE ADD/SUB immediates: X must be a duplicated CONST_INT; after masking
   to the element width (and negation when NEGATE_P — that branch is among
   the lines missing from this garbled fragment), the value must fit in
   an unsigned byte, optionally shifted left by 8 (0..0xff or 0..0xff00).  */
12965 /* Return true if X is a valid immediate for the SVE ADD and SUB
12966 instructions. Negate X first if NEGATE_P is true. */
12969 aarch64_sve_arith_immediate_p (rtx x
, bool negate_p
)
12973 if (!const_vec_duplicate_p (x
, &elt
)
12974 || !CONST_INT_P (elt
))
12977 HOST_WIDE_INT val
= INTVAL (elt
);
12980 val
&= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x
)));
12983 return IN_RANGE (val
, 0, 0xff);
12984 return IN_RANGE (val
, 0, 0xff00);
/* SVE logical (AND/ORR/EOR) immediates: a duplicated CONST_INT whose value
   is a valid bitmask immediate for the element mode.  */
12987 /* Return true if X is a valid immediate operand for an SVE logical
12988 instruction such as AND. */
12991 aarch64_sve_bitmask_immediate_p (rtx x
)
12995 return (const_vec_duplicate_p (x
, &elt
)
12996 && CONST_INT_P (elt
)
12997 && aarch64_bitmask_imm (INTVAL (elt
),
12998 GET_MODE_INNER (GET_MODE (x
))));
/* SVE DUP/CPY immediates: a duplicated CONST_INT that is a signed byte
   ([-0x80, 0x7f]) or a signed byte shifted left by 8 ([-0x8000, 0x7f00];
   the selecting condition between the two returns is among the missing
   lines of this garbled fragment).  */
13001 /* Return true if X is a valid immediate for the SVE DUP and CPY
13005 aarch64_sve_dup_immediate_p (rtx x
)
13009 if (!const_vec_duplicate_p (x
, &elt
)
13010 || !CONST_INT_P (elt
))
13013 HOST_WIDE_INT val
= INTVAL (elt
);
13015 return IN_RANGE (val
, -0x80, 0x7f);
13016 return IN_RANGE (val
, -0x8000, 0x7f00);
/* SVE integer-compare immediates: a duplicated CONST_INT in [-16, 15]
   for signed compares, or [0, 127] for unsigned ones.  */
13019 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
13020 SIGNED_P says whether the operand is signed rather than unsigned. */
13023 aarch64_sve_cmp_immediate_p (rtx x
, bool signed_p
)
13027 return (const_vec_duplicate_p (x
, &elt
)
13028 && CONST_INT_P (elt
)
13030 ? IN_RANGE (INTVAL (elt
), -16, 15)
13031 : IN_RANGE (INTVAL (elt
), 0, 127)));
/* SVE FADD/FSUB immediates: a duplicated CONST_DOUBLE equal to 1.0 or 0.5
   (after optional negation when NEGATE_P).  */
13034 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
13035 instruction. Negate X first if NEGATE_P is true. */
13038 aarch64_sve_float_arith_immediate_p (rtx x
, bool negate_p
)
13043 if (!const_vec_duplicate_p (x
, &elt
)
13044 || GET_CODE (elt
) != CONST_DOUBLE
)
13047 r
= *CONST_DOUBLE_REAL_VALUE (elt
);
13050 r
= real_value_negate (&r
);
13052 if (real_equal (&r
, &dconst1
))
13054 if (real_equal (&r
, &dconsthalf
))
/* SVE FMUL immediates: only 0.5 is tested for (2.0 is architecturally
   valid but never generated by GCC, per the comment below).  */
13059 /* Return true if X is a valid immediate operand for an SVE FMUL
13063 aarch64_sve_float_mul_immediate_p (rtx x
)
13067 /* GCC will never generate a multiply with an immediate of 2, so there is no
13068 point testing for it (even though it is a valid constant). */
13069 return (const_vec_duplicate_p (x
, &elt
)
13070 && GET_CODE (elt
) == CONST_DOUBLE
13071 && real_equal (CONST_DOUBLE_REAL_VALUE (elt
), &dconsthalf
));
/* Try to encode the replicated 32-bit value VAL32 as an Advanced SIMD
   modified immediate: first a byte shifted by 0/8/16/24 within a word
   (SImode + LSL), then a byte within a replicated halfword (HImode + LSL),
   then the "MSL" ones-filled forms (MOV checks only).  On success *INFO
   (when nonnull) receives the chosen element mode, value, insn and shift.
   NOTE(review): extraction-garbled fragment — the success `return true`
   statements inside each match are among the missing lines; left
   byte-identical.  */
13074 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13075 for the Advanced SIMD operation described by WHICH and INSN. If INFO
13076 is nonnull, use it to describe valid immediates. */
13078 aarch64_advsimd_valid_immediate_hs (unsigned int val32
,
13079 simd_immediate_info
*info
,
13080 enum simd_immediate_check which
,
13081 simd_immediate_info::insn_type insn
)
13083 /* Try a 4-byte immediate with LSL. */
13084 for (unsigned int shift
= 0; shift
< 32; shift
+= 8)
13085 if ((val32
& (0xff << shift
)) == val32
)
13088 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
13089 simd_immediate_info::LSL
, shift
);
13093 /* Try a 2-byte immediate with LSL. */
13094 unsigned int imm16
= val32
& 0xffff;
13095 if (imm16
== (val32
>> 16))
13096 for (unsigned int shift
= 0; shift
< 16; shift
+= 8)
13097 if ((imm16
& (0xff << shift
)) == imm16
)
13100 *info
= simd_immediate_info (HImode
, imm16
>> shift
, insn
,
13101 simd_immediate_info::LSL
, shift
);
13105 /* Try a 4-byte immediate with MSL, except for cases that MVN
13107 if (which
== AARCH64_CHECK_MOV
)
13108 for (unsigned int shift
= 8; shift
< 24; shift
+= 8)
13110 unsigned int low
= (1 << shift
) - 1;
13111 if (((val32
& (0xff << shift
)) | low
) == val32
)
13114 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
13115 simd_immediate_info::MSL
, shift
);
/* Top-level Advanced SIMD immediate check for a replicated 64-bit value:
   when the two 32-bit halves match, try the 16/32-bit encodings via
   aarch64_advsimd_valid_immediate_hs for MOV (ORR-style) and inverted for
   MVN (BIC-style); then a fully replicated byte (QImode); finally the
   64-bit "each byte is 0x00 or 0xff" bytemask form.
   NOTE(review): extraction-garbled fragment — the `return` statements are
   among the missing lines.  Code left byte-identical.  */
13123 /* Return true if replicating VAL64 is a valid immediate for the
13124 Advanced SIMD operation described by WHICH. If INFO is nonnull,
13125 use it to describe valid immediates. */
13127 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64
,
13128 simd_immediate_info
*info
,
13129 enum simd_immediate_check which
)
13131 unsigned int val32
= val64
& 0xffffffff;
13132 unsigned int val16
= val64
& 0xffff;
13133 unsigned int val8
= val64
& 0xff;
13135 if (val32
== (val64
>> 32))
13137 if ((which
& AARCH64_CHECK_ORR
) != 0
13138 && aarch64_advsimd_valid_immediate_hs (val32
, info
, which
,
13139 simd_immediate_info::MOV
))
13142 if ((which
& AARCH64_CHECK_BIC
) != 0
13143 && aarch64_advsimd_valid_immediate_hs (~val32
, info
, which
,
13144 simd_immediate_info::MVN
))
13147 /* Try using a replicated byte. */
13148 if (which
== AARCH64_CHECK_MOV
13149 && val16
== (val32
>> 16)
13150 && val8
== (val16
>> 8))
13153 *info
= simd_immediate_info (QImode
, val8
);
13158 /* Try using a bit-to-bytemask. */
13159 if (which
== AARCH64_CHECK_MOV
)
13162 for (i
= 0; i
< 64; i
+= 8)
13164 unsigned char byte
= (val64
>> i
) & 0xff;
13165 if (byte
!= 0 && byte
!= 0xff)
13171 *info
= simd_immediate_info (DImode
, val64
);
13179 instruction. If INFO is nonnull, use it to describe valid immediates. */
13182 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64
,
13183 simd_immediate_info
*info
)
13185 scalar_int_mode mode
= DImode
;
13186 unsigned int val32
= val64
& 0xffffffff;
13187 if (val32
== (val64
>> 32))
13190 unsigned int val16
= val32
& 0xffff;
13191 if (val16
== (val32
>> 16))
13194 unsigned int val8
= val16
& 0xff;
13195 if (val8
== (val16
>> 8))
13199 HOST_WIDE_INT val
= trunc_int_for_mode (val64
, mode
);
13200 if (IN_RANGE (val
, -0x80, 0x7f))
13202 /* DUP with no shift. */
13204 *info
= simd_immediate_info (mode
, val
);
13207 if ((val
& 0xff) == 0 && IN_RANGE (val
, -0x8000, 0x7f00))
13209 /* DUP with LSL #8. */
13211 *info
= simd_immediate_info (mode
, val
);
13214 if (aarch64_bitmask_imm (val64
, mode
))
13218 *info
= simd_immediate_info (mode
, val
);
13224 /* Return true if OP is a valid SIMD immediate for the operation
13225 described by WHICH. If INFO is nonnull, use it to describe valid
13228 aarch64_simd_valid_immediate (rtx op
, simd_immediate_info
*info
,
13229 enum simd_immediate_check which
)
13231 machine_mode mode
= GET_MODE (op
);
13232 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
13233 if (vec_flags
== 0 || vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
13236 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
13238 unsigned int n_elts
;
13239 if (GET_CODE (op
) == CONST_VECTOR
13240 && CONST_VECTOR_DUPLICATE_P (op
))
13241 n_elts
= CONST_VECTOR_NPATTERNS (op
);
13242 else if ((vec_flags
& VEC_SVE_DATA
)
13243 && const_vec_series_p (op
, &base
, &step
))
13245 gcc_assert (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
);
13246 if (!aarch64_sve_index_immediate_p (base
)
13247 || !aarch64_sve_index_immediate_p (step
))
13251 *info
= simd_immediate_info (elt_mode
, base
, step
);
13254 else if (GET_CODE (op
) == CONST_VECTOR
13255 && CONST_VECTOR_NUNITS (op
).is_constant (&n_elts
))
13256 /* N_ELTS set above. */;
13260 /* Handle PFALSE and PTRUE. */
13261 if (vec_flags
& VEC_SVE_PRED
)
13262 return (op
== CONST0_RTX (mode
)
13263 || op
== CONSTM1_RTX (mode
));
13265 scalar_float_mode elt_float_mode
;
13267 && is_a
<scalar_float_mode
> (elt_mode
, &elt_float_mode
))
13269 rtx elt
= CONST_VECTOR_ENCODED_ELT (op
, 0);
13270 if (aarch64_float_const_zero_rtx_p (elt
)
13271 || aarch64_float_const_representable_p (elt
))
13274 *info
= simd_immediate_info (elt_float_mode
, elt
);
13279 unsigned int elt_size
= GET_MODE_SIZE (elt_mode
);
13283 scalar_int_mode elt_int_mode
= int_mode_for_mode (elt_mode
).require ();
13285 /* Expand the vector constant out into a byte vector, with the least
13286 significant byte of the register first. */
13287 auto_vec
<unsigned char, 16> bytes
;
13288 bytes
.reserve (n_elts
* elt_size
);
13289 for (unsigned int i
= 0; i
< n_elts
; i
++)
13291 /* The vector is provided in gcc endian-neutral fashion.
13292 For aarch64_be Advanced SIMD, it must be laid out in the vector
13293 register in reverse order. */
13294 bool swap_p
= ((vec_flags
& VEC_ADVSIMD
) != 0 && BYTES_BIG_ENDIAN
);
13295 rtx elt
= CONST_VECTOR_ELT (op
, swap_p
? (n_elts
- 1 - i
) : i
);
13297 if (elt_mode
!= elt_int_mode
)
13298 elt
= gen_lowpart (elt_int_mode
, elt
);
13300 if (!CONST_INT_P (elt
))
13303 unsigned HOST_WIDE_INT elt_val
= INTVAL (elt
);
13304 for (unsigned int byte
= 0; byte
< elt_size
; byte
++)
13306 bytes
.quick_push (elt_val
& 0xff);
13307 elt_val
>>= BITS_PER_UNIT
;
13311 /* The immediate must repeat every eight bytes. */
13312 unsigned int nbytes
= bytes
.length ();
13313 for (unsigned i
= 8; i
< nbytes
; ++i
)
13314 if (bytes
[i
] != bytes
[i
- 8])
13317 /* Get the repeating 8-byte value as an integer. No endian correction
13318 is needed here because bytes is already in lsb-first order. */
13319 unsigned HOST_WIDE_INT val64
= 0;
13320 for (unsigned int i
= 0; i
< 8; i
++)
13321 val64
|= ((unsigned HOST_WIDE_INT
) bytes
[i
% nbytes
]
13322 << (i
* BITS_PER_UNIT
));
13324 if (vec_flags
& VEC_SVE_DATA
)
13325 return aarch64_sve_valid_immediate (val64
, info
);
13327 return aarch64_advsimd_valid_immediate (val64
, info
, which
);
13330 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13331 has a step in the range of INDEX. Return the index expression if so,
13332 otherwise return null. */
13334 aarch64_check_zero_based_sve_index_immediate (rtx x
)
13337 if (const_vec_series_p (x
, &base
, &step
)
13338 && base
== const0_rtx
13339 && aarch64_sve_index_immediate_p (step
))
13344 /* Check of immediate shift constants are within range. */
13346 aarch64_simd_shift_imm_p (rtx x
, machine_mode mode
, bool left
)
13348 int bit_width
= GET_MODE_UNIT_SIZE (mode
) * BITS_PER_UNIT
;
13350 return aarch64_const_vec_all_same_in_range_p (x
, 0, bit_width
- 1);
13352 return aarch64_const_vec_all_same_in_range_p (x
, 1, bit_width
);
13355 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13356 operation of width WIDTH at bit position POS. */
13359 aarch64_mask_from_zextract_ops (rtx width
, rtx pos
)
13361 gcc_assert (CONST_INT_P (width
));
13362 gcc_assert (CONST_INT_P (pos
));
13364 unsigned HOST_WIDE_INT mask
13365 = ((unsigned HOST_WIDE_INT
) 1 << UINTVAL (width
)) - 1;
13366 return GEN_INT (mask
<< UINTVAL (pos
));
13370 aarch64_mov_operand_p (rtx x
, machine_mode mode
)
13372 if (GET_CODE (x
) == HIGH
13373 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
13376 if (CONST_INT_P (x
))
13379 if (VECTOR_MODE_P (GET_MODE (x
)))
13380 return aarch64_simd_valid_immediate (x
, NULL
);
13382 if (GET_CODE (x
) == SYMBOL_REF
&& mode
== DImode
&& CONSTANT_ADDRESS_P (x
))
13385 if (aarch64_sve_cnt_immediate_p (x
))
13388 return aarch64_classify_symbolic_expression (x
)
13389 == SYMBOL_TINY_ABSOLUTE
;
13392 /* Return a const_int vector of VAL. */
13394 aarch64_simd_gen_const_vector_dup (machine_mode mode
, HOST_WIDE_INT val
)
13396 rtx c
= gen_int_mode (val
, GET_MODE_INNER (mode
));
13397 return gen_const_vec_duplicate (mode
, c
);
13400 /* Check OP is a legal scalar immediate for the MOVI instruction. */
13403 aarch64_simd_scalar_immediate_valid_for_move (rtx op
, scalar_int_mode mode
)
13405 machine_mode vmode
;
13407 vmode
= aarch64_simd_container_mode (mode
, 64);
13408 rtx op_v
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (op
));
13409 return aarch64_simd_valid_immediate (op_v
, NULL
);
13412 /* Construct and return a PARALLEL RTX vector with elements numbering the
13413 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13414 the vector - from the perspective of the architecture. This does not
13415 line up with GCC's perspective on lane numbers, so we end up with
13416 different masks depending on our target endian-ness. The diagram
13417 below may help. We must draw the distinction when building masks
13418 which select one half of the vector. An instruction selecting
13419 architectural low-lanes for a big-endian target, must be described using
13420 a mask selecting GCC high-lanes.
13422 Big-Endian Little-Endian
13424 GCC 0 1 2 3 3 2 1 0
13425 | x | x | x | x | | x | x | x | x |
13426 Architecture 3 2 1 0 3 2 1 0
13428 Low Mask: { 2, 3 } { 0, 1 }
13429 High Mask: { 0, 1 } { 2, 3 }
13431 MODE Is the mode of the vector and NUNITS is the number of units in it. */
13434 aarch64_simd_vect_par_cnst_half (machine_mode mode
, int nunits
, bool high
)
13436 rtvec v
= rtvec_alloc (nunits
/ 2);
13437 int high_base
= nunits
/ 2;
13443 if (BYTES_BIG_ENDIAN
)
13444 base
= high
? low_base
: high_base
;
13446 base
= high
? high_base
: low_base
;
13448 for (i
= 0; i
< nunits
/ 2; i
++)
13449 RTVEC_ELT (v
, i
) = GEN_INT (base
+ i
);
13451 t1
= gen_rtx_PARALLEL (mode
, v
);
13455 /* Check OP for validity as a PARALLEL RTX vector with elements
13456 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13457 from the perspective of the architecture. See the diagram above
13458 aarch64_simd_vect_par_cnst_half for more details. */
13461 aarch64_simd_check_vect_par_cnst_half (rtx op
, machine_mode mode
,
13465 if (!VECTOR_MODE_P (mode
) || !GET_MODE_NUNITS (mode
).is_constant (&nelts
))
13468 rtx ideal
= aarch64_simd_vect_par_cnst_half (mode
, nelts
, high
);
13469 HOST_WIDE_INT count_op
= XVECLEN (op
, 0);
13470 HOST_WIDE_INT count_ideal
= XVECLEN (ideal
, 0);
13473 if (count_op
!= count_ideal
)
13476 for (i
= 0; i
< count_ideal
; i
++)
13478 rtx elt_op
= XVECEXP (op
, 0, i
);
13479 rtx elt_ideal
= XVECEXP (ideal
, 0, i
);
13481 if (!CONST_INT_P (elt_op
)
13482 || INTVAL (elt_ideal
) != INTVAL (elt_op
))
13488 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
13489 HIGH (exclusive). */
13491 aarch64_simd_lane_bounds (rtx operand
, HOST_WIDE_INT low
, HOST_WIDE_INT high
,
13494 HOST_WIDE_INT lane
;
13495 gcc_assert (CONST_INT_P (operand
));
13496 lane
= INTVAL (operand
);
13498 if (lane
< low
|| lane
>= high
)
13501 error ("%Klane %wd out of range %wd - %wd", exp
, lane
, low
, high
- 1);
13503 error ("lane %wd out of range %wd - %wd", lane
, low
, high
- 1);
13507 /* Peform endian correction on lane number N, which indexes a vector
13508 of mode MODE, and return the result as an SImode rtx. */
13511 aarch64_endian_lane_rtx (machine_mode mode
, unsigned int n
)
13513 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode
), n
), SImode
);
13516 /* Return TRUE if OP is a valid vector addressing mode. */
13519 aarch64_simd_mem_operand_p (rtx op
)
13521 return MEM_P (op
) && (GET_CODE (XEXP (op
, 0)) == POST_INC
13522 || REG_P (XEXP (op
, 0)));
13525 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
13528 aarch64_sve_ld1r_operand_p (rtx op
)
13530 struct aarch64_address_info addr
;
13534 && is_a
<scalar_mode
> (GET_MODE (op
), &mode
)
13535 && aarch64_classify_address (&addr
, XEXP (op
, 0), mode
, false)
13536 && addr
.type
== ADDRESS_REG_IMM
13537 && offset_6bit_unsigned_scaled_p (mode
, addr
.const_offset
));
13540 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13541 The conditions for STR are the same. */
13543 aarch64_sve_ldr_operand_p (rtx op
)
13545 struct aarch64_address_info addr
;
13548 && aarch64_classify_address (&addr
, XEXP (op
, 0), GET_MODE (op
),
13549 false, ADDR_QUERY_ANY
)
13550 && addr
.type
== ADDRESS_REG_IMM
);
13553 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13554 We need to be able to access the individual pieces, so the range
13555 is different from LD[234] and ST[234]. */
13557 aarch64_sve_struct_memory_operand_p (rtx op
)
13562 machine_mode mode
= GET_MODE (op
);
13563 struct aarch64_address_info addr
;
13564 if (!aarch64_classify_address (&addr
, XEXP (op
, 0), SVE_BYTE_MODE
, false,
13566 || addr
.type
!= ADDRESS_REG_IMM
)
13569 poly_int64 first
= addr
.const_offset
;
13570 poly_int64 last
= first
+ GET_MODE_SIZE (mode
) - BYTES_PER_SVE_VECTOR
;
13571 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, first
)
13572 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE
, last
));
13575 /* Emit a register copy from operand to operand, taking care not to
13576 early-clobber source registers in the process.
13578 COUNT is the number of components into which the copy needs to be
13581 aarch64_simd_emit_reg_reg_move (rtx
*operands
, machine_mode mode
,
13582 unsigned int count
)
13585 int rdest
= REGNO (operands
[0]);
13586 int rsrc
= REGNO (operands
[1]);
13588 if (!reg_overlap_mentioned_p (operands
[0], operands
[1])
13590 for (i
= 0; i
< count
; i
++)
13591 emit_move_insn (gen_rtx_REG (mode
, rdest
+ i
),
13592 gen_rtx_REG (mode
, rsrc
+ i
));
13594 for (i
= 0; i
< count
; i
++)
13595 emit_move_insn (gen_rtx_REG (mode
, rdest
+ count
- i
- 1),
13596 gen_rtx_REG (mode
, rsrc
+ count
- i
- 1));
13599 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13600 one of VSTRUCT modes: OI, CI, or XI. */
13602 aarch64_simd_attr_length_rglist (machine_mode mode
)
13604 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
13605 return (GET_MODE_SIZE (mode
).to_constant () / UNITS_PER_VREG
) * 4;
13608 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
13609 alignment of a vector to 128 bits. SVE predicates have an alignment of
13611 static HOST_WIDE_INT
13612 aarch64_simd_vector_alignment (const_tree type
)
13614 if (TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
13615 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13616 be set for non-predicate vectors of booleans. Modes are the most
13617 direct way we have of identifying real SVE predicate types. */
13618 return GET_MODE_CLASS (TYPE_MODE (type
)) == MODE_VECTOR_BOOL
? 16 : 128;
13619 HOST_WIDE_INT align
= tree_to_shwi (TYPE_SIZE (type
));
13620 return MIN (align
, 128);
13623 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
13624 static HOST_WIDE_INT
13625 aarch64_vectorize_preferred_vector_alignment (const_tree type
)
13627 if (aarch64_sve_data_mode_p (TYPE_MODE (type
)))
13629 /* If the length of the vector is fixed, try to align to that length,
13630 otherwise don't try to align at all. */
13631 HOST_WIDE_INT result
;
13632 if (!BITS_PER_SVE_VECTOR
.is_constant (&result
))
13633 result
= TYPE_ALIGN (TREE_TYPE (type
));
13636 return TYPE_ALIGN (type
);
13639 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13641 aarch64_simd_vector_alignment_reachable (const_tree type
, bool is_packed
)
13646 /* For fixed-length vectors, check that the vectorizer will aim for
13647 full-vector alignment. This isn't true for generic GCC vectors
13648 that are wider than the ABI maximum of 128 bits. */
13649 if (TREE_CODE (TYPE_SIZE (type
)) == INTEGER_CST
13650 && (wi::to_widest (TYPE_SIZE (type
))
13651 != aarch64_vectorize_preferred_vector_alignment (type
)))
13654 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13658 /* Return true if the vector misalignment factor is supported by the
13661 aarch64_builtin_support_vector_misalignment (machine_mode mode
,
13662 const_tree type
, int misalignment
,
13665 if (TARGET_SIMD
&& STRICT_ALIGNMENT
)
13667 /* Return if movmisalign pattern is not supported for this mode. */
13668 if (optab_handler (movmisalign_optab
, mode
) == CODE_FOR_nothing
)
13671 /* Misalignment factor is unknown at compile time. */
13672 if (misalignment
== -1)
13675 return default_builtin_support_vector_misalignment (mode
, type
, misalignment
,
13679 /* If VALS is a vector constant that can be loaded into a register
13680 using DUP, generate instructions to do so and return an RTX to
13681 assign to the register. Otherwise return NULL_RTX. */
13683 aarch64_simd_dup_constant (rtx vals
)
13685 machine_mode mode
= GET_MODE (vals
);
13686 machine_mode inner_mode
= GET_MODE_INNER (mode
);
13689 if (!const_vec_duplicate_p (vals
, &x
))
13692 /* We can load this constant by using DUP and a constant in a
13693 single ARM register. This will be cheaper than a vector
13695 x
= copy_to_mode_reg (inner_mode
, x
);
13696 return gen_vec_duplicate (mode
, x
);
13700 /* Generate code to load VALS, which is a PARALLEL containing only
13701 constants (for vec_init) or CONST_VECTOR, efficiently into a
13702 register. Returns an RTX to copy into the register, or NULL_RTX
13703 for a PARALLEL that can not be converted into a CONST_VECTOR. */
13705 aarch64_simd_make_constant (rtx vals
)
13707 machine_mode mode
= GET_MODE (vals
);
13709 rtx const_vec
= NULL_RTX
;
13713 if (GET_CODE (vals
) == CONST_VECTOR
)
13715 else if (GET_CODE (vals
) == PARALLEL
)
13717 /* A CONST_VECTOR must contain only CONST_INTs and
13718 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13719 Only store valid constants in a CONST_VECTOR. */
13720 int n_elts
= XVECLEN (vals
, 0);
13721 for (i
= 0; i
< n_elts
; ++i
)
13723 rtx x
= XVECEXP (vals
, 0, i
);
13724 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
13727 if (n_const
== n_elts
)
13728 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0));
13731 gcc_unreachable ();
13733 if (const_vec
!= NULL_RTX
13734 && aarch64_simd_valid_immediate (const_vec
, NULL
))
13735 /* Load using MOVI/MVNI. */
13737 else if ((const_dup
= aarch64_simd_dup_constant (vals
)) != NULL_RTX
)
13738 /* Loaded using DUP. */
13740 else if (const_vec
!= NULL_RTX
)
13741 /* Load from constant pool. We can not take advantage of single-cycle
13742 LD1 because we need a PC-relative addressing mode. */
13745 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13746 We can not construct an initializer. */
13750 /* Expand a vector initialisation sequence, such that TARGET is
13751 initialised to contain VALS. */
13754 aarch64_expand_vector_init (rtx target
, rtx vals
)
13756 machine_mode mode
= GET_MODE (target
);
13757 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
13758 /* The number of vector elements. */
13759 int n_elts
= XVECLEN (vals
, 0);
13760 /* The number of vector elements which are not constant. */
13762 rtx any_const
= NULL_RTX
;
13763 /* The first element of vals. */
13764 rtx v0
= XVECEXP (vals
, 0, 0);
13765 bool all_same
= true;
13767 /* Count the number of variable elements to initialise. */
13768 for (int i
= 0; i
< n_elts
; ++i
)
13770 rtx x
= XVECEXP (vals
, 0, i
);
13771 if (!(CONST_INT_P (x
) || CONST_DOUBLE_P (x
)))
13776 all_same
&= rtx_equal_p (x
, v0
);
13779 /* No variable elements, hand off to aarch64_simd_make_constant which knows
13780 how best to handle this. */
13783 rtx constant
= aarch64_simd_make_constant (vals
);
13784 if (constant
!= NULL_RTX
)
13786 emit_move_insn (target
, constant
);
13791 /* Splat a single non-constant element if we can. */
13794 rtx x
= copy_to_mode_reg (inner_mode
, v0
);
13795 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
13799 enum insn_code icode
= optab_handler (vec_set_optab
, mode
);
13800 gcc_assert (icode
!= CODE_FOR_nothing
);
13802 /* If there are only variable elements, try to optimize
13803 the insertion using dup for the most common element
13804 followed by insertions. */
13806 /* The algorithm will fill matches[*][0] with the earliest matching element,
13807 and matches[X][1] with the count of duplicate elements (if X is the
13808 earliest element which has duplicates). */
13810 if (n_var
== n_elts
&& n_elts
<= 16)
13812 int matches
[16][2] = {0};
13813 for (int i
= 0; i
< n_elts
; i
++)
13815 for (int j
= 0; j
<= i
; j
++)
13817 if (rtx_equal_p (XVECEXP (vals
, 0, i
), XVECEXP (vals
, 0, j
)))
13825 int maxelement
= 0;
13827 for (int i
= 0; i
< n_elts
; i
++)
13828 if (matches
[i
][1] > maxv
)
13831 maxv
= matches
[i
][1];
13834 /* Create a duplicate of the most common element, unless all elements
13835 are equally useless to us, in which case just immediately set the
13836 vector register using the first element. */
13840 /* For vectors of two 64-bit elements, we can do even better. */
13842 && (inner_mode
== E_DImode
13843 || inner_mode
== E_DFmode
))
13846 rtx x0
= XVECEXP (vals
, 0, 0);
13847 rtx x1
= XVECEXP (vals
, 0, 1);
13848 /* Combine can pick up this case, but handling it directly
13849 here leaves clearer RTL.
13851 This is load_pair_lanes<mode>, and also gives us a clean-up
13852 for store_pair_lanes<mode>. */
13853 if (memory_operand (x0
, inner_mode
)
13854 && memory_operand (x1
, inner_mode
)
13855 && !STRICT_ALIGNMENT
13856 && rtx_equal_p (XEXP (x1
, 0),
13857 plus_constant (Pmode
,
13859 GET_MODE_SIZE (inner_mode
))))
13862 if (inner_mode
== DFmode
)
13863 t
= gen_load_pair_lanesdf (target
, x0
, x1
);
13865 t
= gen_load_pair_lanesdi (target
, x0
, x1
);
13870 /* The subreg-move sequence below will move into lane zero of the
13871 vector register. For big-endian we want that position to hold
13872 the last element of VALS. */
13873 maxelement
= BYTES_BIG_ENDIAN
? n_elts
- 1 : 0;
13874 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
13875 aarch64_emit_move (target
, lowpart_subreg (mode
, x
, inner_mode
));
13879 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
13880 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
13883 /* Insert the rest. */
13884 for (int i
= 0; i
< n_elts
; i
++)
13886 rtx x
= XVECEXP (vals
, 0, i
);
13887 if (matches
[i
][0] == maxelement
)
13889 x
= copy_to_mode_reg (inner_mode
, x
);
13890 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
13895 /* Initialise a vector which is part-variable. We want to first try
13896 to build those lanes which are constant in the most efficient way we
13898 if (n_var
!= n_elts
)
13900 rtx copy
= copy_rtx (vals
);
13902 /* Load constant part of vector. We really don't care what goes into the
13903 parts we will overwrite, but we're more likely to be able to load the
13904 constant efficiently if it has fewer, larger, repeating parts
13905 (see aarch64_simd_valid_immediate). */
13906 for (int i
= 0; i
< n_elts
; i
++)
13908 rtx x
= XVECEXP (vals
, 0, i
);
13909 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
13911 rtx subst
= any_const
;
13912 for (int bit
= n_elts
/ 2; bit
> 0; bit
/= 2)
13914 /* Look in the copied vector, as more elements are const. */
13915 rtx test
= XVECEXP (copy
, 0, i
^ bit
);
13916 if (CONST_INT_P (test
) || CONST_DOUBLE_P (test
))
13922 XVECEXP (copy
, 0, i
) = subst
;
13924 aarch64_expand_vector_init (target
, copy
);
13927 /* Insert the variable lanes directly. */
13928 for (int i
= 0; i
< n_elts
; i
++)
13930 rtx x
= XVECEXP (vals
, 0, i
);
13931 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
13933 x
= copy_to_mode_reg (inner_mode
, x
);
13934 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
13938 static unsigned HOST_WIDE_INT
13939 aarch64_shift_truncation_mask (machine_mode mode
)
13941 if (!SHIFT_COUNT_TRUNCATED
|| aarch64_vector_data_mode_p (mode
))
13943 return GET_MODE_UNIT_BITSIZE (mode
) - 1;
13946 /* Select a format to encode pointers in exception handling data. */
13948 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED
, int global
)
13951 switch (aarch64_cmodel
)
13953 case AARCH64_CMODEL_TINY
:
13954 case AARCH64_CMODEL_TINY_PIC
:
13955 case AARCH64_CMODEL_SMALL
:
13956 case AARCH64_CMODEL_SMALL_PIC
:
13957 case AARCH64_CMODEL_SMALL_SPIC
:
13958 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
13960 type
= DW_EH_PE_sdata4
;
13963 /* No assumptions here. 8-byte relocs required. */
13964 type
= DW_EH_PE_sdata8
;
13967 return (global
? DW_EH_PE_indirect
: 0) | DW_EH_PE_pcrel
| type
;
13970 /* The last .arch and .tune assembly strings that we printed. */
13971 static std::string aarch64_last_printed_arch_string
;
13972 static std::string aarch64_last_printed_tune_string
;
13974 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
13975 by the function fndecl. */
13978 aarch64_declare_function_name (FILE *stream
, const char* name
,
13981 tree target_parts
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
13983 struct cl_target_option
*targ_options
;
13985 targ_options
= TREE_TARGET_OPTION (target_parts
);
13987 targ_options
= TREE_TARGET_OPTION (target_option_current_node
);
13988 gcc_assert (targ_options
);
13990 const struct processor
*this_arch
13991 = aarch64_get_arch (targ_options
->x_explicit_arch
);
13993 unsigned long isa_flags
= targ_options
->x_aarch64_isa_flags
;
13994 std::string extension
13995 = aarch64_get_extension_string_for_isa_flags (isa_flags
,
13997 /* Only update the assembler .arch string if it is distinct from the last
13998 such string we printed. */
13999 std::string to_print
= this_arch
->name
+ extension
;
14000 if (to_print
!= aarch64_last_printed_arch_string
)
14002 asm_fprintf (asm_out_file
, "\t.arch %s\n", to_print
.c_str ());
14003 aarch64_last_printed_arch_string
= to_print
;
14006 /* Print the cpu name we're tuning for in the comments, might be
14007 useful to readers of the generated asm. Do it only when it changes
14008 from function to function and verbose assembly is requested. */
14009 const struct processor
*this_tune
14010 = aarch64_get_tune_cpu (targ_options
->x_explicit_tune_core
);
14012 if (flag_debug_asm
&& aarch64_last_printed_tune_string
!= this_tune
->name
)
14014 asm_fprintf (asm_out_file
, "\t" ASM_COMMENT_START
".tune %s\n",
14016 aarch64_last_printed_tune_string
= this_tune
->name
;
14019 /* Don't forget the type directive for ELF. */
14020 ASM_OUTPUT_TYPE_DIRECTIVE (stream
, name
, "function");
14021 ASM_OUTPUT_LABEL (stream
, name
);
14024 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
14027 aarch64_start_file (void)
14029 struct cl_target_option
*default_options
14030 = TREE_TARGET_OPTION (target_option_default_node
);
14032 const struct processor
*default_arch
14033 = aarch64_get_arch (default_options
->x_explicit_arch
);
14034 unsigned long default_isa_flags
= default_options
->x_aarch64_isa_flags
;
14035 std::string extension
14036 = aarch64_get_extension_string_for_isa_flags (default_isa_flags
,
14037 default_arch
->flags
);
14039 aarch64_last_printed_arch_string
= default_arch
->name
+ extension
;
14040 aarch64_last_printed_tune_string
= "";
14041 asm_fprintf (asm_out_file
, "\t.arch %s\n",
14042 aarch64_last_printed_arch_string
.c_str ());
14044 default_file_start ();
14047 /* Emit load exclusive. */
14050 aarch64_emit_load_exclusive (machine_mode mode
, rtx rval
,
14051 rtx mem
, rtx model_rtx
)
14053 emit_insn (gen_aarch64_load_exclusive (mode
, rval
, mem
, model_rtx
));
14056 /* Emit store exclusive. */
14059 aarch64_emit_store_exclusive (machine_mode mode
, rtx bval
,
14060 rtx rval
, rtx mem
, rtx model_rtx
)
14062 emit_insn (gen_aarch64_store_exclusive (mode
, bval
, rval
, mem
, model_rtx
));
14065 /* Mark the previous jump instruction as unlikely. */
14068 aarch64_emit_unlikely_jump (rtx insn
)
14070 rtx_insn
*jump
= emit_jump_insn (insn
);
14071 add_reg_br_prob_note (jump
, profile_probability::very_unlikely ());
14074 /* Expand a compare and swap pattern. */
14077 aarch64_expand_compare_and_swap (rtx operands
[])
14079 rtx bval
, rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
, x
;
14080 machine_mode mode
, cmp_mode
;
14082 bval
= operands
[0];
14083 rval
= operands
[1];
14085 oldval
= operands
[3];
14086 newval
= operands
[4];
14087 is_weak
= operands
[5];
14088 mod_s
= operands
[6];
14089 mod_f
= operands
[7];
14090 mode
= GET_MODE (mem
);
14093 /* Normally the succ memory model must be stronger than fail, but in the
14094 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14095 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
14097 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f
)))
14098 && is_mm_release (memmodel_from_int (INTVAL (mod_s
))))
14099 mod_s
= GEN_INT (MEMMODEL_ACQ_REL
);
14105 /* For short modes, we're going to perform the comparison in SImode,
14106 so do the zero-extension now. */
14108 rval
= gen_reg_rtx (SImode
);
14109 oldval
= convert_modes (SImode
, mode
, oldval
, true);
14110 /* Fall through. */
14114 /* Force the value into a register if needed. */
14115 if (!aarch64_plus_operand (oldval
, mode
))
14116 oldval
= force_reg (cmp_mode
, oldval
);
14120 gcc_unreachable ();
14124 emit_insn (gen_aarch64_compare_and_swap_lse (mode
, rval
, mem
, oldval
,
14125 newval
, is_weak
, mod_s
,
14128 emit_insn (gen_aarch64_compare_and_swap (mode
, rval
, mem
, oldval
, newval
,
14129 is_weak
, mod_s
, mod_f
));
14132 if (mode
== QImode
|| mode
== HImode
)
14133 emit_move_insn (operands
[1], gen_lowpart (mode
, rval
));
14135 x
= gen_rtx_REG (CCmode
, CC_REGNUM
);
14136 x
= gen_rtx_EQ (SImode
, x
, const0_rtx
);
14137 emit_insn (gen_rtx_SET (bval
, x
));
14140 /* Test whether the target supports using a atomic load-operate instruction.
14141 CODE is the operation and AFTER is TRUE if the data in memory after the
14142 operation should be returned and FALSE if the data before the operation
14143 should be returned. Returns FALSE if the operation isn't supported by the
14147 aarch64_atomic_ldop_supported_p (enum rtx_code code
)
14166 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
14167 sequence implementing an atomic operation. */
14170 aarch64_emit_post_barrier (enum memmodel model
)
14172 const enum memmodel base_model
= memmodel_base (model
);
14174 if (is_mm_sync (model
)
14175 && (base_model
== MEMMODEL_ACQUIRE
14176 || base_model
== MEMMODEL_ACQ_REL
14177 || base_model
== MEMMODEL_SEQ_CST
))
14179 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST
)));
14183 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
14184 for the data in memory. EXPECTED is the value expected to be in memory.
14185 DESIRED is the value to store to memory. MEM is the memory location. MODEL
14186 is the memory ordering to use. */
14189 aarch64_gen_atomic_cas (rtx rval
, rtx mem
,
14190 rtx expected
, rtx desired
,
14195 mode
= GET_MODE (mem
);
14197 /* Move the expected value into the CAS destination register. */
14198 emit_insn (gen_rtx_SET (rval
, expected
));
14200 /* Emit the CAS. */
14201 emit_insn (gen_aarch64_atomic_cas (mode
, rval
, mem
, desired
, model
));
14203 /* Compare the expected value with the value loaded by the CAS, to establish
14204 whether the swap was made. */
14205 aarch64_gen_compare_reg (EQ
, rval
, expected
);
14208 /* Split a compare and swap pattern. */
14211 aarch64_split_compare_and_swap (rtx operands
[])
14213 rtx rval
, mem
, oldval
, newval
, scratch
;
14216 rtx_code_label
*label1
, *label2
;
14218 enum memmodel model
;
14221 rval
= operands
[0];
14223 oldval
= operands
[2];
14224 newval
= operands
[3];
14225 is_weak
= (operands
[4] != const0_rtx
);
14226 model_rtx
= operands
[5];
14227 scratch
= operands
[7];
14228 mode
= GET_MODE (mem
);
14229 model
= memmodel_from_int (INTVAL (model_rtx
));
14231 /* When OLDVAL is zero and we want the strong version we can emit a tighter
14234 LD[A]XR rval, [mem]
14236 ST[L]XR scratch, newval, [mem]
14237 CBNZ scratch, .label1
14240 bool strong_zero_p
= !is_weak
&& oldval
== const0_rtx
;
14245 label1
= gen_label_rtx ();
14246 emit_label (label1
);
14248 label2
= gen_label_rtx ();
14250 /* The initial load can be relaxed for a __sync operation since a final
14251 barrier will be emitted to stop code hoisting. */
14252 if (is_mm_sync (model
))
14253 aarch64_emit_load_exclusive (mode
, rval
, mem
,
14254 GEN_INT (MEMMODEL_RELAXED
));
14256 aarch64_emit_load_exclusive (mode
, rval
, mem
, model_rtx
);
14260 if (aarch64_track_speculation
)
14262 /* Emit an explicit compare instruction, so that we can correctly
14263 track the condition codes. */
14264 rtx cc_reg
= aarch64_gen_compare_reg (NE
, rval
, const0_rtx
);
14265 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
14268 x
= gen_rtx_NE (VOIDmode
, rval
, const0_rtx
);
14270 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
14271 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
14272 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
14276 cond
= aarch64_gen_compare_reg (NE
, rval
, oldval
);
14277 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
14278 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
14279 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
14280 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
14283 aarch64_emit_store_exclusive (mode
, scratch
, mem
, newval
, model_rtx
);
14287 if (aarch64_track_speculation
)
14289 /* Emit an explicit compare instruction, so that we can correctly
14290 track the condition codes. */
14291 rtx cc_reg
= aarch64_gen_compare_reg (NE
, scratch
, const0_rtx
);
14292 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
14295 x
= gen_rtx_NE (VOIDmode
, scratch
, const0_rtx
);
14297 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
14298 gen_rtx_LABEL_REF (Pmode
, label1
), pc_rtx
);
14299 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
14303 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
14304 x
= gen_rtx_COMPARE (CCmode
, scratch
, const0_rtx
);
14305 emit_insn (gen_rtx_SET (cond
, x
));
14308 emit_label (label2
);
14309 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
14310 to set the condition flags. If this is not used it will be removed by
14314 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
14315 x
= gen_rtx_COMPARE (CCmode
, rval
, const0_rtx
);
14316 emit_insn (gen_rtx_SET (cond
, x
));
14318 /* Emit any final barrier needed for a __sync operation. */
14319 if (is_mm_sync (model
))
14320 aarch64_emit_post_barrier (model
);
14323 /* Emit a BIC instruction. */
14326 aarch64_emit_bic (machine_mode mode
, rtx dst
, rtx s1
, rtx s2
, int shift
)
14328 rtx shift_rtx
= GEN_INT (shift
);
14329 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
14333 case E_SImode
: gen
= gen_and_one_cmpl_lshrsi3
; break;
14334 case E_DImode
: gen
= gen_and_one_cmpl_lshrdi3
; break;
14336 gcc_unreachable ();
14339 emit_insn (gen (dst
, s2
, shift_rtx
, s1
));
14342 /* Emit an atomic swap. */
14345 aarch64_emit_atomic_swap (machine_mode mode
, rtx dst
, rtx value
,
14346 rtx mem
, rtx model
)
14348 emit_insn (gen_aarch64_atomic_swp (mode
, dst
, mem
, value
, model
));
14351 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
14352 location to store the data read from memory. OUT_RESULT is the location to
14353 store the result of the operation. MEM is the memory location to read and
14354 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
14355 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
14359 aarch64_gen_atomic_ldop (enum rtx_code code
, rtx out_data
, rtx out_result
,
14360 rtx mem
, rtx value
, rtx model_rtx
)
14362 machine_mode mode
= GET_MODE (mem
);
14363 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
14364 const bool short_mode
= (mode
< SImode
);
14370 out_data
= gen_lowpart (mode
, out_data
);
14373 out_result
= gen_lowpart (mode
, out_result
);
14375 /* Make sure the value is in a register, putting it into a destination
14376 register if it needs to be manipulated. */
14377 if (!register_operand (value
, mode
)
14378 || code
== AND
|| code
== MINUS
)
14380 src
= out_result
? out_result
: out_data
;
14381 emit_move_insn (src
, gen_lowpart (mode
, value
));
14385 gcc_assert (register_operand (src
, mode
));
14387 /* Preprocess the data for the operation as necessary. If the operation is
14388 a SET then emit a swap instruction and finish. */
14392 aarch64_emit_atomic_swap (mode
, out_data
, src
, mem
, model_rtx
);
14396 /* Negate the value and treat it as a PLUS. */
14400 /* Resize the value if necessary. */
14402 src
= gen_lowpart (wmode
, src
);
14404 neg_src
= gen_rtx_NEG (wmode
, src
);
14405 emit_insn (gen_rtx_SET (src
, neg_src
));
14408 src
= gen_lowpart (mode
, src
);
14410 /* Fall-through. */
14412 ldop_code
= UNSPECV_ATOMIC_LDOP_PLUS
;
14416 ldop_code
= UNSPECV_ATOMIC_LDOP_OR
;
14420 ldop_code
= UNSPECV_ATOMIC_LDOP_XOR
;
14427 /* Resize the value if necessary. */
14429 src
= gen_lowpart (wmode
, src
);
14431 not_src
= gen_rtx_NOT (wmode
, src
);
14432 emit_insn (gen_rtx_SET (src
, not_src
));
14435 src
= gen_lowpart (mode
, src
);
14437 ldop_code
= UNSPECV_ATOMIC_LDOP_BIC
;
14441 /* The operation can't be done with atomic instructions. */
14442 gcc_unreachable ();
14445 emit_insn (gen_aarch64_atomic_load (ldop_code
, mode
,
14446 out_data
, mem
, src
, model_rtx
));
14448 /* If necessary, calculate the data in memory after the update by redoing the
14449 operation from values in registers. */
14455 src
= gen_lowpart (wmode
, src
);
14456 out_data
= gen_lowpart (wmode
, out_data
);
14457 out_result
= gen_lowpart (wmode
, out_result
);
14466 x
= gen_rtx_PLUS (wmode
, out_data
, src
);
14469 x
= gen_rtx_IOR (wmode
, out_data
, src
);
14472 x
= gen_rtx_XOR (wmode
, out_data
, src
);
14475 aarch64_emit_bic (wmode
, out_result
, out_data
, src
, 0);
14478 gcc_unreachable ();
14481 emit_set_insn (out_result
, x
);
14486 /* Split an atomic operation. */
14489 aarch64_split_atomic_op (enum rtx_code code
, rtx old_out
, rtx new_out
, rtx mem
,
14490 rtx value
, rtx model_rtx
, rtx cond
)
14492 machine_mode mode
= GET_MODE (mem
);
14493 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
14494 const enum memmodel model
= memmodel_from_int (INTVAL (model_rtx
));
14495 const bool is_sync
= is_mm_sync (model
);
14496 rtx_code_label
*label
;
14499 /* Split the atomic operation into a sequence. */
14500 label
= gen_label_rtx ();
14501 emit_label (label
);
14504 new_out
= gen_lowpart (wmode
, new_out
);
14506 old_out
= gen_lowpart (wmode
, old_out
);
14509 value
= simplify_gen_subreg (wmode
, value
, mode
, 0);
14511 /* The initial load can be relaxed for a __sync operation since a final
14512 barrier will be emitted to stop code hoisting. */
14514 aarch64_emit_load_exclusive (mode
, old_out
, mem
,
14515 GEN_INT (MEMMODEL_RELAXED
));
14517 aarch64_emit_load_exclusive (mode
, old_out
, mem
, model_rtx
);
14526 x
= gen_rtx_AND (wmode
, old_out
, value
);
14527 emit_insn (gen_rtx_SET (new_out
, x
));
14528 x
= gen_rtx_NOT (wmode
, new_out
);
14529 emit_insn (gen_rtx_SET (new_out
, x
));
14533 if (CONST_INT_P (value
))
14535 value
= GEN_INT (-INTVAL (value
));
14538 /* Fall through. */
14541 x
= gen_rtx_fmt_ee (code
, wmode
, old_out
, value
);
14542 emit_insn (gen_rtx_SET (new_out
, x
));
14546 aarch64_emit_store_exclusive (mode
, cond
, mem
,
14547 gen_lowpart (mode
, new_out
), model_rtx
);
14549 if (aarch64_track_speculation
)
14551 /* Emit an explicit compare instruction, so that we can correctly
14552 track the condition codes. */
14553 rtx cc_reg
= aarch64_gen_compare_reg (NE
, cond
, const0_rtx
);
14554 x
= gen_rtx_NE (GET_MODE (cc_reg
), cc_reg
, const0_rtx
);
14557 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
14559 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
14560 gen_rtx_LABEL_REF (Pmode
, label
), pc_rtx
);
14561 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
14563 /* Emit any final barrier needed for a __sync operation. */
14565 aarch64_emit_post_barrier (model
);
14569 aarch64_init_libfuncs (void)
14571 /* Half-precision float operations. The compiler handles all operations
14572 with NULL libfuncs by converting to SFmode. */
14575 set_conv_libfunc (trunc_optab
, HFmode
, SFmode
, "__gnu_f2h_ieee");
14576 set_conv_libfunc (sext_optab
, SFmode
, HFmode
, "__gnu_h2f_ieee");
14579 set_optab_libfunc (add_optab
, HFmode
, NULL
);
14580 set_optab_libfunc (sdiv_optab
, HFmode
, NULL
);
14581 set_optab_libfunc (smul_optab
, HFmode
, NULL
);
14582 set_optab_libfunc (neg_optab
, HFmode
, NULL
);
14583 set_optab_libfunc (sub_optab
, HFmode
, NULL
);
14586 set_optab_libfunc (eq_optab
, HFmode
, NULL
);
14587 set_optab_libfunc (ne_optab
, HFmode
, NULL
);
14588 set_optab_libfunc (lt_optab
, HFmode
, NULL
);
14589 set_optab_libfunc (le_optab
, HFmode
, NULL
);
14590 set_optab_libfunc (ge_optab
, HFmode
, NULL
);
14591 set_optab_libfunc (gt_optab
, HFmode
, NULL
);
14592 set_optab_libfunc (unord_optab
, HFmode
, NULL
);
14595 /* Target hook for c_mode_for_suffix. */
14596 static machine_mode
14597 aarch64_c_mode_for_suffix (char suffix
)
14605 /* We can only represent floating point constants which will fit in
14606 "quarter-precision" values. These values are characterised by
14607 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
14610 (-1)^s * (n/16) * 2^r
14613 's' is the sign bit.
14614 'n' is an integer in the range 16 <= n <= 31.
14615 'r' is an integer in the range -3 <= r <= 4. */
14617 /* Return true iff X can be represented by a quarter-precision
14618 floating point immediate operand X. Note, we cannot represent 0.0. */
14620 aarch64_float_const_representable_p (rtx x
)
14622 /* This represents our current view of how many bits
14623 make up the mantissa. */
14624 int point_pos
= 2 * HOST_BITS_PER_WIDE_INT
- 1;
14626 unsigned HOST_WIDE_INT mantissa
, mask
;
14627 REAL_VALUE_TYPE r
, m
;
14630 if (!CONST_DOUBLE_P (x
))
14633 if (GET_MODE (x
) == VOIDmode
14634 || (GET_MODE (x
) == HFmode
&& !TARGET_FP_F16INST
))
14637 r
= *CONST_DOUBLE_REAL_VALUE (x
);
14639 /* We cannot represent infinities, NaNs or +/-zero. We won't
14640 know if we have +zero until we analyse the mantissa, but we
14641 can reject the other invalid values. */
14642 if (REAL_VALUE_ISINF (r
) || REAL_VALUE_ISNAN (r
)
14643 || REAL_VALUE_MINUS_ZERO (r
))
14646 /* Extract exponent. */
14647 r
= real_value_abs (&r
);
14648 exponent
= REAL_EXP (&r
);
14650 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
14651 highest (sign) bit, with a fixed binary point at bit point_pos.
14652 m1 holds the low part of the mantissa, m2 the high part.
14653 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14654 bits for the mantissa, this can fail (low bits will be lost). */
14655 real_ldexp (&m
, &r
, point_pos
- exponent
);
14656 wide_int w
= real_to_integer (&m
, &fail
, HOST_BITS_PER_WIDE_INT
* 2);
14658 /* If the low part of the mantissa has bits set we cannot represent
14660 if (w
.ulow () != 0)
14662 /* We have rejected the lower HOST_WIDE_INT, so update our
14663 understanding of how many bits lie in the mantissa and
14664 look only at the high HOST_WIDE_INT. */
14665 mantissa
= w
.elt (1);
14666 point_pos
-= HOST_BITS_PER_WIDE_INT
;
14668 /* We can only represent values with a mantissa of the form 1.xxxx. */
14669 mask
= ((unsigned HOST_WIDE_INT
)1 << (point_pos
- 5)) - 1;
14670 if ((mantissa
& mask
) != 0)
14673 /* Having filtered unrepresentable values, we may now remove all
14674 but the highest 5 bits. */
14675 mantissa
>>= point_pos
- 5;
14677 /* We cannot represent the value 0.0, so reject it. This is handled
14682 /* Then, as bit 4 is always set, we can mask it off, leaving
14683 the mantissa in the range [0, 15]. */
14684 mantissa
&= ~(1 << 4);
14685 gcc_assert (mantissa
<= 15);
14687 /* GCC internally does not use IEEE754-like encoding (where normalized
14688 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
14689 Our mantissa values are shifted 4 places to the left relative to
14690 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14691 by 5 places to correct for GCC's representation. */
14692 exponent
= 5 - exponent
;
14694 return (exponent
>= 0 && exponent
<= 7);
14697 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14698 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
14699 output MOVI/MVNI, ORR or BIC immediate. */
14701 aarch64_output_simd_mov_immediate (rtx const_vector
, unsigned width
,
14702 enum simd_immediate_check which
)
14705 static char templ
[40];
14706 const char *mnemonic
;
14707 const char *shift_op
;
14708 unsigned int lane_count
= 0;
14711 struct simd_immediate_info info
;
14713 /* This will return true to show const_vector is legal for use as either
14714 a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14715 It will also update INFO to show how the immediate should be generated.
14716 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
14717 is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
, which
);
14718 gcc_assert (is_valid
);
14720 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
14721 lane_count
= width
/ GET_MODE_BITSIZE (info
.elt_mode
);
14723 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
14725 gcc_assert (info
.shift
== 0 && info
.insn
== simd_immediate_info::MOV
);
14726 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
14727 move immediate path. */
14728 if (aarch64_float_const_zero_rtx_p (info
.value
))
14729 info
.value
= GEN_INT (0);
14732 const unsigned int buf_size
= 20;
14733 char float_buf
[buf_size
] = {'\0'};
14734 real_to_decimal_for_mode (float_buf
,
14735 CONST_DOUBLE_REAL_VALUE (info
.value
),
14736 buf_size
, buf_size
, 1, info
.elt_mode
);
14738 if (lane_count
== 1)
14739 snprintf (templ
, sizeof (templ
), "fmov\t%%d0, %s", float_buf
);
14741 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%d%c, %s",
14742 lane_count
, element_char
, float_buf
);
14747 gcc_assert (CONST_INT_P (info
.value
));
14749 if (which
== AARCH64_CHECK_MOV
)
14751 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "mvni" : "movi";
14752 shift_op
= info
.modifier
== simd_immediate_info::MSL
? "msl" : "lsl";
14753 if (lane_count
== 1)
14754 snprintf (templ
, sizeof (templ
), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX
,
14755 mnemonic
, UINTVAL (info
.value
));
14756 else if (info
.shift
)
14757 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
14758 HOST_WIDE_INT_PRINT_HEX
", %s %d", mnemonic
, lane_count
,
14759 element_char
, UINTVAL (info
.value
), shift_op
, info
.shift
);
14761 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
14762 HOST_WIDE_INT_PRINT_HEX
, mnemonic
, lane_count
,
14763 element_char
, UINTVAL (info
.value
));
14767 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
14768 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "bic" : "orr";
14770 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
14771 HOST_WIDE_INT_PRINT_DEC
", %s #%d", mnemonic
, lane_count
,
14772 element_char
, UINTVAL (info
.value
), "lsl", info
.shift
);
14774 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
14775 HOST_WIDE_INT_PRINT_DEC
, mnemonic
, lane_count
,
14776 element_char
, UINTVAL (info
.value
));
14782 aarch64_output_scalar_simd_mov_immediate (rtx immediate
, scalar_int_mode mode
)
14785 /* If a floating point number was passed and we desire to use it in an
14786 integer mode do the conversion to integer. */
14787 if (CONST_DOUBLE_P (immediate
) && GET_MODE_CLASS (mode
) == MODE_INT
)
14789 unsigned HOST_WIDE_INT ival
;
14790 if (!aarch64_reinterpret_float_as_int (immediate
, &ival
))
14791 gcc_unreachable ();
14792 immediate
= gen_int_mode (ival
, mode
);
14795 machine_mode vmode
;
14796 /* use a 64 bit mode for everything except for DI/DF mode, where we use
14797 a 128 bit vector mode. */
14798 int width
= GET_MODE_BITSIZE (mode
) == 64 ? 128 : 64;
14800 vmode
= aarch64_simd_container_mode (mode
, width
);
14801 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (immediate
));
14802 return aarch64_output_simd_mov_immediate (v_op
, width
);
14805 /* Return the output string to use for moving immediate CONST_VECTOR
14806 into an SVE register. */
14809 aarch64_output_sve_mov_immediate (rtx const_vector
)
14811 static char templ
[40];
14812 struct simd_immediate_info info
;
14815 bool is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
);
14816 gcc_assert (is_valid
);
14818 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
14822 snprintf (templ
, sizeof (templ
), "index\t%%0.%c, #"
14823 HOST_WIDE_INT_PRINT_DEC
", #" HOST_WIDE_INT_PRINT_DEC
,
14824 element_char
, INTVAL (info
.value
), INTVAL (info
.step
));
14828 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
14830 if (aarch64_float_const_zero_rtx_p (info
.value
))
14831 info
.value
= GEN_INT (0);
14834 const int buf_size
= 20;
14835 char float_buf
[buf_size
] = {};
14836 real_to_decimal_for_mode (float_buf
,
14837 CONST_DOUBLE_REAL_VALUE (info
.value
),
14838 buf_size
, buf_size
, 1, info
.elt_mode
);
14840 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%c, #%s",
14841 element_char
, float_buf
);
14846 snprintf (templ
, sizeof (templ
), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC
,
14847 element_char
, INTVAL (info
.value
));
14851 /* Return the asm format for a PTRUE instruction whose destination has
14852 mode MODE. SUFFIX is the element size suffix. */
14855 aarch64_output_ptrue (machine_mode mode
, char suffix
)
14857 unsigned int nunits
;
14858 static char buf
[sizeof ("ptrue\t%0.N, vlNNNNN")];
14859 if (GET_MODE_NUNITS (mode
).is_constant (&nunits
))
14860 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, vl%d", suffix
, nunits
);
14862 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, all", suffix
);
14866 /* Split operands into moves from op[1] + op[2] into op[0]. */
14869 aarch64_split_combinev16qi (rtx operands
[3])
14871 unsigned int dest
= REGNO (operands
[0]);
14872 unsigned int src1
= REGNO (operands
[1]);
14873 unsigned int src2
= REGNO (operands
[2]);
14874 machine_mode halfmode
= GET_MODE (operands
[1]);
14875 unsigned int halfregs
= REG_NREGS (operands
[1]);
14876 rtx destlo
, desthi
;
14878 gcc_assert (halfmode
== V16QImode
);
14880 if (src1
== dest
&& src2
== dest
+ halfregs
)
14882 /* No-op move. Can't split to nothing; emit something. */
14883 emit_note (NOTE_INSN_DELETED
);
14887 /* Preserve register attributes for variable tracking. */
14888 destlo
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
, 0);
14889 desthi
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
+ halfregs
,
14890 GET_MODE_SIZE (halfmode
));
14892 /* Special case of reversed high/low parts. */
14893 if (reg_overlap_mentioned_p (operands
[2], destlo
)
14894 && reg_overlap_mentioned_p (operands
[1], desthi
))
14896 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
14897 emit_insn (gen_xorv16qi3 (operands
[2], operands
[1], operands
[2]));
14898 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
14900 else if (!reg_overlap_mentioned_p (operands
[2], destlo
))
14902 /* Try to avoid unnecessary moves if part of the result
14903 is in the right place already. */
14905 emit_move_insn (destlo
, operands
[1]);
14906 if (src2
!= dest
+ halfregs
)
14907 emit_move_insn (desthi
, operands
[2]);
14911 if (src2
!= dest
+ halfregs
)
14912 emit_move_insn (desthi
, operands
[2]);
14914 emit_move_insn (destlo
, operands
[1]);
14918 /* vec_perm support. */
14920 struct expand_vec_perm_d
14922 rtx target
, op0
, op1
;
14923 vec_perm_indices perm
;
14924 machine_mode vmode
;
14925 unsigned int vec_flags
;
14930 /* Generate a variable permutation. */
14933 aarch64_expand_vec_perm_1 (rtx target
, rtx op0
, rtx op1
, rtx sel
)
14935 machine_mode vmode
= GET_MODE (target
);
14936 bool one_vector_p
= rtx_equal_p (op0
, op1
);
14938 gcc_checking_assert (vmode
== V8QImode
|| vmode
== V16QImode
);
14939 gcc_checking_assert (GET_MODE (op0
) == vmode
);
14940 gcc_checking_assert (GET_MODE (op1
) == vmode
);
14941 gcc_checking_assert (GET_MODE (sel
) == vmode
);
14942 gcc_checking_assert (TARGET_SIMD
);
14946 if (vmode
== V8QImode
)
14948 /* Expand the argument to a V16QI mode by duplicating it. */
14949 rtx pair
= gen_reg_rtx (V16QImode
);
14950 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op0
));
14951 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
14955 emit_insn (gen_aarch64_tbl1v16qi (target
, op0
, sel
));
14962 if (vmode
== V8QImode
)
14964 pair
= gen_reg_rtx (V16QImode
);
14965 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op1
));
14966 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
14970 pair
= gen_reg_rtx (OImode
);
14971 emit_insn (gen_aarch64_combinev16qi (pair
, op0
, op1
));
14972 emit_insn (gen_aarch64_tbl2v16qi (target
, pair
, sel
));
14977 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
14978 NELT is the number of elements in the vector. */
14981 aarch64_expand_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
,
14984 machine_mode vmode
= GET_MODE (target
);
14985 bool one_vector_p
= rtx_equal_p (op0
, op1
);
14988 /* The TBL instruction does not use a modulo index, so we must take care
14989 of that ourselves. */
14990 mask
= aarch64_simd_gen_const_vector_dup (vmode
,
14991 one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
14992 sel
= expand_simple_binop (vmode
, AND
, sel
, mask
, NULL
, 0, OPTAB_LIB_WIDEN
);
14994 /* For big-endian, we also need to reverse the index within the vector
14995 (but not which vector). */
14996 if (BYTES_BIG_ENDIAN
)
14998 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
15000 mask
= aarch64_simd_gen_const_vector_dup (vmode
, nelt
- 1);
15001 sel
= expand_simple_binop (vmode
, XOR
, sel
, mask
,
15002 NULL
, 0, OPTAB_LIB_WIDEN
);
15004 aarch64_expand_vec_perm_1 (target
, op0
, op1
, sel
);
15007 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
15010 emit_unspec2 (rtx target
, int code
, rtx op0
, rtx op1
)
15012 emit_insn (gen_rtx_SET (target
,
15013 gen_rtx_UNSPEC (GET_MODE (target
),
15014 gen_rtvec (2, op0
, op1
), code
)));
15017 /* Expand an SVE vec_perm with the given operands. */
15020 aarch64_expand_sve_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
)
15022 machine_mode data_mode
= GET_MODE (target
);
15023 machine_mode sel_mode
= GET_MODE (sel
);
15024 /* Enforced by the pattern condition. */
15025 int nunits
= GET_MODE_NUNITS (sel_mode
).to_constant ();
15027 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15028 size of the two value vectors, i.e. the upper bits of the indices
15029 are effectively ignored. SVE TBL instead produces 0 for any
15030 out-of-range indices, so we need to modulo all the vec_perm indices
15031 to ensure they are all in range. */
15032 rtx sel_reg
= force_reg (sel_mode
, sel
);
15034 /* Check if the sel only references the first values vector. */
15035 if (GET_CODE (sel
) == CONST_VECTOR
15036 && aarch64_const_vec_all_in_range_p (sel
, 0, nunits
- 1))
15038 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_reg
);
15042 /* Check if the two values vectors are the same. */
15043 if (rtx_equal_p (op0
, op1
))
15045 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
, nunits
- 1);
15046 rtx sel_mod
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
15047 NULL
, 0, OPTAB_DIRECT
);
15048 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_mod
);
15052 /* Run TBL on for each value vector and combine the results. */
15054 rtx res0
= gen_reg_rtx (data_mode
);
15055 rtx res1
= gen_reg_rtx (data_mode
);
15056 rtx neg_num_elems
= aarch64_simd_gen_const_vector_dup (sel_mode
, -nunits
);
15057 if (GET_CODE (sel
) != CONST_VECTOR
15058 || !aarch64_const_vec_all_in_range_p (sel
, 0, 2 * nunits
- 1))
15060 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
,
15062 sel_reg
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
15063 NULL
, 0, OPTAB_DIRECT
);
15065 emit_unspec2 (res0
, UNSPEC_TBL
, op0
, sel_reg
);
15066 rtx sel_sub
= expand_simple_binop (sel_mode
, PLUS
, sel_reg
, neg_num_elems
,
15067 NULL
, 0, OPTAB_DIRECT
);
15068 emit_unspec2 (res1
, UNSPEC_TBL
, op1
, sel_sub
);
15069 if (GET_MODE_CLASS (data_mode
) == MODE_VECTOR_INT
)
15070 emit_insn (gen_rtx_SET (target
, gen_rtx_IOR (data_mode
, res0
, res1
)));
15072 emit_unspec2 (target
, UNSPEC_IORF
, res0
, res1
);
15075 /* Recognize patterns suitable for the TRN instructions. */
15077 aarch64_evpc_trn (struct expand_vec_perm_d
*d
)
15080 poly_uint64 nelt
= d
->perm
.length ();
15081 rtx out
, in0
, in1
, x
;
15082 machine_mode vmode
= d
->vmode
;
15084 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
15087 /* Note that these are little-endian tests.
15088 We correct for big-endian later. */
15089 if (!d
->perm
[0].is_constant (&odd
)
15090 || (odd
!= 0 && odd
!= 1)
15091 || !d
->perm
.series_p (0, 2, odd
, 2)
15092 || !d
->perm
.series_p (1, 2, nelt
+ odd
, 2))
15101 /* We don't need a big-endian lane correction for SVE; see the comment
15102 at the head of aarch64-sve.md for details. */
15103 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
15105 x
= in0
, in0
= in1
, in1
= x
;
15110 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
15111 odd
? UNSPEC_TRN2
: UNSPEC_TRN1
));
15115 /* Recognize patterns suitable for the UZP instructions. */
15117 aarch64_evpc_uzp (struct expand_vec_perm_d
*d
)
15120 rtx out
, in0
, in1
, x
;
15121 machine_mode vmode
= d
->vmode
;
15123 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
15126 /* Note that these are little-endian tests.
15127 We correct for big-endian later. */
15128 if (!d
->perm
[0].is_constant (&odd
)
15129 || (odd
!= 0 && odd
!= 1)
15130 || !d
->perm
.series_p (0, 1, odd
, 2))
15139 /* We don't need a big-endian lane correction for SVE; see the comment
15140 at the head of aarch64-sve.md for details. */
15141 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
15143 x
= in0
, in0
= in1
, in1
= x
;
15148 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
15149 odd
? UNSPEC_UZP2
: UNSPEC_UZP1
));
15153 /* Recognize patterns suitable for the ZIP instructions. */
15155 aarch64_evpc_zip (struct expand_vec_perm_d
*d
)
15158 poly_uint64 nelt
= d
->perm
.length ();
15159 rtx out
, in0
, in1
, x
;
15160 machine_mode vmode
= d
->vmode
;
15162 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
15165 /* Note that these are little-endian tests.
15166 We correct for big-endian later. */
15167 poly_uint64 first
= d
->perm
[0];
15168 if ((maybe_ne (first
, 0U) && maybe_ne (first
* 2, nelt
))
15169 || !d
->perm
.series_p (0, 2, first
, 1)
15170 || !d
->perm
.series_p (1, 2, first
+ nelt
, 1))
15172 high
= maybe_ne (first
, 0U);
15180 /* We don't need a big-endian lane correction for SVE; see the comment
15181 at the head of aarch64-sve.md for details. */
15182 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
15184 x
= in0
, in0
= in1
, in1
= x
;
15189 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
15190 high
? UNSPEC_ZIP2
: UNSPEC_ZIP1
));
15194 /* Recognize patterns for the EXT insn. */
15197 aarch64_evpc_ext (struct expand_vec_perm_d
*d
)
15199 HOST_WIDE_INT location
;
15202 /* The first element always refers to the first vector.
15203 Check if the extracted indices are increasing by one. */
15204 if (d
->vec_flags
== VEC_SVE_PRED
15205 || !d
->perm
[0].is_constant (&location
)
15206 || !d
->perm
.series_p (0, 1, location
, 1))
15213 /* The case where (location == 0) is a no-op for both big- and little-endian,
15214 and is removed by the mid-end at optimization levels -O1 and higher.
15216 We don't need a big-endian lane correction for SVE; see the comment
15217 at the head of aarch64-sve.md for details. */
15218 if (BYTES_BIG_ENDIAN
&& location
!= 0 && d
->vec_flags
== VEC_ADVSIMD
)
15220 /* After setup, we want the high elements of the first vector (stored
15221 at the LSB end of the register), and the low elements of the second
15222 vector (stored at the MSB end of the register). So swap. */
15223 std::swap (d
->op0
, d
->op1
);
15224 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15225 to_constant () is safe since this is restricted to Advanced SIMD
15227 location
= d
->perm
.length ().to_constant () - location
;
15230 offset
= GEN_INT (location
);
15231 emit_set_insn (d
->target
,
15232 gen_rtx_UNSPEC (d
->vmode
,
15233 gen_rtvec (3, d
->op0
, d
->op1
, offset
),
15238 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15239 within each 64-bit, 32-bit or 16-bit granule. */
15242 aarch64_evpc_rev_local (struct expand_vec_perm_d
*d
)
15244 HOST_WIDE_INT diff
;
15245 unsigned int i
, size
, unspec
;
15246 machine_mode pred_mode
;
15248 if (d
->vec_flags
== VEC_SVE_PRED
15249 || !d
->one_vector_p
15250 || !d
->perm
[0].is_constant (&diff
))
15253 size
= (diff
+ 1) * GET_MODE_UNIT_SIZE (d
->vmode
);
15256 unspec
= UNSPEC_REV64
;
15257 pred_mode
= VNx2BImode
;
15259 else if (size
== 4)
15261 unspec
= UNSPEC_REV32
;
15262 pred_mode
= VNx4BImode
;
15264 else if (size
== 2)
15266 unspec
= UNSPEC_REV16
;
15267 pred_mode
= VNx8BImode
;
15272 unsigned int step
= diff
+ 1;
15273 for (i
= 0; i
< step
; ++i
)
15274 if (!d
->perm
.series_p (i
, step
, diff
- i
, step
))
15281 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), unspec
);
15282 if (d
->vec_flags
== VEC_SVE_DATA
)
15284 rtx pred
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
15285 src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (2, pred
, src
),
15286 UNSPEC_MERGE_PTRUE
);
15288 emit_set_insn (d
->target
, src
);
15292 /* Recognize patterns for the REV insn, which reverses elements within
15296 aarch64_evpc_rev_global (struct expand_vec_perm_d
*d
)
15298 poly_uint64 nelt
= d
->perm
.length ();
15300 if (!d
->one_vector_p
|| d
->vec_flags
!= VEC_SVE_DATA
)
15303 if (!d
->perm
.series_p (0, 1, nelt
- 1, -1))
15310 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), UNSPEC_REV
);
15311 emit_set_insn (d
->target
, src
);
15316 aarch64_evpc_dup (struct expand_vec_perm_d
*d
)
15318 rtx out
= d
->target
;
15321 machine_mode vmode
= d
->vmode
;
15324 if (d
->vec_flags
== VEC_SVE_PRED
15325 || d
->perm
.encoding ().encoded_nelts () != 1
15326 || !d
->perm
[0].is_constant (&elt
))
15329 if (d
->vec_flags
== VEC_SVE_DATA
&& elt
>= 64 * GET_MODE_UNIT_SIZE (vmode
))
15336 /* The generic preparation in aarch64_expand_vec_perm_const_1
15337 swaps the operand order and the permute indices if it finds
15338 d->perm[0] to be in the second operand. Thus, we can always
15339 use d->op0 and need not do any extra arithmetic to get the
15340 correct lane number. */
15342 lane
= GEN_INT (elt
); /* The pattern corrects for big-endian. */
15344 rtx parallel
= gen_rtx_PARALLEL (vmode
, gen_rtvec (1, lane
));
15345 rtx select
= gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode
), in0
, parallel
);
15346 emit_set_insn (out
, gen_rtx_VEC_DUPLICATE (vmode
, select
));
15351 aarch64_evpc_tbl (struct expand_vec_perm_d
*d
)
15353 rtx rperm
[MAX_COMPILE_TIME_VEC_BYTES
], sel
;
15354 machine_mode vmode
= d
->vmode
;
15356 /* Make sure that the indices are constant. */
15357 unsigned int encoded_nelts
= d
->perm
.encoding ().encoded_nelts ();
15358 for (unsigned int i
= 0; i
< encoded_nelts
; ++i
)
15359 if (!d
->perm
[i
].is_constant ())
15365 /* Generic code will try constant permutation twice. Once with the
15366 original mode and again with the elements lowered to QImode.
15367 So wait and don't do the selector expansion ourselves. */
15368 if (vmode
!= V8QImode
&& vmode
!= V16QImode
)
15371 /* to_constant is safe since this routine is specific to Advanced SIMD
15373 unsigned int nelt
= d
->perm
.length ().to_constant ();
15374 for (unsigned int i
= 0; i
< nelt
; ++i
)
15375 /* If big-endian and two vectors we end up with a weird mixed-endian
15376 mode on NEON. Reverse the index within each word but not the word
15377 itself. to_constant is safe because we checked is_constant above. */
15378 rperm
[i
] = GEN_INT (BYTES_BIG_ENDIAN
15379 ? d
->perm
[i
].to_constant () ^ (nelt
- 1)
15380 : d
->perm
[i
].to_constant ());
15382 sel
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
15383 sel
= force_reg (vmode
, sel
);
15385 aarch64_expand_vec_perm_1 (d
->target
, d
->op0
, d
->op1
, sel
);
15389 /* Try to implement D using an SVE TBL instruction. */
15392 aarch64_evpc_sve_tbl (struct expand_vec_perm_d
*d
)
15394 unsigned HOST_WIDE_INT nelt
;
15396 /* Permuting two variable-length vectors could overflow the
15398 if (!d
->one_vector_p
&& !d
->perm
.length ().is_constant (&nelt
))
15404 machine_mode sel_mode
= mode_for_int_vector (d
->vmode
).require ();
15405 rtx sel
= vec_perm_indices_to_rtx (sel_mode
, d
->perm
);
15406 aarch64_expand_sve_vec_perm (d
->target
, d
->op0
, d
->op1
, sel
);
15411 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d
*d
)
15413 /* The pattern matching functions above are written to look for a small
15414 number to begin the sequence (0, 1, N/2). If we begin with an index
15415 from the second operand, we can swap the operands. */
15416 poly_int64 nelt
= d
->perm
.length ();
15417 if (known_ge (d
->perm
[0], nelt
))
15419 d
->perm
.rotate_inputs (1);
15420 std::swap (d
->op0
, d
->op1
);
15423 if ((d
->vec_flags
== VEC_ADVSIMD
15424 || d
->vec_flags
== VEC_SVE_DATA
15425 || d
->vec_flags
== VEC_SVE_PRED
)
15426 && known_gt (nelt
, 1))
15428 if (aarch64_evpc_rev_local (d
))
15430 else if (aarch64_evpc_rev_global (d
))
15432 else if (aarch64_evpc_ext (d
))
15434 else if (aarch64_evpc_dup (d
))
15436 else if (aarch64_evpc_zip (d
))
15438 else if (aarch64_evpc_uzp (d
))
15440 else if (aarch64_evpc_trn (d
))
15442 if (d
->vec_flags
== VEC_SVE_DATA
)
15443 return aarch64_evpc_sve_tbl (d
);
15444 else if (d
->vec_flags
== VEC_SVE_DATA
)
15445 return aarch64_evpc_tbl (d
);
15450 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
15453 aarch64_vectorize_vec_perm_const (machine_mode vmode
, rtx target
, rtx op0
,
15454 rtx op1
, const vec_perm_indices
&sel
)
15456 struct expand_vec_perm_d d
;
15458 /* Check whether the mask can be applied to a single vector. */
15459 if (op0
&& rtx_equal_p (op0
, op1
))
15460 d
.one_vector_p
= true;
15461 else if (sel
.all_from_input_p (0))
15463 d
.one_vector_p
= true;
15466 else if (sel
.all_from_input_p (1))
15468 d
.one_vector_p
= true;
15472 d
.one_vector_p
= false;
15474 d
.perm
.new_vector (sel
.encoding (), d
.one_vector_p
? 1 : 2,
15475 sel
.nelts_per_input ());
15477 d
.vec_flags
= aarch64_classify_vector_mode (d
.vmode
);
15481 d
.testing_p
= !target
;
15484 return aarch64_expand_vec_perm_const_1 (&d
);
15486 rtx_insn
*last
= get_last_insn ();
15487 bool ret
= aarch64_expand_vec_perm_const_1 (&d
);
15488 gcc_assert (last
== get_last_insn ());
15493 /* Generate a byte permute mask for a register of mode MODE,
15494 which has NUNITS units. */
15497 aarch64_reverse_mask (machine_mode mode
, unsigned int nunits
)
15499 /* We have to reverse each vector because we dont have
15500 a permuted load that can reverse-load according to ABI rules. */
15502 rtvec v
= rtvec_alloc (16);
15504 unsigned int usize
= GET_MODE_UNIT_SIZE (mode
);
15506 gcc_assert (BYTES_BIG_ENDIAN
);
15507 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode
));
15509 for (i
= 0; i
< nunits
; i
++)
15510 for (j
= 0; j
< usize
; j
++)
15511 RTVEC_ELT (v
, i
* usize
+ j
) = GEN_INT ((i
+ 1) * usize
- 1 - j
);
15512 mask
= gen_rtx_CONST_VECTOR (V16QImode
, v
);
15513 return force_reg (V16QImode
, mask
);
15516 /* Return true if X is a valid second operand for the SVE instruction
15517 that implements integer comparison OP_CODE. */
15520 aarch64_sve_cmp_operand_p (rtx_code op_code
, rtx x
)
15522 if (register_operand (x
, VOIDmode
))
15531 return aarch64_sve_cmp_immediate_p (x
, false);
15538 return aarch64_sve_cmp_immediate_p (x
, true);
15540 gcc_unreachable ();
15544 /* Use predicated SVE instructions to implement the equivalent of:
15548 given that PTRUE is an all-true predicate of the appropriate mode. */
15551 aarch64_emit_sve_ptrue_op (rtx target
, rtx ptrue
, rtx op
)
15553 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (target
),
15554 gen_rtvec (2, ptrue
, op
),
15555 UNSPEC_MERGE_PTRUE
);
15556 rtx_insn
*insn
= emit_set_insn (target
, unspec
);
15557 set_unique_reg_note (insn
, REG_EQUAL
, copy_rtx (op
));
15560 /* Likewise, but also clobber the condition codes. */
15563 aarch64_emit_sve_ptrue_op_cc (rtx target
, rtx ptrue
, rtx op
)
15565 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (target
),
15566 gen_rtvec (2, ptrue
, op
),
15567 UNSPEC_MERGE_PTRUE
);
15568 rtx_insn
*insn
= emit_insn (gen_set_clobber_cc (target
, unspec
));
15569 set_unique_reg_note (insn
, REG_EQUAL
, copy_rtx (op
));
15572 /* Return the UNSPEC_COND_* code for comparison CODE. */
15574 static unsigned int
15575 aarch64_unspec_cond_code (rtx_code code
)
15580 return UNSPEC_COND_NE
;
15582 return UNSPEC_COND_EQ
;
15584 return UNSPEC_COND_LT
;
15586 return UNSPEC_COND_GT
;
15588 return UNSPEC_COND_LE
;
15590 return UNSPEC_COND_GE
;
15592 gcc_unreachable ();
15598 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
15600 where <X> is the operation associated with comparison CODE. This form
15601 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
15602 semantics, such as when PRED might not be all-true and when comparing
15603 inactive lanes could have side effects. */
15606 aarch64_emit_sve_predicated_cond (rtx target
, rtx_code code
,
15607 rtx pred
, rtx op0
, rtx op1
)
15609 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (pred
),
15610 gen_rtvec (3, pred
, op0
, op1
),
15611 aarch64_unspec_cond_code (code
));
15612 emit_set_insn (target
, unspec
);
15615 /* Expand an SVE integer comparison using the SVE equivalent of:
15617 (set TARGET (CODE OP0 OP1)). */
15620 aarch64_expand_sve_vec_cmp_int (rtx target
, rtx_code code
, rtx op0
, rtx op1
)
15622 machine_mode pred_mode
= GET_MODE (target
);
15623 machine_mode data_mode
= GET_MODE (op0
);
15625 if (!aarch64_sve_cmp_operand_p (code
, op1
))
15626 op1
= force_reg (data_mode
, op1
);
15628 rtx ptrue
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
15629 rtx cond
= gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
);
15630 aarch64_emit_sve_ptrue_op_cc (target
, ptrue
, cond
);
15633 /* Emit the SVE equivalent of:
15635 (set TMP1 (CODE1 OP0 OP1))
15636 (set TMP2 (CODE2 OP0 OP1))
15637 (set TARGET (ior:PRED_MODE TMP1 TMP2))
15639 PTRUE is an all-true predicate with the same mode as TARGET. */
15642 aarch64_emit_sve_or_conds (rtx target
, rtx_code code1
, rtx_code code2
,
15643 rtx ptrue
, rtx op0
, rtx op1
)
15645 machine_mode pred_mode
= GET_MODE (ptrue
);
15646 rtx tmp1
= gen_reg_rtx (pred_mode
);
15647 aarch64_emit_sve_ptrue_op (tmp1
, ptrue
,
15648 gen_rtx_fmt_ee (code1
, pred_mode
, op0
, op1
));
15649 rtx tmp2
= gen_reg_rtx (pred_mode
);
15650 aarch64_emit_sve_ptrue_op (tmp2
, ptrue
,
15651 gen_rtx_fmt_ee (code2
, pred_mode
, op0
, op1
));
15652 aarch64_emit_binop (target
, ior_optab
, tmp1
, tmp2
);
15655 /* Emit the SVE equivalent of:
15657 (set TMP (CODE OP0 OP1))
15658 (set TARGET (not TMP))
15660 PTRUE is an all-true predicate with the same mode as TARGET. */
15663 aarch64_emit_sve_inverted_cond (rtx target
, rtx ptrue
, rtx_code code
,
15666 machine_mode pred_mode
= GET_MODE (ptrue
);
15667 rtx tmp
= gen_reg_rtx (pred_mode
);
15668 aarch64_emit_sve_ptrue_op (tmp
, ptrue
,
15669 gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
));
15670 aarch64_emit_unop (target
, one_cmpl_optab
, tmp
);
15673 /* Expand an SVE floating-point comparison using the SVE equivalent of:
15675 (set TARGET (CODE OP0 OP1))
15677 If CAN_INVERT_P is true, the caller can also handle inverted results;
15678 return true if the result is in fact inverted. */
15681 aarch64_expand_sve_vec_cmp_float (rtx target
, rtx_code code
,
15682 rtx op0
, rtx op1
, bool can_invert_p
)
15684 machine_mode pred_mode
= GET_MODE (target
);
15685 machine_mode data_mode
= GET_MODE (op0
);
15687 rtx ptrue
= force_reg (pred_mode
, CONSTM1_RTX (pred_mode
));
15691 /* UNORDERED has no immediate form. */
15692 op1
= force_reg (data_mode
, op1
);
15701 /* There is native support for the comparison. */
15702 rtx cond
= gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
);
15703 aarch64_emit_sve_ptrue_op (target
, ptrue
, cond
);
15708 /* This is a trapping operation (LT or GT). */
15709 aarch64_emit_sve_or_conds (target
, LT
, GT
, ptrue
, op0
, op1
);
15713 if (!flag_trapping_math
)
15715 /* This would trap for signaling NaNs. */
15716 op1
= force_reg (data_mode
, op1
);
15717 aarch64_emit_sve_or_conds (target
, UNORDERED
, EQ
, ptrue
, op0
, op1
);
15725 if (flag_trapping_math
)
15727 /* Work out which elements are ordered. */
15728 rtx ordered
= gen_reg_rtx (pred_mode
);
15729 op1
= force_reg (data_mode
, op1
);
15730 aarch64_emit_sve_inverted_cond (ordered
, ptrue
, UNORDERED
, op0
, op1
);
15732 /* Test the opposite condition for the ordered elements,
15733 then invert the result. */
15737 code
= reverse_condition_maybe_unordered (code
);
15740 aarch64_emit_sve_predicated_cond (target
, code
,
15741 ordered
, op0
, op1
);
15744 rtx tmp
= gen_reg_rtx (pred_mode
);
15745 aarch64_emit_sve_predicated_cond (tmp
, code
, ordered
, op0
, op1
);
15746 aarch64_emit_unop (target
, one_cmpl_optab
, tmp
);
15752 /* ORDERED has no immediate form. */
15753 op1
= force_reg (data_mode
, op1
);
15757 gcc_unreachable ();
15760 /* There is native support for the inverse comparison. */
15761 code
= reverse_condition_maybe_unordered (code
);
15764 rtx cond
= gen_rtx_fmt_ee (code
, pred_mode
, op0
, op1
);
15765 aarch64_emit_sve_ptrue_op (target
, ptrue
, cond
);
15768 aarch64_emit_sve_inverted_cond (target
, ptrue
, code
, op0
, op1
);
15772 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
15773 of the data being selected and CMP_MODE is the mode of the values being
15777 aarch64_expand_sve_vcond (machine_mode data_mode
, machine_mode cmp_mode
,
15780 machine_mode pred_mode
15781 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode
),
15782 GET_MODE_SIZE (cmp_mode
)).require ();
15783 rtx pred
= gen_reg_rtx (pred_mode
);
15784 if (FLOAT_MODE_P (cmp_mode
))
15786 if (aarch64_expand_sve_vec_cmp_float (pred
, GET_CODE (ops
[3]),
15787 ops
[4], ops
[5], true))
15788 std::swap (ops
[1], ops
[2]);
15791 aarch64_expand_sve_vec_cmp_int (pred
, GET_CODE (ops
[3]), ops
[4], ops
[5]);
15793 rtvec vec
= gen_rtvec (3, pred
, ops
[1], ops
[2]);
15794 emit_set_insn (ops
[0], gen_rtx_UNSPEC (data_mode
, vec
, UNSPEC_SEL
));
15797 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
15798 true. However due to issues with register allocation it is preferable
15799 to avoid tieing integer scalar and FP scalar modes. Executing integer
15800 operations in general registers is better than treating them as scalar
15801 vector operations. This reduces latency and avoids redundant int<->FP
15802 moves. So tie modes if they are either the same class, or vector modes
15803 with other vector modes, vector structs or any scalar mode. */
15806 aarch64_modes_tieable_p (machine_mode mode1
, machine_mode mode2
)
15808 if (GET_MODE_CLASS (mode1
) == GET_MODE_CLASS (mode2
))
15811 /* We specifically want to allow elements of "structure" modes to
15812 be tieable to the structure. This more general condition allows
15813 other rarer situations too. The reason we don't extend this to
15814 predicate modes is that there are no predicate structure modes
15815 nor any specific instructions for extracting part of a predicate
15817 if (aarch64_vector_data_mode_p (mode1
)
15818 && aarch64_vector_data_mode_p (mode2
))
15821 /* Also allow any scalar modes with vectors. */
15822 if (aarch64_vector_mode_supported_p (mode1
)
15823 || aarch64_vector_mode_supported_p (mode2
))
15829 /* Return a new RTX holding the result of moving POINTER forward by
15833 aarch64_move_pointer (rtx pointer
, poly_int64 amount
)
15835 rtx next
= plus_constant (Pmode
, XEXP (pointer
, 0), amount
);
15837 return adjust_automodify_address (pointer
, GET_MODE (pointer
),
15841 /* Return a new RTX holding the result of moving POINTER forward by the
15842 size of the mode it points to. */
15845 aarch64_progress_pointer (rtx pointer
)
15847 return aarch64_move_pointer (pointer
, GET_MODE_SIZE (GET_MODE (pointer
)));
15850 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
15854 aarch64_copy_one_block_and_progress_pointers (rtx
*src
, rtx
*dst
,
15857 rtx reg
= gen_reg_rtx (mode
);
15859 /* "Cast" the pointers to the correct mode. */
15860 *src
= adjust_address (*src
, mode
, 0);
15861 *dst
= adjust_address (*dst
, mode
, 0);
15862 /* Emit the memcpy. */
15863 emit_move_insn (reg
, *src
);
15864 emit_move_insn (*dst
, reg
);
15865 /* Move the pointers forward. */
15866 *src
= aarch64_progress_pointer (*src
);
15867 *dst
= aarch64_progress_pointer (*dst
);
15870 /* Expand movmem, as if from a __builtin_memcpy. Return true if
15871 we succeed, otherwise return false. */
15874 aarch64_expand_movmem (rtx
*operands
)
15877 rtx dst
= operands
[0];
15878 rtx src
= operands
[1];
15880 machine_mode cur_mode
= BLKmode
, next_mode
;
15881 bool speed_p
= !optimize_function_for_size_p (cfun
);
15883 /* When optimizing for size, give a better estimate of the length of a
15884 memcpy call, but use the default otherwise. Moves larger than 8 bytes
15885 will always require an even number of instructions to do now. And each
15886 operation requires both a load+store, so devide the max number by 2. */
15887 int max_num_moves
= (speed_p
? 16 : AARCH64_CALL_RATIO
) / 2;
15889 /* We can't do anything smart if the amount to copy is not constant. */
15890 if (!CONST_INT_P (operands
[2]))
15893 n
= INTVAL (operands
[2]);
15895 /* Try to keep the number of instructions low. For all cases we will do at
15896 most two moves for the residual amount, since we'll always overlap the
15898 if (((n
/ 16) + (n
% 16 ? 2 : 0)) > max_num_moves
)
15901 base
= copy_to_mode_reg (Pmode
, XEXP (dst
, 0));
15902 dst
= adjust_automodify_address (dst
, VOIDmode
, base
, 0);
15904 base
= copy_to_mode_reg (Pmode
, XEXP (src
, 0));
15905 src
= adjust_automodify_address (src
, VOIDmode
, base
, 0);
15907 /* Convert n to bits to make the rest of the code simpler. */
15908 n
= n
* BITS_PER_UNIT
;
15912 /* Find the largest mode in which to do the copy in without over reading
15914 opt_scalar_int_mode mode_iter
;
15915 FOR_EACH_MODE_IN_CLASS (mode_iter
, MODE_INT
)
15916 if (GET_MODE_BITSIZE (mode_iter
.require ()) <= n
)
15917 cur_mode
= mode_iter
.require ();
15919 gcc_assert (cur_mode
!= BLKmode
);
15921 mode_bits
= GET_MODE_BITSIZE (cur_mode
).to_constant ();
15922 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, cur_mode
);
15926 /* Do certain trailing copies as overlapping if it's going to be
15927 cheaper. i.e. less instructions to do so. For instance doing a 15
15928 byte copy it's more efficient to do two overlapping 8 byte copies than
15930 next_mode
= smallest_mode_for_size (n
, MODE_INT
);
15931 int n_bits
= GET_MODE_BITSIZE (next_mode
).to_constant ();
15932 if (n
> 0 && n_bits
> n
&& n_bits
<= 8 * BITS_PER_UNIT
)
15934 src
= aarch64_move_pointer (src
, (n
- n_bits
) / BITS_PER_UNIT
);
15935 dst
= aarch64_move_pointer (dst
, (n
- n_bits
) / BITS_PER_UNIT
);
15943 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
15944 SImode stores. Handle the case when the constant has identical
15945 bottom and top halves. This is beneficial when the two stores can be
15946 merged into an STP and we avoid synthesising potentially expensive
15947 immediates twice. Return true if such a split is possible. */
15950 aarch64_split_dimode_const_store (rtx dst
, rtx src
)
15952 rtx lo
= gen_lowpart (SImode
, src
);
15953 rtx hi
= gen_highpart_mode (SImode
, DImode
, src
);
15955 bool size_p
= optimize_function_for_size_p (cfun
);
15957 if (!rtx_equal_p (lo
, hi
))
15960 unsigned int orig_cost
15961 = aarch64_internal_mov_immediate (NULL_RTX
, src
, false, DImode
);
15962 unsigned int lo_cost
15963 = aarch64_internal_mov_immediate (NULL_RTX
, lo
, false, SImode
);
15965 /* We want to transform:
15967 MOVK x1, 0x140, lsl 16
15968 MOVK x1, 0xc0da, lsl 32
15969 MOVK x1, 0x140, lsl 48
15973 MOVK w1, 0x140, lsl 16
15975 So we want to perform this only when we save two instructions
15976 or more. When optimizing for size, however, accept any code size
15978 if (size_p
&& orig_cost
<= lo_cost
)
15982 && (orig_cost
<= lo_cost
+ 1))
15985 rtx mem_lo
= adjust_address (dst
, SImode
, 0);
15986 if (!aarch64_mem_pair_operand (mem_lo
, SImode
))
15989 rtx tmp_reg
= gen_reg_rtx (SImode
);
15990 aarch64_expand_mov_immediate (tmp_reg
, lo
);
15991 rtx mem_hi
= aarch64_move_pointer (mem_lo
, GET_MODE_SIZE (SImode
));
15992 /* Don't emit an explicit store pair as this may not be always profitable.
15993 Let the sched-fusion logic decide whether to merge them. */
15994 emit_move_insn (mem_lo
, tmp_reg
);
15995 emit_move_insn (mem_hi
, tmp_reg
);
16000 /* Generate RTL for a conditional branch with rtx comparison CODE in
16001 mode CC_MODE. The destination of the unlikely conditional branch
16005 aarch64_gen_unlikely_cbranch (enum rtx_code code
, machine_mode cc_mode
,
16009 x
= gen_rtx_fmt_ee (code
, VOIDmode
,
16010 gen_rtx_REG (cc_mode
, CC_REGNUM
),
16013 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
16014 gen_rtx_LABEL_REF (VOIDmode
, label_ref
),
16016 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
16019 /* Generate DImode scratch registers for 128-bit (TImode) addition.
16021 OP1 represents the TImode destination operand 1
16022 OP2 represents the TImode destination operand 2
16023 LOW_DEST represents the low half (DImode) of TImode operand 0
16024 LOW_IN1 represents the low half (DImode) of TImode operand 1
16025 LOW_IN2 represents the low half (DImode) of TImode operand 2
16026 HIGH_DEST represents the high half (DImode) of TImode operand 0
16027 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16028 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16031 aarch64_addti_scratch_regs (rtx op1
, rtx op2
, rtx
*low_dest
,
16032 rtx
*low_in1
, rtx
*low_in2
,
16033 rtx
*high_dest
, rtx
*high_in1
,
16036 *low_dest
= gen_reg_rtx (DImode
);
16037 *low_in1
= gen_lowpart (DImode
, op1
);
16038 *low_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
16039 subreg_lowpart_offset (DImode
, TImode
));
16040 *high_dest
= gen_reg_rtx (DImode
);
16041 *high_in1
= gen_highpart (DImode
, op1
);
16042 *high_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
16043 subreg_highpart_offset (DImode
, TImode
));
16046 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
16048 This function differs from 'arch64_addti_scratch_regs' in that
16049 OP1 can be an immediate constant (zero). We must call
16050 subreg_highpart_offset with DImode and TImode arguments, otherwise
16051 VOIDmode will be used for the const_int which generates an internal
16052 error from subreg_size_highpart_offset which does not expect a size of zero.
16054 OP1 represents the TImode destination operand 1
16055 OP2 represents the TImode destination operand 2
16056 LOW_DEST represents the low half (DImode) of TImode operand 0
16057 LOW_IN1 represents the low half (DImode) of TImode operand 1
16058 LOW_IN2 represents the low half (DImode) of TImode operand 2
16059 HIGH_DEST represents the high half (DImode) of TImode operand 0
16060 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16061 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16065 aarch64_subvti_scratch_regs (rtx op1
, rtx op2
, rtx
*low_dest
,
16066 rtx
*low_in1
, rtx
*low_in2
,
16067 rtx
*high_dest
, rtx
*high_in1
,
16070 *low_dest
= gen_reg_rtx (DImode
);
16071 *low_in1
= simplify_gen_subreg (DImode
, op1
, TImode
,
16072 subreg_lowpart_offset (DImode
, TImode
));
16074 *low_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
16075 subreg_lowpart_offset (DImode
, TImode
));
16076 *high_dest
= gen_reg_rtx (DImode
);
16078 *high_in1
= simplify_gen_subreg (DImode
, op1
, TImode
,
16079 subreg_highpart_offset (DImode
, TImode
));
16080 *high_in2
= simplify_gen_subreg (DImode
, op2
, TImode
,
16081 subreg_highpart_offset (DImode
, TImode
));
16084 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
16086 OP0 represents the TImode destination operand 0
16087 LOW_DEST represents the low half (DImode) of TImode operand 0
16088 LOW_IN1 represents the low half (DImode) of TImode operand 1
16089 LOW_IN2 represents the low half (DImode) of TImode operand 2
16090 HIGH_DEST represents the high half (DImode) of TImode operand 0
16091 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16092 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16095 aarch64_expand_subvti (rtx op0
, rtx low_dest
, rtx low_in1
,
16096 rtx low_in2
, rtx high_dest
, rtx high_in1
,
16099 if (low_in2
== const0_rtx
)
16101 low_dest
= low_in1
;
16102 emit_insn (gen_subdi3_compare1 (high_dest
, high_in1
,
16103 force_reg (DImode
, high_in2
)));
16107 if (CONST_INT_P (low_in2
))
16109 low_in2
= force_reg (DImode
, GEN_INT (-UINTVAL (low_in2
)));
16110 high_in2
= force_reg (DImode
, high_in2
);
16111 emit_insn (gen_adddi3_compareC (low_dest
, low_in1
, low_in2
));
16114 emit_insn (gen_subdi3_compare1 (low_dest
, low_in1
, low_in2
));
16115 emit_insn (gen_subdi3_carryinCV (high_dest
,
16116 force_reg (DImode
, high_in1
),
16120 emit_move_insn (gen_lowpart (DImode
, op0
), low_dest
);
16121 emit_move_insn (gen_highpart (DImode
, op0
), high_dest
);
16125 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
16127 static unsigned HOST_WIDE_INT
16128 aarch64_asan_shadow_offset (void)
16130 return (HOST_WIDE_INT_1
<< 36);
16134 aarch64_gen_ccmp_first (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
,
16135 int code
, tree treeop0
, tree treeop1
)
16137 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
16139 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
16141 struct expand_operand ops
[4];
16144 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
16146 op_mode
= GET_MODE (op0
);
16147 if (op_mode
== VOIDmode
)
16148 op_mode
= GET_MODE (op1
);
16156 icode
= CODE_FOR_cmpsi
;
16161 icode
= CODE_FOR_cmpdi
;
16166 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
16167 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpesf
: CODE_FOR_fcmpsf
;
16172 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
16173 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpedf
: CODE_FOR_fcmpdf
;
16181 op0
= prepare_operand (icode
, op0
, 0, op_mode
, cmp_mode
, unsignedp
);
16182 op1
= prepare_operand (icode
, op1
, 1, op_mode
, cmp_mode
, unsignedp
);
16188 *prep_seq
= get_insns ();
16191 create_fixed_operand (&ops
[0], op0
);
16192 create_fixed_operand (&ops
[1], op1
);
16195 if (!maybe_expand_insn (icode
, 2, ops
))
16200 *gen_seq
= get_insns ();
16203 return gen_rtx_fmt_ee ((rtx_code
) code
, cc_mode
,
16204 gen_rtx_REG (cc_mode
, CC_REGNUM
), const0_rtx
);
16208 aarch64_gen_ccmp_next (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
, rtx prev
,
16209 int cmp_code
, tree treeop0
, tree treeop1
, int bit_code
)
16211 rtx op0
, op1
, target
;
16212 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
16213 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
16215 struct expand_operand ops
[6];
16218 push_to_sequence (*prep_seq
);
16219 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
16221 op_mode
= GET_MODE (op0
);
16222 if (op_mode
== VOIDmode
)
16223 op_mode
= GET_MODE (op1
);
16231 icode
= CODE_FOR_ccmpsi
;
16236 icode
= CODE_FOR_ccmpdi
;
16241 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
16242 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpesf
: CODE_FOR_fccmpsf
;
16247 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
16248 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fccmpedf
: CODE_FOR_fccmpdf
;
16256 op0
= prepare_operand (icode
, op0
, 2, op_mode
, cmp_mode
, unsignedp
);
16257 op1
= prepare_operand (icode
, op1
, 3, op_mode
, cmp_mode
, unsignedp
);
16263 *prep_seq
= get_insns ();
16266 target
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
16267 aarch64_cond
= aarch64_get_condition_code_1 (cc_mode
, (rtx_code
) cmp_code
);
16269 if (bit_code
!= AND
)
16271 prev
= gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev
),
16272 GET_MODE (XEXP (prev
, 0))),
16273 VOIDmode
, XEXP (prev
, 0), const0_rtx
);
16274 aarch64_cond
= AARCH64_INVERSE_CONDITION_CODE (aarch64_cond
);
16277 create_fixed_operand (&ops
[0], XEXP (prev
, 0));
16278 create_fixed_operand (&ops
[1], target
);
16279 create_fixed_operand (&ops
[2], op0
);
16280 create_fixed_operand (&ops
[3], op1
);
16281 create_fixed_operand (&ops
[4], prev
);
16282 create_fixed_operand (&ops
[5], GEN_INT (aarch64_cond
));
16284 push_to_sequence (*gen_seq
);
16285 if (!maybe_expand_insn (icode
, 6, ops
))
16291 *gen_seq
= get_insns ();
16294 return gen_rtx_fmt_ee ((rtx_code
) cmp_code
, VOIDmode
, target
, const0_rtx
);
16297 #undef TARGET_GEN_CCMP_FIRST
16298 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16300 #undef TARGET_GEN_CCMP_NEXT
16301 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16303 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
16304 instruction fusion of some sort. */
16307 aarch64_macro_fusion_p (void)
16309 return aarch64_tune_params
.fusible_ops
!= AARCH64_FUSE_NOTHING
;
16313 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
16314 should be kept together during scheduling. */
16317 aarch_macro_fusion_pair_p (rtx_insn
*prev
, rtx_insn
*curr
)
16320 rtx prev_set
= single_set (prev
);
16321 rtx curr_set
= single_set (curr
);
16322 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
16323 bool simple_sets_p
= prev_set
&& curr_set
&& !any_condjump_p (curr
);
16325 if (!aarch64_macro_fusion_p ())
16328 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK
))
16330 /* We are trying to match:
16331 prev (mov) == (set (reg r0) (const_int imm16))
16332 curr (movk) == (set (zero_extract (reg r0)
16335 (const_int imm16_1)) */
16337 set_dest
= SET_DEST (curr_set
);
16339 if (GET_CODE (set_dest
) == ZERO_EXTRACT
16340 && CONST_INT_P (SET_SRC (curr_set
))
16341 && CONST_INT_P (SET_SRC (prev_set
))
16342 && CONST_INT_P (XEXP (set_dest
, 2))
16343 && INTVAL (XEXP (set_dest
, 2)) == 16
16344 && REG_P (XEXP (set_dest
, 0))
16345 && REG_P (SET_DEST (prev_set
))
16346 && REGNO (XEXP (set_dest
, 0)) == REGNO (SET_DEST (prev_set
)))
16352 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD
))
16355 /* We're trying to match:
16356 prev (adrp) == (set (reg r1)
16357 (high (symbol_ref ("SYM"))))
16358 curr (add) == (set (reg r0)
16360 (symbol_ref ("SYM"))))
16361 Note that r0 need not necessarily be the same as r1, especially
16362 during pre-regalloc scheduling. */
16364 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
16365 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
16367 if (GET_CODE (SET_SRC (curr_set
)) == LO_SUM
16368 && REG_P (XEXP (SET_SRC (curr_set
), 0))
16369 && REGNO (XEXP (SET_SRC (curr_set
), 0))
16370 == REGNO (SET_DEST (prev_set
))
16371 && rtx_equal_p (XEXP (SET_SRC (prev_set
), 0),
16372 XEXP (SET_SRC (curr_set
), 1)))
16377 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK
))
16380 /* We're trying to match:
16381 prev (movk) == (set (zero_extract (reg r0)
16384 (const_int imm16_1))
16385 curr (movk) == (set (zero_extract (reg r0)
16388 (const_int imm16_2)) */
16390 if (GET_CODE (SET_DEST (prev_set
)) == ZERO_EXTRACT
16391 && GET_CODE (SET_DEST (curr_set
)) == ZERO_EXTRACT
16392 && REG_P (XEXP (SET_DEST (prev_set
), 0))
16393 && REG_P (XEXP (SET_DEST (curr_set
), 0))
16394 && REGNO (XEXP (SET_DEST (prev_set
), 0))
16395 == REGNO (XEXP (SET_DEST (curr_set
), 0))
16396 && CONST_INT_P (XEXP (SET_DEST (prev_set
), 2))
16397 && CONST_INT_P (XEXP (SET_DEST (curr_set
), 2))
16398 && INTVAL (XEXP (SET_DEST (prev_set
), 2)) == 32
16399 && INTVAL (XEXP (SET_DEST (curr_set
), 2)) == 48
16400 && CONST_INT_P (SET_SRC (prev_set
))
16401 && CONST_INT_P (SET_SRC (curr_set
)))
16405 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR
))
16407 /* We're trying to match:
16408 prev (adrp) == (set (reg r0)
16409 (high (symbol_ref ("SYM"))))
16410 curr (ldr) == (set (reg r1)
16411 (mem (lo_sum (reg r0)
16412 (symbol_ref ("SYM")))))
16414 curr (ldr) == (set (reg r1)
16417 (symbol_ref ("SYM")))))) */
16418 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
16419 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
16421 rtx curr_src
= SET_SRC (curr_set
);
16423 if (GET_CODE (curr_src
) == ZERO_EXTEND
)
16424 curr_src
= XEXP (curr_src
, 0);
16426 if (MEM_P (curr_src
) && GET_CODE (XEXP (curr_src
, 0)) == LO_SUM
16427 && REG_P (XEXP (XEXP (curr_src
, 0), 0))
16428 && REGNO (XEXP (XEXP (curr_src
, 0), 0))
16429 == REGNO (SET_DEST (prev_set
))
16430 && rtx_equal_p (XEXP (XEXP (curr_src
, 0), 1),
16431 XEXP (SET_SRC (prev_set
), 0)))
16436 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC
)
16437 && aarch_crypto_can_dual_issue (prev
, curr
))
16440 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH
)
16441 && any_condjump_p (curr
))
16443 enum attr_type prev_type
= get_attr_type (prev
);
16445 unsigned int condreg1
, condreg2
;
16447 aarch64_fixed_condition_code_regs (&condreg1
, &condreg2
);
16448 cc_reg_1
= gen_rtx_REG (CCmode
, condreg1
);
16450 if (reg_referenced_p (cc_reg_1
, PATTERN (curr
))
16452 && modified_in_p (cc_reg_1
, prev
))
16454 /* FIXME: this misses some which is considered simple arthematic
16455 instructions for ThunderX. Simple shifts are missed here. */
16456 if (prev_type
== TYPE_ALUS_SREG
16457 || prev_type
== TYPE_ALUS_IMM
16458 || prev_type
== TYPE_LOGICS_REG
16459 || prev_type
== TYPE_LOGICS_IMM
)
16466 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH
)
16467 && any_condjump_p (curr
))
16469 /* We're trying to match:
16470 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16471 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
16473 (label_ref ("SYM"))
16475 if (SET_DEST (curr_set
) == (pc_rtx
)
16476 && GET_CODE (SET_SRC (curr_set
)) == IF_THEN_ELSE
16477 && REG_P (XEXP (XEXP (SET_SRC (curr_set
), 0), 0))
16478 && REG_P (SET_DEST (prev_set
))
16479 && REGNO (SET_DEST (prev_set
))
16480 == REGNO (XEXP (XEXP (SET_SRC (curr_set
), 0), 0)))
16482 /* Fuse ALU operations followed by conditional branch instruction. */
16483 switch (get_attr_type (prev
))
16486 case TYPE_ALU_SREG
:
16489 case TYPE_ADCS_REG
:
16490 case TYPE_ADCS_IMM
:
16491 case TYPE_LOGIC_REG
:
16492 case TYPE_LOGIC_IMM
:
16496 case TYPE_SHIFT_REG
:
16497 case TYPE_SHIFT_IMM
:
16512 /* Return true iff the instruction fusion described by OP is enabled. */
16515 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op
)
16517 return (aarch64_tune_params
.fusible_ops
& op
) != 0;
16520 /* If MEM is in the form of [base+offset], extract the two parts
16521 of address and set to BASE and OFFSET, otherwise return false
16522 after clearing BASE and OFFSET. */
16525 extract_base_offset_in_addr (rtx mem
, rtx
*base
, rtx
*offset
)
16529 gcc_assert (MEM_P (mem
));
16531 addr
= XEXP (mem
, 0);
16536 *offset
= const0_rtx
;
16540 if (GET_CODE (addr
) == PLUS
16541 && REG_P (XEXP (addr
, 0)) && CONST_INT_P (XEXP (addr
, 1)))
16543 *base
= XEXP (addr
, 0);
16544 *offset
= XEXP (addr
, 1);
16549 *offset
= NULL_RTX
;
/* Types for scheduling fusion.  */
enum sched_fusion_type
{
  SCHED_FUSION_NONE = 0,
  SCHED_FUSION_LD_SIGN_EXTEND,
  SCHED_FUSION_LD_ZERO_EXTEND,
  /* NOTE(review): the remaining enumerators were lost in extraction;
     reconstructed from their uses in fusion_load_store — confirm
     against upstream.  */
  SCHED_FUSION_LD,
  SCHED_FUSION_ST,
  SCHED_FUSION_NUM
};
16565 /* If INSN is a load or store of address in the form of [base+offset],
16566 extract the two parts and set to BASE and OFFSET. Return scheduling
16567 fusion type this INSN is. */
16569 static enum sched_fusion_type
16570 fusion_load_store (rtx_insn
*insn
, rtx
*base
, rtx
*offset
)
16573 enum sched_fusion_type fusion
= SCHED_FUSION_LD
;
16575 gcc_assert (INSN_P (insn
));
16576 x
= PATTERN (insn
);
16577 if (GET_CODE (x
) != SET
)
16578 return SCHED_FUSION_NONE
;
16581 dest
= SET_DEST (x
);
16583 machine_mode dest_mode
= GET_MODE (dest
);
16585 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode
))
16586 return SCHED_FUSION_NONE
;
16588 if (GET_CODE (src
) == SIGN_EXTEND
)
16590 fusion
= SCHED_FUSION_LD_SIGN_EXTEND
;
16591 src
= XEXP (src
, 0);
16592 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
16593 return SCHED_FUSION_NONE
;
16595 else if (GET_CODE (src
) == ZERO_EXTEND
)
16597 fusion
= SCHED_FUSION_LD_ZERO_EXTEND
;
16598 src
= XEXP (src
, 0);
16599 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
16600 return SCHED_FUSION_NONE
;
16603 if (GET_CODE (src
) == MEM
&& REG_P (dest
))
16604 extract_base_offset_in_addr (src
, base
, offset
);
16605 else if (GET_CODE (dest
) == MEM
&& (REG_P (src
) || src
== const0_rtx
))
16607 fusion
= SCHED_FUSION_ST
;
16608 extract_base_offset_in_addr (dest
, base
, offset
);
16611 return SCHED_FUSION_NONE
;
16613 if (*base
== NULL_RTX
|| *offset
== NULL_RTX
)
16614 fusion
= SCHED_FUSION_NONE
;
16619 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16621 Currently we only support to fuse ldr or str instructions, so FUSION_PRI
16622 and PRI are only calculated for these instructions. For other instruction,
16623 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
16624 type instruction fusion can be added by returning different priorities.
16626 It's important that irrelevant instructions get the largest FUSION_PRI. */
16629 aarch64_sched_fusion_priority (rtx_insn
*insn
, int max_pri
,
16630 int *fusion_pri
, int *pri
)
16634 enum sched_fusion_type fusion
;
16636 gcc_assert (INSN_P (insn
));
16639 fusion
= fusion_load_store (insn
, &base
, &offset
);
16640 if (fusion
== SCHED_FUSION_NONE
)
16647 /* Set FUSION_PRI according to fusion type and base register. */
16648 *fusion_pri
= tmp
- fusion
* FIRST_PSEUDO_REGISTER
- REGNO (base
);
16650 /* Calculate PRI. */
16653 /* INSN with smaller offset goes first. */
16654 off_val
= (int)(INTVAL (offset
));
16656 tmp
-= (off_val
& 0xfffff);
16658 tmp
+= ((- off_val
) & 0xfffff);
16664 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16665 Adjust priority of sha1h instructions so they are scheduled before
16666 other SHA1 instructions. */
16669 aarch64_sched_adjust_priority (rtx_insn
*insn
, int priority
)
16671 rtx x
= PATTERN (insn
);
16673 if (GET_CODE (x
) == SET
)
16677 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_SHA1H
)
16678 return priority
+ 10;
16684 /* Given OPERANDS of consecutive load/store, check if we can merge
16685 them into ldp/stp. LOAD is true if they are load instructions.
16686 MODE is the mode of memory operands. */
16689 aarch64_operands_ok_for_ldpstp (rtx
*operands
, bool load
,
16692 HOST_WIDE_INT offval_1
, offval_2
, msize
;
16693 enum reg_class rclass_1
, rclass_2
;
16694 rtx mem_1
, mem_2
, reg_1
, reg_2
, base_1
, base_2
, offset_1
, offset_2
;
16698 mem_1
= operands
[1];
16699 mem_2
= operands
[3];
16700 reg_1
= operands
[0];
16701 reg_2
= operands
[2];
16702 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
));
16703 if (REGNO (reg_1
) == REGNO (reg_2
))
16708 mem_1
= operands
[0];
16709 mem_2
= operands
[2];
16710 reg_1
= operands
[1];
16711 reg_2
= operands
[3];
16714 /* The mems cannot be volatile. */
16715 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
))
16718 /* If we have SImode and slow unaligned ldp,
16719 check the alignment to be at least 8 byte. */
16721 && (aarch64_tune_params
.extra_tuning_flags
16722 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
16724 && MEM_ALIGN (mem_1
) < 8 * BITS_PER_UNIT
)
16727 /* Check if the addresses are in the form of [base+offset]. */
16728 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
16729 if (base_1
== NULL_RTX
|| offset_1
== NULL_RTX
)
16731 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
16732 if (base_2
== NULL_RTX
|| offset_2
== NULL_RTX
)
16735 /* Check if the bases are same. */
16736 if (!rtx_equal_p (base_1
, base_2
))
16739 /* The operands must be of the same size. */
16740 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1
)),
16741 GET_MODE_SIZE (GET_MODE (mem_2
))));
16743 offval_1
= INTVAL (offset_1
);
16744 offval_2
= INTVAL (offset_2
);
16745 /* We should only be trying this for fixed-sized modes. There is no
16746 SVE LDP/STP instruction. */
16747 msize
= GET_MODE_SIZE (mode
).to_constant ();
16748 /* Check if the offsets are consecutive. */
16749 if (offval_1
!= (offval_2
+ msize
) && offval_2
!= (offval_1
+ msize
))
16752 /* Check if the addresses are clobbered by load. */
16755 if (reg_mentioned_p (reg_1
, mem_1
))
16758 /* In increasing order, the last load can clobber the address. */
16759 if (offval_1
> offval_2
&& reg_mentioned_p (reg_2
, mem_2
))
16763 /* One of the memory accesses must be a mempair operand.
16764 If it is not the first one, they need to be swapped by the
16766 if (!aarch64_mem_pair_operand (mem_1
, GET_MODE (mem_1
))
16767 && !aarch64_mem_pair_operand (mem_2
, GET_MODE (mem_2
)))
16770 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
16771 rclass_1
= FP_REGS
;
16773 rclass_1
= GENERAL_REGS
;
16775 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
16776 rclass_2
= FP_REGS
;
16778 rclass_2
= GENERAL_REGS
;
16780 /* Check if the registers are of same class. */
16781 if (rclass_1
!= rclass_2
)
16787 /* Given OPERANDS of consecutive load/store that can be merged,
16788 swap them if they are not in ascending order. */
16790 aarch64_swap_ldrstr_operands (rtx
* operands
, bool load
)
16792 rtx mem_1
, mem_2
, base_1
, base_2
, offset_1
, offset_2
;
16793 HOST_WIDE_INT offval_1
, offval_2
;
16797 mem_1
= operands
[1];
16798 mem_2
= operands
[3];
16802 mem_1
= operands
[0];
16803 mem_2
= operands
[2];
16806 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
16807 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
16809 offval_1
= INTVAL (offset_1
);
16810 offval_2
= INTVAL (offset_2
);
16812 if (offval_1
> offval_2
)
16814 /* Irrespective of whether this is a load or a store,
16815 we do the same swap. */
16816 std::swap (operands
[0], operands
[2]);
16817 std::swap (operands
[1], operands
[3]);
16821 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
16822 comparison between the two. */
16824 aarch64_host_wide_int_compare (const void *x
, const void *y
)
16826 return wi::cmps (* ((const HOST_WIDE_INT
*) x
),
16827 * ((const HOST_WIDE_INT
*) y
));
16830 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
16831 other pointing to a REG rtx containing an offset, compare the offsets
16836 1 iff offset (X) > offset (Y)
16837 0 iff offset (X) == offset (Y)
16838 -1 iff offset (X) < offset (Y) */
16840 aarch64_ldrstr_offset_compare (const void *x
, const void *y
)
16842 const rtx
* operands_1
= (const rtx
*) x
;
16843 const rtx
* operands_2
= (const rtx
*) y
;
16844 rtx mem_1
, mem_2
, base
, offset_1
, offset_2
;
16846 if (MEM_P (operands_1
[0]))
16847 mem_1
= operands_1
[0];
16849 mem_1
= operands_1
[1];
16851 if (MEM_P (operands_2
[0]))
16852 mem_2
= operands_2
[0];
16854 mem_2
= operands_2
[1];
16856 /* Extract the offsets. */
16857 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
16858 extract_base_offset_in_addr (mem_2
, &base
, &offset_2
);
16860 gcc_assert (offset_1
!= NULL_RTX
&& offset_2
!= NULL_RTX
);
16862 return wi::cmps (INTVAL (offset_1
), INTVAL (offset_2
));
16865 /* Given OPERANDS of consecutive load/store, check if we can merge
16866 them into ldp/stp by adjusting the offset. LOAD is true if they
16867 are load instructions. MODE is the mode of memory operands.
16869 Given below consecutive stores:
16871 str w1, [xb, 0x100]
16872 str w1, [xb, 0x104]
16873 str w1, [xb, 0x108]
16874 str w1, [xb, 0x10c]
16876 Though the offsets are out of the range supported by stp, we can
16877 still pair them after adjusting the offset, like:
16879 add scratch, xb, 0x100
16880 stp w1, w1, [scratch]
16881 stp w1, w1, [scratch, 0x8]
16883 The peephole patterns detecting this opportunity should guarantee
16884 the scratch register is available. */
16887 aarch64_operands_adjust_ok_for_ldpstp (rtx
*operands
, bool load
,
16890 const int num_insns
= 4;
16891 enum reg_class rclass
;
16892 HOST_WIDE_INT offvals
[num_insns
], msize
;
16893 rtx mem
[num_insns
], reg
[num_insns
], base
[num_insns
], offset
[num_insns
];
16897 for (int i
= 0; i
< num_insns
; i
++)
16899 reg
[i
] = operands
[2 * i
];
16900 mem
[i
] = operands
[2 * i
+ 1];
16902 gcc_assert (REG_P (reg
[i
]));
16905 /* Do not attempt to merge the loads if the loads clobber each other. */
16906 for (int i
= 0; i
< 8; i
+= 2)
16907 for (int j
= i
+ 2; j
< 8; j
+= 2)
16908 if (reg_overlap_mentioned_p (operands
[i
], operands
[j
]))
16912 for (int i
= 0; i
< num_insns
; i
++)
16914 mem
[i
] = operands
[2 * i
];
16915 reg
[i
] = operands
[2 * i
+ 1];
16918 /* Skip if memory operand is by itself valid for ldp/stp. */
16919 if (!MEM_P (mem
[0]) || aarch64_mem_pair_operand (mem
[0], mode
))
16922 for (int i
= 0; i
< num_insns
; i
++)
16924 /* The mems cannot be volatile. */
16925 if (MEM_VOLATILE_P (mem
[i
]))
16928 /* Check if the addresses are in the form of [base+offset]. */
16929 extract_base_offset_in_addr (mem
[i
], base
+ i
, offset
+ i
);
16930 if (base
[i
] == NULL_RTX
|| offset
[i
] == NULL_RTX
)
16934 /* Check if the registers are of same class. */
16935 rclass
= REG_P (reg
[0]) && FP_REGNUM_P (REGNO (reg
[0]))
16936 ? FP_REGS
: GENERAL_REGS
;
16938 for (int i
= 1; i
< num_insns
; i
++)
16939 if (REG_P (reg
[i
]) && FP_REGNUM_P (REGNO (reg
[i
])))
16941 if (rclass
!= FP_REGS
)
16946 if (rclass
!= GENERAL_REGS
)
16950 /* Only the last register in the order in which they occur
16951 may be clobbered by the load. */
16952 if (rclass
== GENERAL_REGS
&& load
)
16953 for (int i
= 0; i
< num_insns
- 1; i
++)
16954 if (reg_mentioned_p (reg
[i
], mem
[i
]))
16957 /* Check if the bases are same. */
16958 for (int i
= 0; i
< num_insns
- 1; i
++)
16959 if (!rtx_equal_p (base
[i
], base
[i
+ 1]))
16962 for (int i
= 0; i
< num_insns
; i
++)
16963 offvals
[i
] = INTVAL (offset
[i
]);
16965 msize
= GET_MODE_SIZE (mode
);
16967 /* Check if the offsets can be put in the right order to do a ldp/stp. */
16968 qsort (offvals
, num_insns
, sizeof (HOST_WIDE_INT
),
16969 aarch64_host_wide_int_compare
);
16971 if (!(offvals
[1] == offvals
[0] + msize
16972 && offvals
[3] == offvals
[2] + msize
))
16975 /* Check that offsets are within range of each other. The ldp/stp
16976 instructions have 7 bit immediate offsets, so use 0x80. */
16977 if (offvals
[2] - offvals
[0] >= msize
* 0x80)
16980 /* The offsets must be aligned with respect to each other. */
16981 if (offvals
[0] % msize
!= offvals
[2] % msize
)
16984 /* If we have SImode and slow unaligned ldp,
16985 check the alignment to be at least 8 byte. */
16987 && (aarch64_tune_params
.extra_tuning_flags
16988 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
)
16990 && MEM_ALIGN (mem
[0]) < 8 * BITS_PER_UNIT
)
16996 /* Given OPERANDS of consecutive load/store, this function pairs them
16997 into LDP/STP after adjusting the offset. It depends on the fact
16998 that the operands can be sorted so the offsets are correct for STP.
16999 MODE is the mode of memory operands. CODE is the rtl operator
17000 which should be applied to all memory operands, it's SIGN_EXTEND,
17001 ZERO_EXTEND or UNKNOWN. */
17004 aarch64_gen_adjusted_ldpstp (rtx
*operands
, bool load
,
17005 scalar_mode mode
, RTX_CODE code
)
17007 rtx base
, offset_1
, offset_3
, t1
, t2
;
17008 rtx mem_1
, mem_2
, mem_3
, mem_4
;
17009 rtx temp_operands
[8];
17010 HOST_WIDE_INT off_val_1
, off_val_3
, base_off
, new_off_1
, new_off_3
,
17011 stp_off_upper_limit
, stp_off_lower_limit
, msize
;
17013 /* We make changes on a copy as we may still bail out. */
17014 for (int i
= 0; i
< 8; i
++)
17015 temp_operands
[i
] = operands
[i
];
17017 /* Sort the operands. */
17018 qsort (temp_operands
, 4, 2 * sizeof (rtx
*), aarch64_ldrstr_offset_compare
);
17022 mem_1
= temp_operands
[1];
17023 mem_2
= temp_operands
[3];
17024 mem_3
= temp_operands
[5];
17025 mem_4
= temp_operands
[7];
17029 mem_1
= temp_operands
[0];
17030 mem_2
= temp_operands
[2];
17031 mem_3
= temp_operands
[4];
17032 mem_4
= temp_operands
[6];
17033 gcc_assert (code
== UNKNOWN
);
17036 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
17037 extract_base_offset_in_addr (mem_3
, &base
, &offset_3
);
17038 gcc_assert (base
!= NULL_RTX
&& offset_1
!= NULL_RTX
17039 && offset_3
!= NULL_RTX
);
17041 /* Adjust offset so it can fit in LDP/STP instruction. */
17042 msize
= GET_MODE_SIZE (mode
);
17043 stp_off_upper_limit
= msize
* (0x40 - 1);
17044 stp_off_lower_limit
= - msize
* 0x40;
17046 off_val_1
= INTVAL (offset_1
);
17047 off_val_3
= INTVAL (offset_3
);
17049 /* The base offset is optimally half way between the two STP/LDP offsets. */
17051 base_off
= (off_val_1
+ off_val_3
) / 2;
17053 /* However, due to issues with negative LDP/STP offset generation for
17054 larger modes, for DF, DI and vector modes. we must not use negative
17055 addresses smaller than 9 signed unadjusted bits can store. This
17056 provides the most range in this case. */
17057 base_off
= off_val_1
;
17059 /* Adjust the base so that it is aligned with the addresses but still
17061 if (base_off
% msize
!= off_val_1
% msize
)
17062 /* Fix the offset, bearing in mind we want to make it bigger not
17064 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
17065 else if (msize
<= 4)
17066 /* The negative range of LDP/STP is one larger than the positive range. */
17069 /* Check if base offset is too big or too small. We can attempt to resolve
17070 this issue by setting it to the maximum value and seeing if the offsets
17072 if (base_off
>= 0x1000)
17074 base_off
= 0x1000 - 1;
17075 /* We must still make sure that the base offset is aligned with respect
17076 to the address. But it may not be made any bigger. */
17077 base_off
-= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
17080 /* Likewise for the case where the base is too small. */
17081 if (base_off
<= -0x1000)
17083 base_off
= -0x1000 + 1;
17084 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
17087 /* Offset of the first STP/LDP. */
17088 new_off_1
= off_val_1
- base_off
;
17090 /* Offset of the second STP/LDP. */
17091 new_off_3
= off_val_3
- base_off
;
17093 /* The offsets must be within the range of the LDP/STP instructions. */
17094 if (new_off_1
> stp_off_upper_limit
|| new_off_1
< stp_off_lower_limit
17095 || new_off_3
> stp_off_upper_limit
|| new_off_3
< stp_off_lower_limit
)
17098 replace_equiv_address_nv (mem_1
, plus_constant (Pmode
, operands
[8],
17100 replace_equiv_address_nv (mem_2
, plus_constant (Pmode
, operands
[8],
17101 new_off_1
+ msize
), true);
17102 replace_equiv_address_nv (mem_3
, plus_constant (Pmode
, operands
[8],
17104 replace_equiv_address_nv (mem_4
, plus_constant (Pmode
, operands
[8],
17105 new_off_3
+ msize
), true);
17107 if (!aarch64_mem_pair_operand (mem_1
, mode
)
17108 || !aarch64_mem_pair_operand (mem_3
, mode
))
17111 if (code
== ZERO_EXTEND
)
17113 mem_1
= gen_rtx_ZERO_EXTEND (DImode
, mem_1
);
17114 mem_2
= gen_rtx_ZERO_EXTEND (DImode
, mem_2
);
17115 mem_3
= gen_rtx_ZERO_EXTEND (DImode
, mem_3
);
17116 mem_4
= gen_rtx_ZERO_EXTEND (DImode
, mem_4
);
17118 else if (code
== SIGN_EXTEND
)
17120 mem_1
= gen_rtx_SIGN_EXTEND (DImode
, mem_1
);
17121 mem_2
= gen_rtx_SIGN_EXTEND (DImode
, mem_2
);
17122 mem_3
= gen_rtx_SIGN_EXTEND (DImode
, mem_3
);
17123 mem_4
= gen_rtx_SIGN_EXTEND (DImode
, mem_4
);
17128 operands
[0] = temp_operands
[0];
17129 operands
[1] = mem_1
;
17130 operands
[2] = temp_operands
[2];
17131 operands
[3] = mem_2
;
17132 operands
[4] = temp_operands
[4];
17133 operands
[5] = mem_3
;
17134 operands
[6] = temp_operands
[6];
17135 operands
[7] = mem_4
;
17139 operands
[0] = mem_1
;
17140 operands
[1] = temp_operands
[1];
17141 operands
[2] = mem_2
;
17142 operands
[3] = temp_operands
[3];
17143 operands
[4] = mem_3
;
17144 operands
[5] = temp_operands
[5];
17145 operands
[6] = mem_4
;
17146 operands
[7] = temp_operands
[7];
17149 /* Emit adjusting instruction. */
17150 emit_insn (gen_rtx_SET (operands
[8], plus_constant (DImode
, base
, base_off
)));
17151 /* Emit ldp/stp instructions. */
17152 t1
= gen_rtx_SET (operands
[0], operands
[1]);
17153 t2
= gen_rtx_SET (operands
[2], operands
[3]);
17154 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
17155 t1
= gen_rtx_SET (operands
[4], operands
[5]);
17156 t2
= gen_rtx_SET (operands
[6], operands
[7]);
17157 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
/* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.  Assume for now that
   it isn't worth branching around empty masked ops (including masked
   stores).  */

static bool
aarch64_empty_mask_is_expensive (unsigned)
{
  return false;
}
17171 /* Return 1 if pseudo register should be created and used to hold
17172 GOT address for PIC code. */
17175 aarch64_use_pseudo_pic_reg (void)
17177 return aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
;
17180 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
17183 aarch64_unspec_may_trap_p (const_rtx x
, unsigned flags
)
17185 switch (XINT (x
, 1))
17187 case UNSPEC_GOTSMALLPIC
:
17188 case UNSPEC_GOTSMALLPIC28K
:
17189 case UNSPEC_GOTTINYPIC
:
17195 return default_unspec_may_trap_p (x
, flags
);
17199 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
17200 return the log2 of that value. Otherwise return -1. */
17203 aarch64_fpconst_pow_of_2 (rtx x
)
17205 const REAL_VALUE_TYPE
*r
;
17207 if (!CONST_DOUBLE_P (x
))
17210 r
= CONST_DOUBLE_REAL_VALUE (x
);
17212 if (REAL_VALUE_NEGATIVE (*r
)
17213 || REAL_VALUE_ISNAN (*r
)
17214 || REAL_VALUE_ISINF (*r
)
17215 || !real_isinteger (r
, DFmode
))
17218 return exact_log2 (real_to_integer (r
));
17221 /* If X is a vector of equal CONST_DOUBLE values and that value is
17222 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
17225 aarch64_vec_fpconst_pow_of_2 (rtx x
)
17228 if (GET_CODE (x
) != CONST_VECTOR
17229 || !CONST_VECTOR_NUNITS (x
).is_constant (&nelts
))
17232 if (GET_MODE_CLASS (GET_MODE (x
)) != MODE_VECTOR_FLOAT
)
17235 int firstval
= aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, 0));
17239 for (int i
= 1; i
< nelts
; i
++)
17240 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, i
)) != firstval
)
17246 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
17249 __fp16 always promotes through this hook.
17250 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
17251 through the generic excess precision logic rather than here. */
17254 aarch64_promoted_type (const_tree t
)
17256 if (SCALAR_FLOAT_TYPE_P (t
)
17257 && TYPE_MAIN_VARIANT (t
) == aarch64_fp16_type_node
)
17258 return float_type_node
;
17263 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
17266 aarch64_optab_supported_p (int op
, machine_mode mode1
, machine_mode
,
17267 optimization_type opt_type
)
17272 return opt_type
== OPTIMIZE_FOR_SPEED
&& use_rsqrt_p (mode1
);
17279 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
17281 static unsigned int
17282 aarch64_dwarf_poly_indeterminate_value (unsigned int i
, unsigned int *factor
,
17285 /* Polynomial invariant 1 == (VG / 2) - 1. */
17286 gcc_assert (i
== 1);
17289 return AARCH64_DWARF_VG
;
17292 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
17293 if MODE is HFmode, and punt to the generic implementation otherwise. */
17296 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode
)
17298 return (mode
== HFmode
17300 : default_libgcc_floating_mode_supported_p (mode
));
17303 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
17304 if MODE is HFmode, and punt to the generic implementation otherwise. */
17307 aarch64_scalar_mode_supported_p (scalar_mode mode
)
17309 return (mode
== HFmode
17311 : default_scalar_mode_supported_p (mode
));
17314 /* Set the value of FLT_EVAL_METHOD.
17315 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
17317 0: evaluate all operations and constants, whose semantic type has at
17318 most the range and precision of type float, to the range and
17319 precision of float; evaluate all other operations and constants to
17320 the range and precision of the semantic type;
17322 N, where _FloatN is a supported interchange floating type
17323 evaluate all operations and constants, whose semantic type has at
17324 most the range and precision of _FloatN type, to the range and
17325 precision of the _FloatN type; evaluate all other operations and
17326 constants to the range and precision of the semantic type;
17328 If we have the ARMv8.2-A extensions then we support _Float16 in native
17329 precision, so we should set this to 16. Otherwise, we support the type,
17330 but want to evaluate expressions in float precision, so set this to
17333 static enum flt_eval_method
17334 aarch64_excess_precision (enum excess_precision_type type
)
17338 case EXCESS_PRECISION_TYPE_FAST
:
17339 case EXCESS_PRECISION_TYPE_STANDARD
:
17340 /* We can calculate either in 16-bit range and precision or
17341 32-bit range and precision. Make that decision based on whether
17342 we have native support for the ARMv8.2-A 16-bit floating-point
17343 instructions or not. */
17344 return (TARGET_FP_F16INST
17345 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
17346 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
);
17347 case EXCESS_PRECISION_TYPE_IMPLICIT
:
17348 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
;
17350 gcc_unreachable ();
17352 return FLT_EVAL_METHOD_UNPREDICTABLE
;
17355 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
17356 scheduled for speculative execution. Reject the long-running division
17357 and square-root instructions. */
17360 aarch64_sched_can_speculate_insn (rtx_insn
*insn
)
17362 switch (get_attr_type (insn
))
17370 case TYPE_NEON_FP_SQRT_S
:
17371 case TYPE_NEON_FP_SQRT_D
:
17372 case TYPE_NEON_FP_SQRT_S_Q
:
17373 case TYPE_NEON_FP_SQRT_D_Q
:
17374 case TYPE_NEON_FP_DIV_S
:
17375 case TYPE_NEON_FP_DIV_D
:
17376 case TYPE_NEON_FP_DIV_S_Q
:
17377 case TYPE_NEON_FP_DIV_D_Q
:
17384 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
17387 aarch64_compute_pressure_classes (reg_class
*classes
)
17390 classes
[i
++] = GENERAL_REGS
;
17391 classes
[i
++] = FP_REGS
;
17392 /* PR_REGS isn't a useful pressure class because many predicate pseudo
17393 registers need to go in PR_LO_REGS at some point during their
17394 lifetime. Splitting it into two halves has the effect of making
17395 all predicates count against PR_LO_REGS, so that we try whenever
17396 possible to restrict the number of live predicates to 8. This
17397 greatly reduces the amount of spilling in certain loops. */
17398 classes
[i
++] = PR_LO_REGS
;
17399 classes
[i
++] = PR_HI_REGS
;
17403 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
17406 aarch64_can_change_mode_class (machine_mode from
,
17407 machine_mode to
, reg_class_t
)
17409 if (BYTES_BIG_ENDIAN
)
17411 bool from_sve_p
= aarch64_sve_data_mode_p (from
);
17412 bool to_sve_p
= aarch64_sve_data_mode_p (to
);
17414 /* Don't allow changes between SVE data modes and non-SVE modes.
17415 See the comment at the head of aarch64-sve.md for details. */
17416 if (from_sve_p
!= to_sve_p
)
17419 /* Don't allow changes in element size: lane 0 of the new vector
17420 would not then be lane 0 of the old vector. See the comment
17421 above aarch64_maybe_expand_sve_subreg_move for a more detailed
17424 In the worst case, this forces a register to be spilled in
17425 one mode and reloaded in the other, which handles the
17426 endianness correctly. */
17427 if (from_sve_p
&& GET_MODE_UNIT_SIZE (from
) != GET_MODE_UNIT_SIZE (to
))
17433 /* Implement TARGET_EARLY_REMAT_MODES. */
17436 aarch64_select_early_remat_modes (sbitmap modes
)
17438 /* SVE values are not normally live across a call, so it should be
17439 worth doing early rematerialization even in VL-specific mode. */
17440 for (int i
= 0; i
< NUM_MACHINE_MODES
; ++i
)
17442 machine_mode mode
= (machine_mode
) i
;
17443 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
17444 if (vec_flags
& VEC_ANY_SVE
)
17445 bitmap_set_bit (modes
, i
);
17449 /* Override the default target speculation_safe_value. */
17451 aarch64_speculation_safe_value (machine_mode mode
,
17452 rtx result
, rtx val
, rtx failval
)
17454 /* Maybe we should warn if falling back to hard barriers. They are
17455 likely to be noticably more expensive than the alternative below. */
17456 if (!aarch64_track_speculation
)
17457 return default_speculation_safe_value (mode
, result
, val
, failval
);
17460 val
= copy_to_mode_reg (mode
, val
);
17462 if (!aarch64_reg_or_zero (failval
, mode
))
17463 failval
= copy_to_mode_reg (mode
, failval
);
17468 emit_insn (gen_despeculate_copyqi (result
, val
, failval
));
17471 emit_insn (gen_despeculate_copyhi (result
, val
, failval
));
17474 emit_insn (gen_despeculate_copysi (result
, val
, failval
));
17477 emit_insn (gen_despeculate_copydi (result
, val
, failval
));
17480 emit_insn (gen_despeculate_copyti (result
, val
, failval
));
17483 gcc_unreachable ();
/* Target-specific selftests.  */

#if CHECKING_P

namespace selftest {

/* Selftest for the RTL loader.
   Verify that the RTL loader copes with a dump from
   print_rtx_function.  This is essentially just a test that class
   function_reader can handle a real dump, but it also verifies
   that lookup_reg_by_dump_name correctly handles hard regs.
   The presence of hard reg names in the dump means that the test is
   target-specific, hence it is in this file.  */

static void
aarch64_test_loading_full_dump ()
{
  rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));

  ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));

  rtx_insn *insn_1 = get_insn_by_uid (1);
  ASSERT_EQ (NOTE, GET_CODE (insn_1));

  rtx_insn *insn_15 = get_insn_by_uid (15);
  ASSERT_EQ (INSN, GET_CODE (insn_15));
  ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));

  /* Verify crtl->return_rtx.  */
  ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
  ASSERT_EQ (0, REGNO (crtl->return_rtx));
  ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
}

/* Run all target-specific selftests.  */

static void
aarch64_run_selftests (void)
{
  aarch64_test_loading_full_dump ();
}

} // namespace selftest

#endif /* #if CHECKING_P */
17534 #undef TARGET_ADDRESS_COST
17535 #define TARGET_ADDRESS_COST aarch64_address_cost
17537 /* This hook determines whether unnamed bitfields affect the alignment
17538 of the containing structure. The hook returns true if the structure
17539 should inherit the alignment requirements of an unnamed bitfield's
17541 #undef TARGET_ALIGN_ANON_BITFIELD
17542 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
17544 #undef TARGET_ASM_ALIGNED_DI_OP
17545 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
17547 #undef TARGET_ASM_ALIGNED_HI_OP
17548 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
17550 #undef TARGET_ASM_ALIGNED_SI_OP
17551 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
17553 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
17554 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
17555 hook_bool_const_tree_hwi_hwi_const_tree_true
17557 #undef TARGET_ASM_FILE_START
17558 #define TARGET_ASM_FILE_START aarch64_start_file
17560 #undef TARGET_ASM_OUTPUT_MI_THUNK
17561 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
17563 #undef TARGET_ASM_SELECT_RTX_SECTION
17564 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
17566 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
17567 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
17569 #undef TARGET_BUILD_BUILTIN_VA_LIST
17570 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
17572 #undef TARGET_CALLEE_COPIES
17573 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
17575 #undef TARGET_CAN_ELIMINATE
17576 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
17578 #undef TARGET_CAN_INLINE_P
17579 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
17581 #undef TARGET_CANNOT_FORCE_CONST_MEM
17582 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
17584 #undef TARGET_CASE_VALUES_THRESHOLD
17585 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
17587 #undef TARGET_CONDITIONAL_REGISTER_USAGE
17588 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
17590 /* Only the least significant bit is used for initialization guard
17592 #undef TARGET_CXX_GUARD_MASK_BIT
17593 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
17595 #undef TARGET_C_MODE_FOR_SUFFIX
17596 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
17598 #ifdef TARGET_BIG_ENDIAN_DEFAULT
17599 #undef TARGET_DEFAULT_TARGET_FLAGS
17600 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
17603 #undef TARGET_CLASS_MAX_NREGS
17604 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
17606 #undef TARGET_BUILTIN_DECL
17607 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
17609 #undef TARGET_BUILTIN_RECIPROCAL
17610 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
17612 #undef TARGET_C_EXCESS_PRECISION
17613 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
17615 #undef TARGET_EXPAND_BUILTIN
17616 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
17618 #undef TARGET_EXPAND_BUILTIN_VA_START
17619 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
17621 #undef TARGET_FOLD_BUILTIN
17622 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
17624 #undef TARGET_FUNCTION_ARG
17625 #define TARGET_FUNCTION_ARG aarch64_function_arg
17627 #undef TARGET_FUNCTION_ARG_ADVANCE
17628 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
17630 #undef TARGET_FUNCTION_ARG_BOUNDARY
17631 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
17633 #undef TARGET_FUNCTION_ARG_PADDING
17634 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
17636 #undef TARGET_GET_RAW_RESULT_MODE
17637 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
17638 #undef TARGET_GET_RAW_ARG_MODE
17639 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
17641 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
17642 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
17644 #undef TARGET_FUNCTION_VALUE
17645 #define TARGET_FUNCTION_VALUE aarch64_function_value
17647 #undef TARGET_FUNCTION_VALUE_REGNO_P
17648 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
17650 #undef TARGET_GIMPLE_FOLD_BUILTIN
17651 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
17653 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
17654 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
17656 #undef TARGET_INIT_BUILTINS
17657 #define TARGET_INIT_BUILTINS aarch64_init_builtins
17659 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
17660 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
17661 aarch64_ira_change_pseudo_allocno_class
17663 #undef TARGET_LEGITIMATE_ADDRESS_P
17664 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
17666 #undef TARGET_LEGITIMATE_CONSTANT_P
17667 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
17669 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
17670 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
17671 aarch64_legitimize_address_displacement
17673 #undef TARGET_LIBGCC_CMP_RETURN_MODE
17674 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
17676 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
17677 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
17678 aarch64_libgcc_floating_mode_supported_p
17680 #undef TARGET_MANGLE_TYPE
17681 #define TARGET_MANGLE_TYPE aarch64_mangle_type
17683 #undef TARGET_MEMORY_MOVE_COST
17684 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
17686 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
17687 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
17689 #undef TARGET_MUST_PASS_IN_STACK
17690 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
17692 /* This target hook should return true if accesses to volatile bitfields
17693 should use the narrowest mode possible. It should return false if these
17694 accesses should use the bitfield container type. */
17695 #undef TARGET_NARROW_VOLATILE_BITFIELD
17696 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
17698 #undef TARGET_OPTION_OVERRIDE
17699 #define TARGET_OPTION_OVERRIDE aarch64_override_options
17701 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
17702 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
17703 aarch64_override_options_after_change
/* Target option save/restore/print hooks, used by the "target"
   attribute/pragma machinery.  */
#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE aarch64_option_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE aarch64_option_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT aarch64_option_print

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function

/* Argument passing and register-allocation hooks.  */
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE aarch64_promoted_type

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p

/* Instruction-scheduler hooks.  */
#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

/* Separate shrink-wrapping of prologue/epilogue components.  */
#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
  aarch64_get_separate_components

#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
  aarch64_components_for_bb

#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
  aarch64_disqualify_components

#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
  aarch64_emit_prologue_components

#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
  aarch64_emit_epilogue_components

#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
  aarch64_set_handled_components

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

/* Vectorizer hooks.  */
#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
  aarch64_builtin_support_vector_misalignment

#undef TARGET_ARRAY_MODE
#define TARGET_ARRAY_MODE aarch64_array_mode

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

/* Deliberately redefined to expand to nothing: the vectorized-builtin
   support below goes through the _VECTORIZED_FUNCTION hook instead.  */
#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  aarch64_autovectorize_vector_sizes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv
/* Section anchor support.  */

/* NOTE(review): -256 is presumably the most negative immediate of the
   9-bit signed unscaled addressing form — confirm against the AArch64
   addressing-mode limits.  */
#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095
/* Vector type/data alignment hooks.  */
#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
  aarch64_vectorize_preferred_vector_alignment
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST \
  aarch64_vectorize_vec_perm_const

/* Masked (predicated) operation support for the vectorizer.  */
#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
  aarch64_empty_mask_is_expensive
#undef TARGET_PREFERRED_ELSE_VALUE
#define TARGET_PREFERRED_ELSE_VALUE \
  aarch64_preferred_else_value
#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

/* Condition-code register handling: the flags live in CC_REGNUM.  */
#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

/* Scheduler macro-fusion hooks.  Note the PAIR_P hook is the
   aarch_-prefixed helper shared with the 32-bit ARM back end.  */
#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

/* Assembly-output operand printing.  */
#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
  aarch64_dwarf_poly_indeterminate_value

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
/* Hard-register / machine-mode query hooks.  */
#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class

#undef TARGET_SELECT_EARLY_REMAT_MODES
#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes

/* Speculation-barrier support (e.g. for Spectre-v1 mitigation
   builtins).  */
#undef TARGET_SPECULATION_SAFE_VALUE
#define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
/* Register the self-test hook only in checking-enabled builds; the
   selftest framework is compiled out otherwise.  The guarding
   "#if CHECKING_P" was missing here even though the matching
   "#endif" (whose comment names it) was present, leaving the
   conditional unbalanced.  */
#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */
/* Instantiate the single global target-hook vector for this back end.
   TARGET_INITIALIZER (from target-def.h, included above) expands to an
   initializer built from all the TARGET_* macros defined in this file.  */
struct gcc_target targetm = TARGET_INITIALIZER;
17970 #include "gt-aarch64.h"